├── .gitignore
├── LICENSE
├── README.md
├── coco_rem
│ ├── coco_evaluator.py
│ ├── configs
│ │ ├── README.md
│ │ ├── common
│ │ │ ├── coco_schedule.py
│ │ │ ├── data
│ │ │ │ ├── coco.py
│ │ │ │ └── constants.py
│ │ │ ├── models
│ │ │ │ ├── cascade_rcnn.py
│ │ │ │ ├── mask2former.py
│ │ │ │ ├── mask_rcnn_fpn.py
│ │ │ │ └── mask_rcnn_vitdet.py
│ │ │ ├── optim.py
│ │ │ └── train.py
│ │ ├── convnext
│ │ │ ├── cascade_mask_rcnn_convnext_base_1k_3x.py
│ │ │ ├── cascade_mask_rcnn_convnext_base_22k_3x.py
│ │ │ ├── cascade_mask_rcnn_convnext_large_22k_3x.py
│ │ │ ├── cascade_mask_rcnn_convnext_small_1k_3x.py
│ │ │ ├── cascade_mask_rcnn_convnext_tiny_1k_3x.py
│ │ │ ├── cascade_mask_rcnn_convnext_xlarge_22k_3x.py
│ │ │ └── mask_rcnn_convnext_tiny_1k_3x.py
│ │ ├── d2lsj
│ │ │ ├── mask_rcnn_R_101_FPN_100ep.py
│ │ │ ├── mask_rcnn_R_101_FPN_200ep.py
│ │ │ ├── mask_rcnn_R_101_FPN_400ep.py
│ │ │ ├── mask_rcnn_R_50_FPN_100ep.py
│ │ │ ├── mask_rcnn_R_50_FPN_200ep.py
│ │ │ ├── mask_rcnn_R_50_FPN_400ep.py
│ │ │ ├── mask_rcnn_regnetx_4gf_dds_FPN_100ep.py
│ │ │ ├── mask_rcnn_regnetx_4gf_dds_FPN_200ep.py
│ │ │ ├── mask_rcnn_regnetx_4gf_dds_FPN_400ep.py
│ │ │ ├── mask_rcnn_regnety_4gf_dds_FPN_100ep.py
│ │ │ ├── mask_rcnn_regnety_4gf_dds_FPN_200ep.py
│ │ │ └── mask_rcnn_regnety_4gf_dds_FPN_400ep.py
│ │ ├── d2main
│ │ │ ├── cascade_mask_rcnn_R_50_FPN_3x.py
│ │ │ ├── mask_rcnn_R_50_FPN_3x.py
│ │ │ ├── scratch_mask_rcnn_R_50_FPN_9x_gn.py
│ │ │ └── scratch_mask_rcnn_R_50_FPN_9x_syncbn.py
│ │ ├── mask2former
│ │ │ ├── maskformer2_R101_bs16_50ep.py
│ │ │ ├── maskformer2_R50_bs16_50ep.py
│ │ │ ├── maskformer2_swin_base_384_bs16_50ep.py
│ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.py
│ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.py
│ │ │ ├── maskformer2_swin_small_bs16_50ep.py
│ │ │ └── maskformer2_swin_tiny_bs16_50ep.py
│ │ ├── mvitv2
│ │ │ ├── cascade_mask_rcnn_mvitv2_b_3x.py
│ │ │ ├── cascade_mask_rcnn_mvitv2_b_in21k_3x.py
│ │ │ ├── cascade_mask_rcnn_mvitv2_h_in21k_lsj_3x.py
│ │ │ ├── cascade_mask_rcnn_mvitv2_s_3x.py
│ │ │ ├── cascade_mask_rcnn_mvitv2_t_3x.py
│ │ │ └── mask_rcnn_mvitv2_t_3x.py
│ │ └── vitdet
│ │   ├── cascade_mask_rcnn_mvitv2_b_in21k_100ep.py
│ │   ├── cascade_mask_rcnn_mvitv2_h_in21k_36ep.py
│ │   ├── cascade_mask_rcnn_mvitv2_l_in21k_50ep.py
│ │   ├── cascade_mask_rcnn_swin_b_in21k_50ep.py
│ │   ├── cascade_mask_rcnn_swin_l_in21k_50ep.py
│ │   ├── cascade_mask_rcnn_vitdet_b_100ep.py
│ │   ├── cascade_mask_rcnn_vitdet_h_75ep.py
│ │   ├── cascade_mask_rcnn_vitdet_l_100ep.py
│ │   ├── mask_rcnn_vitdet_b_100ep.py
│ │   ├── mask_rcnn_vitdet_h_75ep.py
│ │   └── mask_rcnn_vitdet_l_100ep.py
│ ├── data
│ │ ├── builtin.py
│ │ ├── lvis.py
│ │ └── manual_rem.py
│ ├── mask_visualizer.py
│ ├── modeling
│ │ ├── convnext.py
│ │ ├── rcnn_refiner.py
│ │ └── sam_refiner.py
│ └── trainer.py
├── images
│ ├── coco_rem_example_1.jpg
│ └── coco_rem_example_2.jpg
├── requirements.txt
├── scripts
│ ├── correct_labeling_errors.py
│ ├── merge_instances.py
│ ├── refine_boundaries.py
│ ├── train_net.py
│ └── visualize_coco.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.diff
2 |
3 | # compilation and distribution
4 | __pycache__
5 | _ext
6 | *.pyc
7 | *.pyd
8 | *.so
9 | *.dll
10 | *.egg-info/
11 | build/
12 | dist/
13 | wheels/
14 |
15 | # Python virtual environments.
16 | .env
17 | .venv
18 | env/
19 | venv/
20 | ENV/
21 | env.bak/
22 | venv.bak/
23 |
24 | # Jupyter Notebook
25 | .ipynb_checkpoints
26 | /.virtual_documents
27 |
28 | # IPython
29 | profile_default/
30 | ipython_config.py
31 |
32 |
33 | # pytorch/python/numpy formats
34 | *.pth
35 | *.pkl
36 | *.npy
37 | *.pt
38 |
39 | # Editor temporaries
40 | *.swn
41 | *.swo
42 | *.swp
43 | *~
44 |
45 | # editor settings
46 | .idea
47 | .vscode
48 | _darcs
49 | pyrightconfig.json
50 |
51 | # project dirs
52 | datasets
53 | checkpoints
54 | output
55 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2024, Karan Desai.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
4 | associated documentation files (the "Software"), to deal in the Software without restriction,
5 | including without limitation the rights to use, copy, modify, merge, publish, distribute,
6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
7 | furnished to do so, subject to the following conditions:
8 |
9 | The above copyright notice and this permission notice shall be included in all copies or substantial
10 | portions of the Software.
11 |
12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
15 | OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # COCO-ReM (COCO with Refined Masks)
2 |
3 | [PyTorch](https://pytorch.org) [Hugging Face Dataset](https://huggingface.co/datasets/kdexd/coco-rem)
5 |
6 | [Shweta Singh](https://www.linkedin.com/in/shweta-singh-460154284/)†, [Aayan Yadav](https://www.linkedin.com/in/aayanyadav09/)†, [Jitesh Jain](https://praeclarumjj3.github.io/), [Humphrey Shi](https://www.humphreyshi.com/home), [Justin Johnson](https://web.eecs.umich.edu/~justincj/), [Karan Desai](https://kdexd.xyz/)
7 |
8 | † Equal Contribution
9 |
10 | [[`arxiv`](https://arxiv.org/abs/2403.18819)] [[`Dataset Website`](https://cocorem.xyz)]
11 |
12 | 
13 |
14 | Introducing COCO-ReM, a set of high-quality instance annotations for COCO images.
15 | COCO-ReM addresses imperfections prevalent in COCO-2017 such as coarse mask boundaries, non-exhaustive annotations,
16 | inconsistent handling of occlusions, and duplicate masks.
17 | Masks in COCO-ReM have visibly better quality than those in COCO-2017, as shown below.
18 |
19 | 
20 |
21 | ## Contents
22 |
23 | 1. [News](#news)
24 | 2. [Setup Instructions](#setup-instructions)
25 | 3. [Download COCO-ReM](#download-coco-rem)
26 | 4. [Mask Visualization](#mask-visualization)
27 | 5. [Evaluation using COCO-ReM](#evaluation-using-coco-rem)
28 | 6. [Training with COCO-ReM](#training-with-coco-rem)
29 | 7. [Annotation Pipeline](#annotation-pipeline)
30 | - [Stage 1: Mask Boundary Refinement (automatic step)](#stage-1-mask-boundary-refinement-automatic-step)
31 | - [Stage 2: Exhaustive Instance Annotation (automatic step)](#stage-2-exhaustive-instance-annotation-automatic-step)
32 | - [Stage 3: Correction of Labeling Errors](#stage-3-correction-of-labeling-errors)
33 | 8. [Citation](#citation)
34 |
35 | ## News
36 |
37 | - **[July 7, 2024]**: Dataset now available on [**HuggingFace**](https://huggingface.co/datasets/kdexd/coco-rem) and [**code**](https://github.com/kdexd/coco-rem) is public!
38 | - **[July 1, 2024]**: COCO-ReM is accepted to ECCV 2024!
39 | - **[March 27, 2024]**: [**Dataset website**](https://cocorem.xyz) and [**arXiv preprint**](https://arxiv.org/abs/2403.18819) are public!
40 |
41 | ## Setup Instructions
42 |
43 | Clone the repository, create a conda environment, and install all dependencies as follows:
44 |
45 | ```bash
46 | git clone https://github.com/kdexd/coco-rem.git && cd coco-rem
47 | conda create -n coco_rem python=3.10
48 | conda activate coco_rem
49 | ```
50 |
51 | Install PyTorch and `torchvision` following the instructions on [pytorch.org](https://pytorch.org).
52 | Install Detectron2, [instructions are available here](https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md).
53 | Then, install the dependencies:
54 |
55 | ```bash
56 | pip install -r requirements.txt
57 | pip install git+https://github.com/facebookresearch/segment-anything.git
58 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git
59 |
60 | python setup.py develop
61 | ```
62 |
63 | ## Download COCO-ReM
64 |
65 | COCO-ReM is hosted on Huggingface Datasets at [@kdexd/coco-rem](https://huggingface.co/datasets/kdexd/coco-rem).
66 | Download the annotation files:
67 |
68 | ```bash
69 | for name in trainrem valrem; do
70 | wget https://huggingface.co/datasets/kdexd/coco-rem/resolve/main/instances_$name.json.zip
71 | unzip instances_$name.json.zip
72 | done
73 | ```
74 |
75 | **Dataset organization:** COCO and COCO-ReM must be organized inside the `datasets` directory as follows.
76 |
77 | ```
78 | $PROJECT_ROOT/datasets
79 | - coco/
80 |   - train2017/        # Contains 118287 train images (.jpg files).
81 |   - val2017/          # Contains 5000 val images (.jpg files).
82 |   - annotations/
83 |     - instances_train2017.json
84 |     - instances_val2017.json
85 | - coco_rem/
86 |   - instances_trainrem.json
87 |   - instances_valrem.json
88 | - lvis/
89 |   - lvis_v1_val.json
90 |   - lvis_v1_train.json
91 | ```
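
The `coco_rem_train` and `coco_rem_val` dataset names used in the commands below are presumably registered when the `coco_rem` package is imported (see `coco_rem/data/builtin.py`). As a rough sketch of what such registration looks like with Detectron2's standard `register_coco_instances` helper, assuming the layout above:

```python
# Illustrative only -- the repository performs its own registration.
from detectron2.data.datasets import register_coco_instances

register_coco_instances(
    "coco_rem_train", {},
    "datasets/coco_rem/instances_trainrem.json", "datasets/coco/train2017",
)
register_coco_instances(
    "coco_rem_val", {},
    "datasets/coco_rem/instances_valrem.json", "datasets/coco/val2017",
)
```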
92 |
93 | -----
94 |
95 | ## Mask Visualization
96 |
97 | We include a lightweight script to quickly visualize masks of COCO-ReM and COCO-2017,
98 | both validation and training sets. For example, run the following command to visualize
99 | the masks for the COCO-ReM validation set:
100 |
101 | ```bash
102 | python scripts/visualize_coco.py \
103 | --input-json datasets/coco_rem/instances_valrem.json \
104 | --image-dir datasets/coco/val2017 \
105 | --output visualization_output
106 | ```
107 |
108 | Read the documentation (`python scripts/visualize_coco.py --help`) for details about other arguments.
109 |
110 | -----
111 |
112 | ## Evaluation using COCO-ReM
113 |
114 | We support evaluation of all fifty object detectors benchmarked in the paper.
115 | First, run `python checkpoints/download.py` to download all the pre-trained models
116 | from their official repositories and save them in `checkpoints/pretrained_weights`.
117 |
118 | For example, to evaluate a [Mask R-CNN ViTDet-B model](https://arxiv.org/abs/2203.16527) using 8 GPUs
119 | and calculate average precision (AP) metrics, run the following command:
120 |
121 | ```bash
122 | python scripts/train_net.py --num-gpus 8 --eval-only \
123 | --config coco_rem/configs/vitdet/mask_rcnn_vitdet_b_100ep.py \
124 | train.init_checkpoint=checkpoints/pretrained_weights/vitdet/mask_rcnn_vitdet_b_100ep.pkl \
125 | dataloader.test.dataset.names=coco_rem_val \
126 | train.output_dir=evaluation_results
127 | ```
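
To compare against the original annotations, presumably the same command can be run with `dataloader.test.dataset.names=coco_2017_val`, the COCO-2017 validation split that Detectron2 registers by default.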
128 |
129 | ## Training with COCO-ReM
130 |
131 | We also support training ViTDet baselines on COCO-ReM using the Detectron2 library.
132 | Run the following command to train using 8 GPUs (with at least 32GB memory):
133 |
134 | ```bash
135 | python scripts/train_net.py --num-gpus 8 \
136 | --config coco_rem/configs/vitdet/mask_rcnn_vitdet_b_100ep.py \
137 | dataloader.train.dataset.names=coco_rem_train \
138 | dataloader.test.dataset.names=coco_rem_val \
139 | train.output_dir=training_output \
140 | dataloader.train.total_batch_size=16 train.grad_accum_steps=4
141 | ```
142 |
143 | For GPUs with less memory, adjust the parameters in the last line above:
144 | halving the batch size and doubling the gradient accumulation steps (for example, `dataloader.train.total_batch_size=8 train.grad_accum_steps=8`) keeps the effective batch size at 64 and gives the same results.
145 |
146 | ## Annotation Pipeline
147 |
148 |
149 | ### Stage 1: Mask Boundary Refinement (automatic step)
150 |
151 | Download the SAM checkpoint from the [segment-anything repository](https://github.com/facebookresearch/segment-anything) and place it in the `checkpoint` folder.
152 |
153 | Run the following command to refine the boundaries of validation set masks using 8 GPUs:
154 |
155 | ```bash
156 | python scripts/refine_boundaries.py \
157 | --input-json datasets/coco/annotations/instances_val2017.json \
158 | --image-dir datasets/coco/val2017 \
159 | --num-gpus 8 \
160 | --output datasets/intermediate/cocoval_boundary_refined.json
161 | ```
162 |
163 | Read the documentation (`python scripts/refine_boundaries.py --help`) for details about other arguments.
164 |
165 | Use the default values for the other optional arguments to follow the strategy used in the [paper](https://arxiv.org/abs/2403.18819).
166 |
167 | Run this stage on both the COCO and LVIS datasets before the merging stage.
168 |
169 |
170 |
171 | ### Stage 2: Exhaustive Instance Annotation (automatic step)
172 |
173 | Run the following command to merge LVIS annotations into the COCO validation set using the strategy described in the [paper](https://arxiv.org/abs/2403.18819):
174 |
175 | ```bash
176 | python scripts/merge_instances.py \
177 | --coco-json datasets/intermediate/cocoval_boundary_refined.json \
178 | --lvis-json datasets/intermediate/lvistrain_boundary_refined.json datasets/intermediate/lvisval_boundary_refined.json \
179 | --split val \
180 | --output datasets/intermediate/cocoval_lvis_merged.json
181 | ```
182 | Read the documentation (`python scripts/merge_instances.py --help`) for details about the above arguments.
183 |
184 | Handpicked non-exhaustive `(image, category)` instances from LVIS are merged into the validation set by the script in the next stage.
185 |
186 |
187 |
188 | ### Stage 3: Correction of Labeling Errors
189 |
190 | This stage is performed only for the validation set.
191 |
192 | ```bash
193 | python scripts/correct_labeling_errors.py \
194 | --input datasets/intermediate/cocoval_lvis_merged.json \
195 | --output datasets/cocoval_refined.json
196 | ```
197 | **Note**: To obtain COCO-ReM from the above JSON, the manual parts of Stage 1 and Stage 2 must also be performed.
198 |
199 | ## Citation
200 |
201 | If you find COCO-ReM useful in your research, please consider starring ⭐ the repository on GitHub and citing 📚 the paper!
202 |
203 | ```bibtex
204 | @inproceedings{cocorem,
205 | title={Benchmarking Object Detectors with COCO: A New Path Forward},
206 | author={Singh, Shweta and Yadav, Aayan and Jain, Jitesh and Shi, Humphrey and Johnson, Justin and Desai, Karan},
207 | booktitle={ECCV},
208 | year={2024}
209 | }
210 | ```
211 |
--------------------------------------------------------------------------------
/coco_rem/coco_evaluator.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from __future__ import annotations
3 |
4 | import contextlib
5 | import copy
6 | import io
7 | import itertools
8 | import json
9 | import logging
10 | import os
11 | from collections import OrderedDict
12 |
13 | import detectron2.utils.comm as comm
14 | import numpy as np
15 | import pycocotools.mask as mask_util
16 | import torch
17 | from boundary_iou.coco_instance_api.coco import COCO
18 | from boundary_iou.coco_instance_api.cocoeval import COCOeval
19 | from detectron2.data import MetadataCatalog
20 | from detectron2.evaluation.evaluator import DatasetEvaluator
21 | from detectron2.structures import BoxMode, Instances
22 | from detectron2.utils.file_io import PathManager
23 | from detectron2.utils.logger import create_small_table
24 | from tabulate import tabulate
25 |
26 |
27 | class COCOReMEvaluator(DatasetEvaluator):
28 | """
29 | Evaluate AP for COCO instance segmentation. The metrics range from 0 to 100
30 | (instead of 0 to 1), where a -1 or NaN means the metric cannot be computed
31 | (e.g. due to no predictions made).
32 |
33 | See http://cocodataset.org/#detection-eval
34 |
35 | This implementation is functionally the same as the original COCO evaluator of
36 | Detectron2 (:class:`detectron2.evaluation.COCOEvaluator`), except for a few API
37 | and behavioral differences:
38 |
39 | 1. Only `Mask AP` and `Boundary AP` are supported, other metrics like `Box AP`
40 | and `Keypoint AP` are neither supported, nor calculated.
41 |
42 | 2. Max detections per image are always `[1, 10, 100]`, following the official COCO
43 | evaluation protocol; these are not customizable.
44 |
45 | 3. The official COCO evaluation API is used for calculating metrics, unlike
46 | Detectron2 that also allows using a fast, yet unofficial implementation.
47 | Hence, the calculated AP is suitable to report in research papers.
48 | """
49 |
50 | def __init__(self, dataset_name: str, distributed: bool = True, output_dir=None):
51 | """
52 | Args:
53 | dataset_name: Name of the dataset to be evaluated. It must have
54 | registered metadata with a field named `json_file`, which is a path
55 | to the COCO-format annotation file.
56 | distributed: If True, will collect results from all ranks and run
57 | evaluation in the main process. Otherwise, will only evaluate
58 | the results in the current process.
59 | output_dir: An optional path to output directory where all results
60 | will be dumped as two files:
61 |
62 | 1. "instances_predictions.pth" a file that can be loaded with
63 | `torch.load` and contains all the results in the format they
64 | are produced by the model.
65 | 2. "coco_instances_results.json" in COCO result format.
66 | """
67 | self._logger = logging.getLogger(__name__)
68 | self._distributed = distributed
69 | self._output_dir = output_dir
70 | self._cpu_device = torch.device("cpu")
71 |
72 | self._metadata = MetadataCatalog.get(dataset_name)
73 | json_file = PathManager.get_local_path(self._metadata.json_file)
74 |
75 | with contextlib.redirect_stdout(io.StringIO()):
76 | self._coco_api = COCO(json_file)
77 |
78 | def reset(self):
79 | self._predictions = []
80 |
81 | def process(self, inputs, outputs):
82 | """
83 | Args:
84 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
85 | It is a list of dict. Each dict corresponds to an image and
86 | contains keys like "height", "width", "file_name", "image_id".
87 | outputs: the outputs of a COCO model. It is a list of dicts with key
88 | "instances" that contains :class:`Instances`.
89 | """
90 | for input, output in zip(inputs, outputs):
91 | prediction = {"image_id": input["image_id"]}
92 |
93 | if "instances" in output:
94 | instances = output["instances"].to(self._cpu_device)
95 | prediction["instances"] = instances_to_coco_json(
96 | instances, input["image_id"]
97 | )
98 | if "proposals" in output:
99 | prediction["proposals"] = output["proposals"].to(self._cpu_device)
100 | if len(prediction) > 1:
101 | self._predictions.append(prediction)
102 |
103 | def evaluate(self, img_ids=None):
104 | """
105 | Args:
106 | img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
107 | """
108 | if self._distributed:
109 | comm.synchronize()
110 | predictions = comm.gather(self._predictions, dst=0)
111 | predictions = list(itertools.chain(*predictions))
112 |
113 | if not comm.is_main_process():
114 | return {}
115 | else:
116 | predictions = self._predictions
117 |
118 | if len(predictions) == 0:
119 | self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
120 | return {}
121 |
122 | if self._output_dir:
123 | PathManager.mkdirs(self._output_dir)
124 | file_path = os.path.join(self._output_dir, "instances_predictions.pth")
125 | with PathManager.open(file_path, "wb") as f:
126 | torch.save(predictions, f)
127 |
128 | self._results = OrderedDict()
129 | if "instances" in predictions[0]:
130 | self._eval_predictions(predictions, img_ids=img_ids)
131 | # Copy so the caller can do whatever with results
132 | return copy.deepcopy(self._results)
133 |
134 | def _eval_predictions(self, predictions, img_ids=None):
135 | """
136 | Evaluate predictions. Fill self._results with the metrics of the tasks.
137 | """
138 | self._logger.info("Preparing results for COCO format ...")
139 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
140 |
141 | # unmap the category ids for COCO
142 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
143 | dataset_id_to_contiguous_id = (
144 | self._metadata.thing_dataset_id_to_contiguous_id
145 | )
146 | all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
147 | num_classes = len(all_contiguous_ids)
148 | assert (
149 | min(all_contiguous_ids) == 0
150 | and max(all_contiguous_ids) == num_classes - 1
151 | )
152 |
153 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
154 | for result in coco_results:
155 | category_id = result["category_id"]
156 | assert category_id < num_classes, (
157 | f"A prediction has class={category_id}, "
158 | f"but the dataset only has {num_classes} classes and "
159 | f"predicted class id should be in [0, {num_classes - 1}]."
160 | )
161 | result["category_id"] = reverse_id_mapping[category_id]
162 |
163 | if self._output_dir:
164 | file_path = os.path.join(self._output_dir, "coco_instances_results.json")
165 | self._logger.info("Saving results to {}".format(file_path))
166 | with PathManager.open(file_path, "w") as f:
167 | f.write(json.dumps(coco_results))
168 | f.flush()
169 |
170 | self._logger.info("Evaluating predictions with official COCO API...")
171 |
172 | for task in ["segm", "boundary"]:
173 | coco_eval = (
174 | _evaluate_predictions_on_coco(
175 | self._coco_api, coco_results, task, img_ids=img_ids
176 | )
177 | if len(coco_results) > 0
178 | else None # cocoapi does not handle empty results very well
179 | )
180 |
181 | res = self._derive_coco_results(
182 | coco_eval, task, class_names=self._metadata.get("thing_classes")
183 | )
184 | self._results[task] = res
185 |
186 | def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
187 | """
188 | Derive the desired score numbers from summarized COCOeval.
189 | """
190 |
191 | metrics = [
192 | "AP",
193 | "AP50",
194 | "AP75",
195 | "AP80",
196 | "AP85",
197 | "AP90",
198 | "AP95",
199 | "APs",
200 | "APm",
201 | "APl",
202 | ]
203 | if coco_eval is None:
204 | self._logger.warn("No predictions from the model!")
205 | return {metric: float("nan") for metric in metrics}
206 |
207 | # the standard metrics
208 | results = {
209 | metric: float(
210 | coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan"
211 | )
212 | for idx, metric in enumerate(metrics)
213 | }
214 | self._logger.info(
215 | "Evaluation results for {}: \n".format(iou_type)
216 | + create_small_table(results)
217 | )
218 | if not np.isfinite(sum(results.values())):
219 | self._logger.info("Some metrics cannot be computed and is shown as NaN.")
220 |
221 | if class_names is None or len(class_names) <= 1:
222 | return results
223 |
224 | # Compute per-category AP
225 | precisions = coco_eval.eval["precision"]
226 | # precision has dims (iou, recall, cls, area range, max dets)
227 | assert len(class_names) == precisions.shape[2]
228 |
229 | results_per_category = []
230 | for idx, name in enumerate(class_names):
231 | # area range index 0: all area ranges
232 | # max dets index -1: typically 100 per image
233 | precision = precisions[:, :, idx, 0, -1]
234 | precision = precision[precision > -1]
235 | ap = np.mean(precision) if precision.size else float("nan")
236 | results_per_category.append(("{}".format(name), float(ap * 100)))
237 |
238 | # tabulate it
239 | N_COLS = min(6, len(results_per_category) * 2)
240 | results_flatten = list(itertools.chain(*results_per_category))
241 | results_2d = itertools.zip_longest(
242 | *[results_flatten[i::N_COLS] for i in range(N_COLS)]
243 | )
244 | table = tabulate(
245 | results_2d,
246 | tablefmt="pipe",
247 | floatfmt=".3f",
248 | headers=["category", "AP"] * (N_COLS // 2),
249 | numalign="left",
250 | )
251 | self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
252 |
253 | results.update({"AP-" + name: ap for name, ap in results_per_category})
254 | return results
255 |
256 |
257 | def instances_to_coco_json(instances: Instances, img_id: int) -> list[dict]:
258 | """
259 | Dump an "Instances" object to a COCO-format json that's used for evaluation.
260 | """
261 | num_instance = len(instances)
262 | if num_instance == 0:
263 | return []
264 |
265 | boxes = instances.pred_boxes.tensor.numpy()
266 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
267 | boxes = boxes.tolist()
268 | scores = instances.scores.tolist()
269 | classes = instances.pred_classes.tolist()
270 |
271 | has_mask = instances.has("pred_masks")
272 | if has_mask:
273 | # use RLE to encode the masks, because they are too large and takes memory
274 | # since this evaluator stores outputs of the entire dataset
275 | rles = [
276 | mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
277 | for mask in instances.pred_masks
278 | ]
279 | for rle in rles:
280 | # "counts" is an array encoded by mask_util as a byte-stream. Python3's
281 | # json writer which always produces strings cannot serialize a bytestream
282 | # unless you decode it. Thankfully, utf-8 works out (which is also what
283 | # the pycocotools/_mask.pyx does).
284 | rle["counts"] = rle["counts"].decode("utf-8")
285 |
286 | results = []
287 | for k in range(num_instance):
288 | result = {
289 | "image_id": img_id,
290 | "category_id": classes[k],
291 | "bbox": boxes[k],
292 | "score": scores[k],
293 | }
294 | if has_mask:
295 | result["segmentation"] = rles[k]
296 | results.append(result)
297 | return results
298 |
299 |
300 | class COCOevalHighIoU(COCOeval):
301 | def summarize(self):
302 | """
303 | Compute and display summary metrics for evaluation results including AP
304 | with higher IOU thresholds (0.9 and 0.95).
305 | """
306 |
307 | def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
308 | p = self.params
309 | p.iouThrs = np.array(
310 | [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
311 | )
312 |
313 | iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
314 | titleStr = "Average Precision" if ap == 1 else "Average Recall"
315 | typeStr = "(AP)" if ap == 1 else "(AR)"
316 | iouStr = (
317 | "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
318 | if iouThr is None
319 | else "{:0.2f}".format(iouThr)
320 | )
321 |
322 | aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
323 | mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
324 | if ap == 1:
325 | # dimension of precision: [TxRxKxAxM]
326 | s = self.eval["precision"]
327 | # IoU
328 | if iouThr is not None:
329 | t = np.where(iouThr == p.iouThrs)[0]
330 | s = s[t]
331 | s = s[:, :, :, aind, mind]
332 | else:
333 | # dimension of recall: [TxKxAxM]
334 | s = self.eval["recall"]
335 | if iouThr is not None:
336 | t = np.where(iouThr == p.iouThrs)[0]
337 | s = s[t]
338 | s = s[:, :, aind, mind]
339 | if len(s[s > -1]) == 0:
340 | mean_s = -1
341 | else:
342 | mean_s = np.mean(s[s > -1])
343 | print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
344 | return mean_s
345 |
346 | def _summarizeDets():
347 | stats = np.zeros((16,))
348 | stats[0] = _summarize(1, maxDets=self.params.maxDets[2])
349 | stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
350 | stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
351 | stats[3] = _summarize(1, iouThr=0.80, maxDets=self.params.maxDets[2])
352 | stats[4] = _summarize(1, iouThr=0.85, maxDets=self.params.maxDets[2])
353 | stats[5] = _summarize(1, iouThr=0.90, maxDets=self.params.maxDets[2])
354 | stats[6] = _summarize(1, iouThr=0.95, maxDets=self.params.maxDets[2])
355 | stats[7] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2])
356 | stats[8] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2])
357 | stats[9] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2])
358 | stats[10] = _summarize(0, maxDets=self.params.maxDets[0])
359 | stats[11] = _summarize(0, maxDets=self.params.maxDets[1])
360 | stats[12] = _summarize(0, maxDets=self.params.maxDets[2])
361 | stats[13] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2])
362 | stats[14] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2])
363 | stats[15] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2])
364 | return stats
365 |
366 | if not self.eval:
367 | raise Exception("Please run accumulate() first")
368 |
369 | self.stats = _summarizeDets()
370 |
371 | def __str__(self):
372 | self.summarize()
373 |
374 |
375 | def _evaluate_predictions_on_coco(coco_gt, coco_results, iou_type, img_ids=None):
376 | """
377 | Evaluate the coco results using COCOEval API.
378 | """
379 | assert len(coco_results) > 0
380 |
381 | if iou_type in {"segm", "boundary"}:
382 | coco_results = copy.deepcopy(coco_results)
383 | # When evaluating mask AP, if the results contain bbox, cocoapi will
384 | # use the box area as the area of the instance, instead of the mask area.
385 | # This leads to a different definition of small/medium/large.
386 | # We remove the bbox field to let mask AP use mask area.
387 | for c in coco_results:
388 | c.pop("bbox", None)
389 |
390 | coco_dt = coco_gt.loadRes(coco_results)
391 | coco_eval = COCOevalHighIoU(coco_gt, coco_dt, iou_type)
392 |
393 | if img_ids is not None:
394 | coco_eval.params.imgIds = img_ids
395 |
396 | coco_eval.evaluate()
397 | coco_eval.accumulate()
398 | coco_eval.summarize()
399 |
400 | return coco_eval
401 |
--------------------------------------------------------------------------------
/coco_rem/configs/README.md:
--------------------------------------------------------------------------------
1 | # Model Configs for Benchmarking
2 |
3 | Each sub-directory contains Detectron2 config files (`LazyConfig` format) for
4 | all model checkpoints from public GitHub repos built with Detectron2.
5 |
6 | - `d2main`: Detectron2 model zoo (initial baselines).
7 | - `d2lsj`: Detectron2 model zoo (new LSJ baselines).
8 | - `vitdet`: https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet
9 | - `convnext`: https://github.com/facebookresearch/convnext
10 | - `mvitv2`: https://github.com/facebookresearch/detectron2/tree/main/projects/MViTv2
11 | - `mask2former`: https://github.com/facebookresearch/Mask2Former
12 |
13 | Additionally, `common` directory has config objects that are shared across many
14 | config files.
15 |
16 | ### Note on config structure
17 |
18 | Detectron2 lazy configs are described in the official Detectron2 documentation
19 | [here](https://detectron2.readthedocs.io/en/latest/tutorials/lazyconfigs.html).
20 | Each config file requires five objects: `dataloader`, `model`, `optimizer`,
21 | `lr_multiplier`, `train`. Some configs may exclude two objects that are not
22 | required for evaluation - `optimizer` and `lr_multiplier`.
23 |
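
For reference, a minimal sketch of loading one of these lazy configs and instantiating its objects with Detectron2's `LazyConfig` API (the config path and override below are illustrative, and model weights still need to be loaded separately, e.g. with `DetectionCheckpointer`):

```python
from detectron2.config import LazyConfig, instantiate

cfg = LazyConfig.load("coco_rem/configs/vitdet/mask_rcnn_vitdet_b_100ep.py")
# Dotted overrides, analogous to those passed on the command line:
cfg = LazyConfig.apply_overrides(cfg, ["dataloader.test.dataset.names=coco_rem_val"])

model = instantiate(cfg.model)                     # build the model
test_loader = instantiate(cfg.dataloader.test)     # build the test dataloader
evaluator = instantiate(cfg.dataloader.evaluator)  # COCOReMEvaluator
```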
--------------------------------------------------------------------------------
/coco_rem/configs/common/coco_schedule.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.solver import WarmupParamScheduler
3 | from fvcore.common.param_scheduler import MultiStepParamScheduler
4 |
5 |
6 | def default_lsj_epoch_scheduler(epochs: int):
7 | """
8 | Returns the config for a default multi-step LR scheduler that runs for a fixed
9 | number of COCO epochs, typically used with models trained with the "LSJ" recipe
10 | (large-scale jittering augmentation and 50-400 epoch schedules).
11 | """
12 |
13 | coco_100ep_iter = 184375
14 | coco_curr_iter = coco_100ep_iter * epochs // 100
15 |
16 | coco_100ep_milestones = [163889, 177546]
17 | coco_curr_milestones = [x * epochs // 100 for x in coco_100ep_milestones]
18 |
19 | lr_multiplier = L(WarmupParamScheduler)(
20 | scheduler=L(MultiStepParamScheduler)(
21 | values=[1.0, 0.1, 0.01],
22 | milestones=coco_curr_milestones,
23 | num_updates=coco_curr_iter,
24 | ),
25 | warmup_length=250 / coco_curr_iter,
26 | warmup_factor=0.001,
27 | )
28 | return lr_multiplier
29 |
30 |
31 | lr_multiplier_75ep = default_lsj_epoch_scheduler(75)
32 | lr_multiplier_100ep = default_lsj_epoch_scheduler(100)
33 | lr_multiplier_200ep = default_lsj_epoch_scheduler(200)
34 | lr_multiplier_400ep = default_lsj_epoch_scheduler(400)
35 |
--------------------------------------------------------------------------------
/coco_rem/configs/common/data/coco.py:
--------------------------------------------------------------------------------
1 | import detectron2.data.transforms as T
2 | from detectron2.config import LazyCall as L
3 | from detectron2.data import (
4 | DatasetMapper,
5 | build_detection_test_loader,
6 | build_detection_train_loader,
7 | get_detection_dataset_dicts,
8 | )
9 | from omegaconf import OmegaConf
10 |
11 | from coco_rem.coco_evaluator import COCOReMEvaluator
12 |
13 | dataloader = OmegaConf.create()
14 |
15 | # Mapper with large-scale jittering (LSJ) augmentation.
16 | image_size = 1024
17 |
18 | dataloader.train = L(build_detection_train_loader)(
19 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"),
20 | mapper=L(DatasetMapper)(
21 | is_train=True,
22 | augmentations=[
23 | L(T.RandomFlip)(horizontal=True), # flip first
24 | L(T.ResizeScale)(
25 | min_scale=0.1,
26 | max_scale=2.0,
27 | target_height=image_size,
28 | target_width=image_size,
29 | ),
30 | L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False),
31 | ],
32 | image_format="RGB",
33 | use_instance_mask=True,
34 | instance_mask_format="bitmask",
35 | recompute_boxes=True,
36 | ),
37 | total_batch_size=64,
38 | num_workers=4,
39 | )
40 |
41 | # Resize shortest edge to 1024 pixels.
42 | dataloader.test = L(build_detection_test_loader)(
43 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False),
44 | mapper=L(DatasetMapper)(
45 | is_train=False,
46 | augmentations=[
47 | L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size),
48 | ],
49 | image_format="${...train.mapper.image_format}",
50 | ),
51 | num_workers=4,
52 | )
53 |
54 | # Update: Custom COCO evaluator that returns exactly the same results as the default
55 | # evaluator, while additionally reporting higher-IoU AP (AP80-AP95) and Boundary AP.
56 | dataloader.evaluator = L(COCOReMEvaluator)(
57 | dataset_name="${..test.dataset.names}",
58 | output_dir="${...train.output_dir}",
59 | )
60 |
--------------------------------------------------------------------------------
/coco_rem/configs/common/data/constants.py:
--------------------------------------------------------------------------------
1 | constants = dict(
2 | imagenet_rgb256_mean=[123.675, 116.28, 103.53],
3 | imagenet_rgb256_std=[58.395, 57.12, 57.375],
4 | imagenet_bgr256_mean=[103.530, 116.280, 123.675],
5 | # When using pre-trained models in Detectron1 or any MSRA models,
6 | # std has been absorbed into its conv1 weights, so the std needs to be set 1.
7 | # Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
8 | imagenet_bgr256_std=[1.0, 1.0, 1.0],
9 | )
10 |
--------------------------------------------------------------------------------
/coco_rem/configs/common/models/cascade_rcnn.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling.box_regression import Box2BoxTransform
4 | from detectron2.modeling.matcher import Matcher
5 | from detectron2.modeling.roi_heads import (
6 | CascadeROIHeads,
7 | FastRCNNConvFCHead,
8 | FastRCNNOutputLayers,
9 | )
10 |
11 | from .mask_rcnn_fpn import model
12 |
13 | # arguments that don't exist for Cascade R-CNN
14 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]]
15 |
16 | model.roi_heads.update(
17 | _target_=CascadeROIHeads,
18 | box_heads=[
19 | L(FastRCNNConvFCHead)(
20 | input_shape=ShapeSpec(channels=256, height=7, width=7),
21 | conv_dims=[],
22 | fc_dims=[1024, 1024],
23 | )
24 | for k in range(3)
25 | ],
26 | box_predictors=[
27 | L(FastRCNNOutputLayers)(
28 | input_shape=ShapeSpec(channels=1024),
29 | test_score_thresh=0.05,
30 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
31 | cls_agnostic_bbox_reg=True,
32 | num_classes="${...num_classes}",
33 | )
34 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
35 | ],
36 | proposal_matchers=[
37 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
38 | for th in [0.5, 0.6, 0.7]
39 | ],
40 | )
41 |
--------------------------------------------------------------------------------
/coco_rem/configs/common/models/mask2former.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling.backbone import BasicStem, ResNet
4 | from mask2former.maskformer_model import MaskFormer
5 | from mask2former.modeling.meta_arch.mask_former_head import MaskFormerHead
6 | from mask2former.modeling.pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder
7 | from mask2former.modeling.transformer_decoder import MultiScaleMaskedTransformerDecoder
8 |
9 | from ..data.constants import constants
10 |
11 | model = L(MaskFormer)(
12 | backbone=L(ResNet)(
13 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
14 | stages=L(ResNet.make_default_stages)(
15 | depth=50,
16 | stride_in_1x1=False,
17 | norm="FrozenBN",
18 | ),
19 | out_features=["res2", "res3", "res4", "res5"],
20 | ),
21 | sem_seg_head=L(MaskFormerHead)(
22 | input_shape={
23 | "res2": L(ShapeSpec)(channels=256, stride=4),
24 | "res3": L(ShapeSpec)(channels=512, stride=8),
25 | "res4": L(ShapeSpec)(channels=1024, stride=16),
26 | "res5": L(ShapeSpec)(channels=2048, stride=32),
27 | },
28 | num_classes=80,
29 | pixel_decoder=L(MSDeformAttnPixelDecoder)(
30 | input_shape="${..input_shape}",
31 | transformer_dropout=0.0,
32 | transformer_nheads=8,
33 | transformer_dim_feedforward=1024,
34 | transformer_enc_layers=6,
35 | conv_dim=256,
36 | mask_dim=256,
37 | norm="GN",
38 | transformer_in_features=["res3", "res4", "res5"],
39 | common_stride=4,
40 | ),
41 | loss_weight=1.0,
42 | ignore_value=255,
43 | transformer_predictor=L(MultiScaleMaskedTransformerDecoder)(
44 | in_channels="${..pixel_decoder.conv_dim}",
45 | mask_classification=True,
46 | num_classes="${..num_classes}",
47 | hidden_dim="${..pixel_decoder.conv_dim}",
48 | num_queries="${...num_queries}",
49 | nheads=8,
50 | dim_feedforward=2048,
51 | dec_layers=9,
52 | pre_norm=False,
53 | mask_dim="${..pixel_decoder.mask_dim}",
54 | enforce_input_project=False,
55 | ),
56 | transformer_in_feature="multi_scale_pixel_decoder",
57 | ),
58 | criterion=None,
59 | num_queries=100,
60 | metadata=None,
61 | size_divisibility=32,
62 | sem_seg_postprocess_before_inference=True,
63 | object_mask_threshold=0.8,
64 | overlap_threshold=0.8,
65 | instance_on=True,
66 | semantic_on=False,
67 | panoptic_on=False,
68 | pixel_mean=constants.imagenet_rgb256_mean,
69 | pixel_std=constants.imagenet_rgb256_std,
70 | test_topk_per_image=100,
71 | )
72 |
--------------------------------------------------------------------------------
/coco_rem/configs/common/models/mask_rcnn_fpn.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
4 | from detectron2.modeling.backbone import FPN, BasicStem, ResNet
5 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool
6 | from detectron2.modeling.box_regression import Box2BoxTransform
7 | from detectron2.modeling.matcher import Matcher
8 | from detectron2.modeling.meta_arch import GeneralizedRCNN
9 | from detectron2.modeling.poolers import ROIPooler
10 | from detectron2.modeling.proposal_generator import RPN, StandardRPNHead
11 | from detectron2.modeling.roi_heads import (
12 | FastRCNNConvFCHead,
13 | FastRCNNOutputLayers,
14 | MaskRCNNConvUpsampleHead,
15 | StandardROIHeads,
16 | )
17 |
18 | from ..data.constants import constants
19 |
20 | model = L(GeneralizedRCNN)(
21 | backbone=L(FPN)(
22 | bottom_up=L(ResNet)(
23 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
24 | stages=L(ResNet.make_default_stages)(
25 | depth=50,
26 | stride_in_1x1=True,
27 | norm="FrozenBN",
28 | ),
29 | out_features=["res2", "res3", "res4", "res5"],
30 | ),
31 | in_features="${.bottom_up.out_features}",
32 | out_channels=256,
33 | top_block=L(LastLevelMaxPool)(),
34 | ),
35 | proposal_generator=L(RPN)(
36 | in_features=["p2", "p3", "p4", "p5", "p6"],
37 | head=L(StandardRPNHead)(in_channels=256, num_anchors=3),
38 | anchor_generator=L(DefaultAnchorGenerator)(
39 | sizes=[[32], [64], [128], [256], [512]],
40 | aspect_ratios=[0.5, 1.0, 2.0],
41 | strides=[4, 8, 16, 32, 64],
42 | offset=0.0,
43 | ),
44 | anchor_matcher=L(Matcher)(
45 | thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True
46 | ),
47 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
48 | batch_size_per_image=256,
49 | positive_fraction=0.5,
50 | pre_nms_topk=(2000, 1000),
51 | post_nms_topk=(1000, 1000),
52 | nms_thresh=0.7,
53 | ),
54 | roi_heads=L(StandardROIHeads)(
55 | num_classes=80,
56 | batch_size_per_image=512,
57 | positive_fraction=0.25,
58 | proposal_matcher=L(Matcher)(
59 | thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False
60 | ),
61 | box_in_features=["p2", "p3", "p4", "p5"],
62 | box_pooler=L(ROIPooler)(
63 | output_size=7,
64 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
65 | sampling_ratio=0,
66 | pooler_type="ROIAlignV2",
67 | ),
68 | box_head=L(FastRCNNConvFCHead)(
69 | input_shape=ShapeSpec(channels=256, height=7, width=7),
70 | conv_dims=[],
71 | fc_dims=[1024, 1024],
72 | ),
73 | box_predictor=L(FastRCNNOutputLayers)(
74 | input_shape=ShapeSpec(channels=1024),
75 | test_score_thresh=0.05,
76 | box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)),
77 | num_classes="${..num_classes}",
78 | ),
79 | mask_in_features=["p2", "p3", "p4", "p5"],
80 | mask_pooler=L(ROIPooler)(
81 | output_size=14,
82 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
83 | sampling_ratio=0,
84 | pooler_type="ROIAlignV2",
85 | ),
86 | mask_head=L(MaskRCNNConvUpsampleHead)(
87 | input_shape=ShapeSpec(channels=256, width=14, height=14),
88 | num_classes="${..num_classes}",
89 | conv_dims=[256, 256, 256, 256, 256],
90 | ),
91 | ),
92 | pixel_mean=constants.imagenet_rgb256_mean,
93 | pixel_std=constants.imagenet_rgb256_std,
94 | input_format="RGB",
95 | )
96 |
--------------------------------------------------------------------------------
/coco_rem/configs/common/models/mask_rcnn_vitdet.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | import torch.nn as nn
4 | from detectron2.config import LazyCall as L
5 | from detectron2.modeling import SimpleFeaturePyramid, ViT
6 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool
7 |
8 | from .mask_rcnn_fpn import model
9 |
10 | # Base
11 | embed_dim, depth, num_heads, dp = 768, 12, 12, 0.1
12 | # Creates Simple Feature Pyramid from ViT backbone
13 | model.backbone = L(SimpleFeaturePyramid)(
14 | net=L(ViT)( # Single-scale ViT backbone
15 | img_size=1024,
16 | patch_size=16,
17 | embed_dim=embed_dim,
18 | depth=depth,
19 | num_heads=num_heads,
20 | drop_path_rate=dp,
21 | window_size=14,
22 | mlp_ratio=4,
23 | qkv_bias=True,
24 | norm_layer=partial(nn.LayerNorm, eps=1e-6),
25 | window_block_indexes=[
26 | # 2, 5, 8 11 for global attention
27 | 0,
28 | 1,
29 | 3,
30 | 4,
31 | 6,
32 | 7,
33 | 9,
34 | 10,
35 | ],
36 | residual_block_indexes=[],
37 | use_rel_pos=True,
38 | out_feature="last_feat",
39 | ),
40 | in_feature="${.net.out_feature}",
41 | out_channels=256,
42 | scale_factors=(4.0, 2.0, 1.0, 0.5),
43 | top_block=L(LastLevelMaxPool)(),
44 | norm="LN",
45 | square_pad=1024,
46 | )
47 |
48 | model.roi_heads.box_head.conv_norm = model.roi_heads.mask_head.conv_norm = "LN"
49 |
50 | # 2conv in RPN:
51 | model.proposal_generator.head.conv_dims = [-1, -1]
52 |
53 | # 4conv1fc box head
54 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256]
55 | model.roi_heads.box_head.fc_dims = [1024]
56 |
--------------------------------------------------------------------------------
/coco_rem/configs/common/optim.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from detectron2.config import LazyCall as L
3 | from detectron2.solver.build import get_default_optimizer_params
4 |
5 | SGD = L(torch.optim.SGD)(
6 | params=L(get_default_optimizer_params)(
7 | # params.model is meant to be set to the model object, before instantiating
8 | # the optimizer.
9 | weight_decay_norm=0.0
10 | ),
11 | lr=0.02,
12 | momentum=0.9,
13 | weight_decay=1e-4,
14 | )
15 |
16 |
17 | AdamW = L(torch.optim.AdamW)(
18 | params=L(get_default_optimizer_params)(
19 | # params.model is meant to be set to the model object, before instantiating
20 | # the optimizer.
21 | base_lr="${..lr}",
22 | weight_decay_norm=0.0,
23 | ),
24 | lr=1e-4,
25 | betas=(0.9, 0.999),
26 | weight_decay=0.1,
27 | )
28 |
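# Usage sketch (an assumption, mirroring Detectron2's lazy-config training loop):
# the `params.model` field above is filled in with the built model right before
# the optimizer is instantiated, e.g.
#
#     cfg.optimizer.params.model = model   # model = instantiate(cfg.model)
#     optim = instantiate(cfg.optimizer)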
--------------------------------------------------------------------------------
/coco_rem/configs/common/train.py:
--------------------------------------------------------------------------------
1 | # Common training-related configs that are designed for "scripts/train_net.py"
2 | # You can use your own instead, together with your own train_net.py
3 | train = dict(
4 | output_dir="./output",
5 | init_checkpoint="",
6 | max_iter=90000,
7 | amp=dict(enabled=True), # options for Automatic Mixed Precision
8 | ddp=dict( # options for DistributedDataParallel
9 | broadcast_buffers=False,
10 | find_unused_parameters=False,
11 | fp16_compression=False,
12 | ),
13 | checkpointer=dict(period=5000, max_to_keep=100), # options for PeriodicCheckpointer
14 | eval_period=5000,
15 | log_period=20,
16 | device="cuda"
17 | # ...
18 | )
19 |
--------------------------------------------------------------------------------
/coco_rem/configs/convnext/cascade_mask_rcnn_convnext_base_1k_3x.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling.box_regression import Box2BoxTransform
4 | from detectron2.modeling.roi_heads import FastRCNNConvFCHead, FastRCNNOutputLayers
5 |
6 | from coco_rem.modeling.convnext import ConvNeXt
7 |
8 | from ..common.data.coco import dataloader
9 | from ..common.data.constants import constants
10 | from ..common.models.cascade_rcnn import model
11 | from ..common.train import train
12 |
13 | model.backbone.bottom_up = L(ConvNeXt)(
14 | in_chans=3,
15 | depths=[3, 3, 27, 3],
16 | dims=[128, 256, 512, 1024],
17 | drop_path_rate=0.7,
18 | layer_scale_init_value=1.0,
19 | out_features=["res2", "res3", "res4", "res5"],
20 | )
21 |
22 | model.roi_heads.update(
23 | # 4conv1fc box heads with BatchNorm
24 | box_heads=[
25 | L(FastRCNNConvFCHead)(
26 | input_shape=ShapeSpec(channels=256, height=7, width=7),
27 | conv_dims=[256, 256, 256, 256],
28 | fc_dims=[1024],
29 | conv_norm="SyncBN",
30 | )
31 | for k in range(3)
32 | ],
33 | box_predictors=[
34 | L(FastRCNNOutputLayers)(
35 | input_shape=ShapeSpec(channels=1024),
36 | test_score_thresh=0.05,
37 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
38 | #
39 | # Cascade R-CNN implementation in Detectron2 has class-agnostic box reg
40 | # but checkpoints from ConvNext repo (MMDetection) use class-specific.
41 | cls_agnostic_bbox_reg=False,
42 | num_classes="${...num_classes}",
43 | )
44 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
45 | ],
46 | )
47 |
48 | train.init_checkpoint = None # Load externally.
49 | train.max_iter *= 3
50 |
--------------------------------------------------------------------------------
/coco_rem/configs/convnext/cascade_mask_rcnn_convnext_base_22k_3x.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train
2 |
3 | # This config is IDENTICAL to ConvNeXt-B (ImageNet-1K) - the only difference
4 | # is the pre-training dataset of the backbone (ImageNet-22K vs 1K), but weights in
5 | # `train.init_checkpoint` (to be provided externally) override everything.
6 |
--------------------------------------------------------------------------------
/coco_rem/configs/convnext/cascade_mask_rcnn_convnext_large_22k_3x.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train
2 |
3 | model.backbone.bottom_up.dims = [192, 384, 768, 1536]
4 | model.backbone.bottom_up.drop_path_rate = 0.7
5 |
--------------------------------------------------------------------------------
/coco_rem/configs/convnext/cascade_mask_rcnn_convnext_small_1k_3x.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train
2 |
3 | model.backbone.bottom_up.dims = [96, 192, 384, 768]
4 | model.backbone.bottom_up.drop_path_rate = 0.6
5 |
--------------------------------------------------------------------------------
/coco_rem/configs/convnext/cascade_mask_rcnn_convnext_tiny_1k_3x.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train
2 |
3 | model.backbone.bottom_up.depths = [3, 3, 9, 3]
4 | model.backbone.bottom_up.dims = [96, 192, 384, 768]
5 | model.backbone.bottom_up.drop_path_rate = 0.4
6 |
--------------------------------------------------------------------------------
/coco_rem/configs/convnext/cascade_mask_rcnn_convnext_xlarge_22k_3x.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train
2 |
3 | model.backbone.bottom_up.dims = [256, 512, 1024, 2048]
4 | model.backbone.bottom_up.drop_path_rate = 0.8
5 |
--------------------------------------------------------------------------------
/coco_rem/configs/convnext/mask_rcnn_convnext_tiny_1k_3x.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 |
3 | from coco_rem.modeling.convnext import ConvNeXt
4 |
5 | from ..common.data.coco import dataloader
6 | from ..common.data.constants import constants
7 | from ..common.models.mask_rcnn_fpn import model
8 | from ..common.train import train
9 |
10 | model.backbone.bottom_up = L(ConvNeXt)(
11 | in_chans=3,
12 | depths=[3, 3, 9, 3],
13 | dims=[96, 192, 384, 768],
14 | drop_path_rate=0.4,
15 | layer_scale_init_value=1.0,
16 | out_features=["res2", "res3", "res4", "res5"],
17 | )
18 |
19 | train.init_checkpoint = None # Load externally.
20 | train.max_iter *= 3
21 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_R_101_FPN_100ep.py:
--------------------------------------------------------------------------------
1 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train
2 |
3 | model.backbone.bottom_up.stages.depth = 101
4 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_R_101_FPN_200ep.py:
--------------------------------------------------------------------------------
1 | from .mask_rcnn_R_101_FPN_100ep import dataloader, model, train
2 |
3 | train.max_iter *= 2 # 100ep -> 200ep
4 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_R_101_FPN_400ep.py:
--------------------------------------------------------------------------------
1 | from .mask_rcnn_R_101_FPN_100ep import dataloader, model, train
2 |
3 | train.max_iter *= 4 # 100ep -> 400ep
4 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_R_50_FPN_100ep.py:
--------------------------------------------------------------------------------
1 | from detectron2.layers.batch_norm import NaiveSyncBatchNorm
2 |
3 | from ..common.coco_schedule import lr_multiplier_100ep as lr_multiplier
4 | from ..common.data.coco import dataloader
5 | from ..common.data.constants import constants
6 | from ..common.models.mask_rcnn_fpn import model
7 | from ..common.optim import SGD as optimizer
8 | from ..common.train import train
9 |
10 | dataloader.train.mapper.image_format = "BGR"
11 | model.pixel_mean = constants.imagenet_bgr256_mean
12 | model.pixel_std = constants.imagenet_bgr256_std
13 | model.input_format = "BGR"
14 |
15 | # train from scratch
16 | train.init_checkpoint = ""
17 | train.amp.enabled = True
18 | train.ddp.fp16_compression = True
19 | model.backbone.bottom_up.freeze_at = 0
20 |
21 | # SyncBN
22 | model.backbone.bottom_up.stem.norm = "SyncBN"
23 | model.backbone.bottom_up.stages.norm = "SyncBN"
24 | model.backbone.norm = "SyncBN"
25 |
26 | # Using NaiveSyncBatchNorm because heads may have empty input. That is not supported by
27 | # torch.nn.SyncBatchNorm. We can remove this after
28 | # https://github.com/pytorch/pytorch/issues/36530 is fixed.
29 | model.roi_heads.box_head.conv_norm = lambda c: NaiveSyncBatchNorm(c, stats_mode="N")
30 | model.roi_heads.mask_head.conv_norm = lambda c: NaiveSyncBatchNorm(c, stats_mode="N")
31 |
32 | # 2conv in RPN:
33 | model.proposal_generator.head.conv_dims = [-1, -1]
34 |
35 | # 4conv1fc box head
36 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256]
37 | model.roi_heads.box_head.fc_dims = [1024]
38 |
39 | # Equivalent to 100 epochs.
40 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep
41 | train.max_iter = 184375
42 |
43 | optimizer.lr = 0.1
44 | optimizer.weight_decay = 4e-5
45 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_R_50_FPN_200ep.py:
--------------------------------------------------------------------------------
1 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train
2 |
3 | train.max_iter *= 2 # 100ep -> 200ep
4 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_R_50_FPN_400ep.py:
--------------------------------------------------------------------------------
1 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train
2 |
3 | train.max_iter *= 4 # 100ep -> 400ep
4 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_regnetx_4gf_dds_FPN_100ep.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.modeling.backbone import RegNet
3 | from detectron2.modeling.backbone.regnet import ResBottleneckBlock, SimpleStem
4 |
5 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train
6 |
7 | # Config source:
8 | model.backbone.bottom_up = L(RegNet)(
9 | stem_class=SimpleStem,
10 | stem_width=32,
11 | block_class=ResBottleneckBlock,
12 | depth=23,
13 | w_a=38.65,
14 | w_0=96,
15 | w_m=2.43,
16 | group_width=40,
17 | norm="SyncBN",
18 | out_features=["s1", "s2", "s3", "s4"],
19 | )
20 | model.pixel_std = [57.375, 57.120, 58.395]
21 |
22 | # RegNets benefit from enabling cudnn benchmark mode
23 | train.cudnn_benchmark = True
24 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_regnetx_4gf_dds_FPN_200ep.py:
--------------------------------------------------------------------------------
1 | from .mask_rcnn_regnetx_4gf_dds_FPN_100ep import dataloader, model, train
2 |
3 | train.max_iter *= 2 # 100ep -> 200ep
4 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_regnetx_4gf_dds_FPN_400ep.py:
--------------------------------------------------------------------------------
1 | from .mask_rcnn_regnetx_4gf_dds_FPN_100ep import dataloader, model, train
2 |
3 | train.max_iter *= 4 # 100ep -> 400ep
4 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_regnety_4gf_dds_FPN_100ep.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.modeling.backbone import RegNet
3 | from detectron2.modeling.backbone.regnet import ResBottleneckBlock, SimpleStem
4 |
5 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train
6 |
7 | model.backbone.bottom_up = L(RegNet)(
8 | stem_class=SimpleStem,
9 | stem_width=32,
10 | block_class=ResBottleneckBlock,
11 | depth=22,
12 | w_a=31.41,
13 | w_0=96,
14 | w_m=2.24,
15 | group_width=64,
16 | se_ratio=0.25,
17 | norm="SyncBN",
18 | out_features=["s1", "s2", "s3", "s4"],
19 | )
20 | model.pixel_std = [57.375, 57.120, 58.395]
21 |
22 | # RegNets benefit from enabling cudnn benchmark mode
23 | train.cudnn_benchmark = True
24 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_regnety_4gf_dds_FPN_200ep.py:
--------------------------------------------------------------------------------
1 | from .mask_rcnn_regnety_4gf_dds_FPN_100ep import dataloader, model, train
2 |
3 | train.max_iter *= 2 # 100ep -> 200ep
4 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2lsj/mask_rcnn_regnety_4gf_dds_FPN_400ep.py:
--------------------------------------------------------------------------------
1 | from .mask_rcnn_regnety_4gf_dds_FPN_100ep import dataloader, model, train
2 |
3 | train.max_iter *= 4 # 100ep -> 400ep
4 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2main/cascade_mask_rcnn_R_50_FPN_3x.py:
--------------------------------------------------------------------------------
1 | from ..common.data.coco import dataloader
2 | from ..common.data.constants import constants
3 | from ..common.models.cascade_rcnn import model
4 | from ..common.train import train
5 |
6 | dataloader.train.mapper.image_format = "BGR"
7 | model.pixel_mean = constants.imagenet_bgr256_mean
8 | model.pixel_std = constants.imagenet_bgr256_std
9 | model.input_format = "BGR"
10 |
11 | train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
12 | train.max_iter *= 3
13 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2main/mask_rcnn_R_50_FPN_3x.py:
--------------------------------------------------------------------------------
1 | from ..common.data.coco import dataloader
2 | from ..common.data.constants import constants
3 | from ..common.models.mask_rcnn_fpn import model
4 | from ..common.train import train
5 |
6 | dataloader.train.mapper.image_format = "BGR"
7 | model.pixel_mean = constants.imagenet_bgr256_mean
8 | model.pixel_std = constants.imagenet_bgr256_std
9 | model.input_format = "BGR"
10 |
11 | model.backbone.bottom_up.freeze_at = 2
12 | train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
13 | train.max_iter *= 3
14 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2main/scratch_mask_rcnn_R_50_FPN_9x_gn.py:
--------------------------------------------------------------------------------
1 | from ..common.data.coco import dataloader
2 | from ..common.data.constants import constants
3 | from ..common.models.mask_rcnn_fpn import model
4 | from ..common.train import train
5 |
6 | dataloader.train.mapper.image_format = "BGR"
7 | model.pixel_mean = constants.imagenet_bgr256_mean
8 | model.pixel_std = constants.imagenet_bgr256_std
9 | model.input_format = "BGR"
10 |
11 | # Handle Caffe2 model specs:
12 | model.backbone.bottom_up.stages.stride_in_1x1 = False
13 | model.pixel_std = [57.375, 57.120, 58.395]
14 |
15 | model.backbone.bottom_up.stem.norm = "GN"
16 | model.backbone.bottom_up.stages.norm = "GN"
17 | model.backbone.norm = "GN"
18 | model.roi_heads.box_head.conv_norm = "GN"
19 | model.roi_heads.mask_head.conv_norm = "GN"
20 |
21 | # 4conv1fc box head
22 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256]
23 | model.roi_heads.box_head.fc_dims = [1024]
24 |
25 | dataloader.train.total_batch_size = 64
26 | train.max_iter *= 9
27 |
--------------------------------------------------------------------------------
/coco_rem/configs/d2main/scratch_mask_rcnn_R_50_FPN_9x_syncbn.py:
--------------------------------------------------------------------------------
1 | from ..common.data.coco import dataloader
2 | from ..common.data.constants import constants
3 | from ..common.models.mask_rcnn_fpn import model
4 | from ..common.train import train
5 |
6 | dataloader.train.mapper.image_format = "BGR"
7 | model.pixel_mean = constants.imagenet_bgr256_mean
8 | model.pixel_std = constants.imagenet_bgr256_std
9 | model.input_format = "BGR"
10 |
11 | # Handle Caffe2 model specs:
12 | model.backbone.bottom_up.stages.stride_in_1x1 = False
13 | model.pixel_std = [57.375, 57.120, 58.395]
14 |
15 | model.backbone.bottom_up.stem.norm = "SyncBN"
16 | model.backbone.bottom_up.stages.norm = "SyncBN"
17 | model.backbone.norm = "SyncBN"
18 | model.roi_heads.box_head.conv_norm = "SyncBN"
19 | model.roi_heads.mask_head.conv_norm = "SyncBN"
20 |
21 | # 4conv1fc box head
22 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256]
23 | model.roi_heads.box_head.fc_dims = [1024]
24 |
25 | dataloader.train.total_batch_size = 64
26 | train.max_iter *= 9
27 |
--------------------------------------------------------------------------------
/coco_rem/configs/mask2former/maskformer2_R101_bs16_50ep.py:
--------------------------------------------------------------------------------
1 | from .maskformer2_R50_bs16_50ep import dataloader, model, train
2 |
3 | model.backbone.stages.depth = 101
4 | train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
5 |
--------------------------------------------------------------------------------
/coco_rem/configs/mask2former/maskformer2_R50_bs16_50ep.py:
--------------------------------------------------------------------------------
1 | from ..common.data.coco import dataloader
2 | from ..common.models.mask2former import model
3 | from ..common.train import train
4 |
5 | # Initialization and trainer settings
6 | train.init_checkpoint = "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
7 |
8 | # Schedule
9 | # 50 ep = 368750 iters * 16 images/iter / 118000 images/ep
10 | dataloader.train.total_batch_size = 16
11 | train.max_iter = 368750
12 |
--------------------------------------------------------------------------------
/coco_rem/configs/mask2former/maskformer2_swin_base_384_bs16_50ep.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 |
4 | from .maskformer2_swin_tiny_bs16_50ep import dataloader, model, train
5 |
6 | model.backbone.depths = [2, 2, 18, 2]
7 | model.backbone.num_heads = [4, 8, 16, 32]
8 | model.backbone.window_size = 12
9 | model.backbone.embed_dim = 128
10 | model.backbone.pretrain_img_size = 384
11 |
12 | model.sem_seg_head.pixel_decoder.input_shape = {
13 | "p0": L(ShapeSpec)(channels=128, stride=4),
14 | "p1": L(ShapeSpec)(channels=256, stride=8),
15 | "p2": L(ShapeSpec)(channels=512, stride=16),
16 | "p3": L(ShapeSpec)(channels=1024, stride=32),
17 | }
18 |
19 | train.init_checkpoint = (
20 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window12_384.pth"
21 | )
22 |
--------------------------------------------------------------------------------
/coco_rem/configs/mask2former/maskformer2_swin_base_IN21k_384_bs16_50ep.py:
--------------------------------------------------------------------------------
1 | from .maskformer2_swin_base_384_bs16_50ep import dataloader, model, train
2 |
3 | train.init_checkpoint = (
4 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window12_384_22k.pth"
5 | )
6 |
--------------------------------------------------------------------------------
/coco_rem/configs/mask2former/maskformer2_swin_large_IN21k_384_bs16_100ep.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 |
4 | from .maskformer2_swin_base_384_bs16_50ep import dataloader, model, train
5 |
6 | model.num_queries = 200
7 | model.backbone.num_heads = [6, 12, 24, 48]
8 | model.backbone.embed_dim = 192
9 |
10 | model.sem_seg_head.pixel_decoder.input_shape = {
11 | "p0": L(ShapeSpec)(channels=192, stride=4),
12 | "p1": L(ShapeSpec)(channels=384, stride=8),
13 | "p2": L(ShapeSpec)(channels=768, stride=16),
14 | "p3": L(ShapeSpec)(channels=1536, stride=32),
15 | }
16 |
17 | train.max_iter *= 2
18 | train.init_checkpoint = (
19 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window12_384_22k.pth"
20 | )
21 |
--------------------------------------------------------------------------------
/coco_rem/configs/mask2former/maskformer2_swin_small_bs16_50ep.py:
--------------------------------------------------------------------------------
1 | from .maskformer2_swin_tiny_bs16_50ep import dataloader, model, train
2 |
3 | model.backbone.depths = [2, 2, 18, 2]
4 |
5 | train.init_checkpoint = (
6 | "detectron2://ImageNetPretrained/swin/swin_small_patch4_window7_224.pth"
7 | )
8 |
--------------------------------------------------------------------------------
/coco_rem/configs/mask2former/maskformer2_swin_tiny_bs16_50ep.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling import SwinTransformer
4 |
5 | from .maskformer2_R50_bs16_50ep import dataloader, model, train
6 |
7 | model.backbone = L(SwinTransformer)(
8 | depths=[2, 2, 6, 2],
9 | embed_dim=96,
10 | num_heads=[3, 6, 12, 24],
11 | drop_path_rate=0.3,
12 | )
13 |
14 | model.sem_seg_head.pixel_decoder.input_shape = {
15 | "p0": L(ShapeSpec)(channels=96, stride=4),
16 | "p1": L(ShapeSpec)(channels=192, stride=8),
17 | "p2": L(ShapeSpec)(channels=384, stride=16),
18 | "p3": L(ShapeSpec)(channels=768, stride=32),
19 | }
20 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["p1", "p2", "p3"]
21 |
22 | train.init_checkpoint = (
23 | "detectron2://ImageNetPretrained/swin/swin_tiny_patch4_window7_224.pth"
24 | )
25 |
--------------------------------------------------------------------------------
/coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_b_3x.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_mvitv2_t_3x import dataloader, model, train
2 |
3 | model.backbone.bottom_up.depth = 24
4 | model.backbone.bottom_up.last_block_indexes = (1, 4, 20, 23)
5 | model.backbone.bottom_up.drop_path_rate = 0.4
6 |
7 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in1k.pyth"
8 |
--------------------------------------------------------------------------------
/coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_b_in21k_3x.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_mvitv2_b_3x import dataloader, model, train
2 |
3 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in21k.pyth"
4 |
--------------------------------------------------------------------------------
/coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_h_in21k_lsj_3x.py:
--------------------------------------------------------------------------------
1 | from ..common.data.coco import dataloader
2 | from .cascade_mask_rcnn_mvitv2_b_3x import model, train
3 |
4 | model.backbone.bottom_up.embed_dim = 192
5 | model.backbone.bottom_up.depth = 80
6 | model.backbone.bottom_up.num_heads = 3
7 | model.backbone.bottom_up.last_block_indexes = (3, 11, 71, 79)
8 | model.backbone.bottom_up.drop_path_rate = 0.6
9 | model.backbone.bottom_up.use_act_checkpoint = True
10 |
11 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_H_in21k.pyth"
12 |
--------------------------------------------------------------------------------
/coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_s_3x.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_mvitv2_t_3x import dataloader, model, train
2 |
3 | model.backbone.bottom_up.depth = 16
4 | model.backbone.bottom_up.last_block_indexes = (0, 2, 13, 15)
5 |
6 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_S_in1k.pyth"
7 |
--------------------------------------------------------------------------------
/coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_t_3x.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling.box_regression import Box2BoxTransform
4 | from detectron2.modeling.matcher import Matcher
5 | from detectron2.modeling.roi_heads import (
6 | CascadeROIHeads,
7 | FastRCNNConvFCHead,
8 | FastRCNNOutputLayers,
9 | )
10 |
11 | from .mask_rcnn_mvitv2_t_3x import dataloader, model, train
12 |
13 | # arguments that don't exist for Cascade R-CNN
14 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]]
15 |
16 | model.roi_heads.update(
17 | _target_=CascadeROIHeads,
18 | box_heads=[
19 | L(FastRCNNConvFCHead)(
20 | input_shape=ShapeSpec(channels=256, height=7, width=7),
21 | conv_dims=[256, 256, 256, 256],
22 | fc_dims=[1024],
23 | conv_norm="SyncBN",
24 | )
25 | for _ in range(3)
26 | ],
27 | box_predictors=[
28 | L(FastRCNNOutputLayers)(
29 | input_shape=ShapeSpec(channels=1024),
30 | test_score_thresh=0.05,
31 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
32 | cls_agnostic_bbox_reg=True,
33 | num_classes="${...num_classes}",
34 | )
35 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
36 | ],
37 | proposal_matchers=[
38 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
39 | for th in [0.5, 0.6, 0.7]
40 | ],
41 | )
42 |
43 | model.roi_heads.mask_head.conv_norm = "SyncBN"
44 |
45 | # 2conv in RPN:
46 | # https://github.com/tensorflow/tpu/blob/b24729de804fdb751b06467d3dce0637fa652060/models/official/detection/modeling/architecture/heads.py#L95-L97 # noqa: E501, B950
47 | model.proposal_generator.head.conv_dims = [-1, -1]
48 |
--------------------------------------------------------------------------------
/coco_rem/configs/mvitv2/mask_rcnn_mvitv2_t_3x.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | import torch.nn as nn
4 | from detectron2.config import LazyCall as L
5 | from detectron2.modeling import MViT
6 |
7 | from ..common.data.coco import dataloader
8 | from ..common.data.constants import constants
9 | from ..common.models.mask_rcnn_fpn import model
10 | from ..common.train import train
11 |
12 | model.backbone.bottom_up = L(MViT)(
13 | embed_dim=96,
14 | depth=10,
15 | num_heads=1,
16 | last_block_indexes=(0, 2, 7, 9),
17 | residual_pooling=True,
18 | drop_path_rate=0.2,
19 | norm_layer=partial(nn.LayerNorm, eps=1e-6),
20 | out_features=("scale2", "scale3", "scale4", "scale5"),
21 | )
22 | model.backbone.in_features = "${.bottom_up.out_features}"
23 |
24 | # Initialization and trainer settings
25 | train.amp.enabled = True
26 | train.ddp.fp16_compression = True
27 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_T_in1k.pyth"
28 |
29 | # 36 epochs
30 | train.max_iter = 67500
31 |
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | from detectron2.config import LazyCall as L
4 | from detectron2.layers import ShapeSpec
5 | from detectron2.modeling import MViT
6 | from detectron2.modeling.box_regression import Box2BoxTransform
7 | from detectron2.modeling.matcher import Matcher
8 | from detectron2.modeling.roi_heads import (
9 | CascadeROIHeads,
10 | FastRCNNConvFCHead,
11 | FastRCNNOutputLayers,
12 | )
13 | from torch import nn
14 |
15 | from ..common.data.coco import dataloader
16 | from ..common.data.constants import constants
17 | from ..common.models.mask_rcnn_fpn import model
18 | from ..common.train import train
19 |
20 | model.backbone.bottom_up = L(MViT)(
21 | embed_dim=96,
22 | depth=24,
23 | num_heads=1,
24 | last_block_indexes=(1, 4, 20, 23),
25 | residual_pooling=True,
26 | drop_path_rate=0.4,
27 | norm_layer=partial(nn.LayerNorm, eps=1e-6),
28 | out_features=("scale2", "scale3", "scale4", "scale5"),
29 | )
30 | model.backbone.in_features = "${.bottom_up.out_features}"
31 | model.backbone.square_pad = 1024
32 |
33 | # New heads and LN
34 | model.backbone.norm = "LN" # Use LN in FPN
35 | model.roi_heads.box_head.conv_norm = model.roi_heads.mask_head.conv_norm = "LN"
36 |
37 | # 2conv in RPN:
38 | model.proposal_generator.head.conv_dims = [-1, -1]
39 |
40 | # arguments that don't exist for Cascade R-CNN
41 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]]
42 | model.roi_heads.update(
43 | _target_=CascadeROIHeads,
44 | box_heads=[
45 | L(FastRCNNConvFCHead)(
46 | input_shape=ShapeSpec(channels=256, height=7, width=7),
47 | conv_dims=[256, 256, 256, 256],
48 | fc_dims=[1024],
49 | conv_norm="LN",
50 | )
51 | for _ in range(3)
52 | ],
53 | box_predictors=[
54 | L(FastRCNNOutputLayers)(
55 | input_shape=ShapeSpec(channels=1024),
56 | test_score_thresh=0.05,
57 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
58 | cls_agnostic_bbox_reg=True,
59 | num_classes="${...num_classes}",
60 | )
61 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
62 | ],
63 | proposal_matchers=[
64 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
65 | for th in [0.5, 0.6, 0.7]
66 | ],
67 | )
68 |
69 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in21k.pyth"
70 |
71 | # Schedule
72 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep
73 | train.max_iter = 184375
74 |
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/cascade_mask_rcnn_mvitv2_h_in21k_36ep.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import dataloader, model, train
2 |
3 | model.backbone.bottom_up.embed_dim = 192
4 | model.backbone.bottom_up.depth = 80
5 | model.backbone.bottom_up.num_heads = 3
6 | model.backbone.bottom_up.last_block_indexes = (3, 11, 71, 79)
7 | model.backbone.bottom_up.drop_path_rate = 0.6
8 | model.backbone.bottom_up.use_act_checkpoint = True
9 |
10 |
11 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_H_in21k.pyth"
12 |
13 |
14 | # 36 epochs
15 | train.max_iter = 67500
16 |
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import dataloader, model, train
2 |
3 | model.backbone.bottom_up.embed_dim = 144
4 | model.backbone.bottom_up.depth = 48
5 | model.backbone.bottom_up.num_heads = 2
6 | model.backbone.bottom_up.last_block_indexes = (1, 7, 43, 47)
7 | model.backbone.bottom_up.drop_path_rate = 0.5
8 |
9 |
10 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_L_in21k.pyth"
11 |
12 | train.max_iter = train.max_iter // 2 # 100ep -> 50ep
13 |
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/cascade_mask_rcnn_swin_b_in21k_50ep.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.modeling import SwinTransformer
3 |
4 | from ..common.data.coco import dataloader
5 | from ..common.train import train
6 | from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import model
7 |
8 | model.backbone.bottom_up = L(SwinTransformer)(
9 | depths=[2, 2, 18, 2],
10 | drop_path_rate=0.4,
11 | embed_dim=128,
12 | num_heads=[4, 8, 16, 32],
13 | )
14 | model.backbone.in_features = ("p0", "p1", "p2", "p3")
15 | model.backbone.square_pad = 1024
16 |
17 | train.init_checkpoint = (
18 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window7_224_22k.pth"
19 | )
20 | # 50 ep = (184375 / 2) iters * 64 images/iter / 118000 images/ep
21 | train.max_iter = 184375 // 2
22 |
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/cascade_mask_rcnn_swin_l_in21k_50ep.py:
--------------------------------------------------------------------------------
1 | from .cascade_mask_rcnn_swin_b_in21k_50ep import dataloader, model, train
2 |
3 | model.backbone.bottom_up.depths = [2, 2, 18, 2]
4 | model.backbone.bottom_up.drop_path_rate = 0.4
5 | model.backbone.bottom_up.embed_dim = 192
6 | model.backbone.bottom_up.num_heads = [6, 12, 24, 48]
7 |
8 |
9 | train.init_checkpoint = (
10 | "detectron2://ImageNetPretrained/swin/swin_large_patch4_window7_224_22k.pth"
11 | )
12 |
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/cascade_mask_rcnn_vitdet_b_100ep.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling.box_regression import Box2BoxTransform
4 | from detectron2.modeling.matcher import Matcher
5 | from detectron2.modeling.roi_heads import (
6 | CascadeROIHeads,
7 | FastRCNNConvFCHead,
8 | FastRCNNOutputLayers,
9 | )
10 |
11 | from .mask_rcnn_vitdet_b_100ep import dataloader, lr_multiplier, model, optimizer, train
12 |
13 | # arguments that don't exist for Cascade R-CNN
14 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]]
15 |
16 | model.roi_heads.update(
17 | _target_=CascadeROIHeads,
18 | box_heads=[
19 | L(FastRCNNConvFCHead)(
20 | input_shape=ShapeSpec(channels=256, height=7, width=7),
21 | conv_dims=[256, 256, 256, 256],
22 | fc_dims=[1024],
23 | conv_norm="LN",
24 | )
25 | for _ in range(3)
26 | ],
27 | box_predictors=[
28 | L(FastRCNNOutputLayers)(
29 | input_shape=ShapeSpec(channels=1024),
30 | test_score_thresh=0.05,
31 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
32 | cls_agnostic_bbox_reg=True,
33 | num_classes="${...num_classes}",
34 | )
35 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
36 | ],
37 | proposal_matchers=[
38 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
39 | for th in [0.5, 0.6, 0.7]
40 | ],
41 | )
42 |
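43 | # Note: each cascade stage matches proposals at a higher IoU threshold
44 | # (0.5, 0.6, 0.7) and regresses boxes with progressively tighter weights,
45 | # following the Cascade R-CNN design.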
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/cascade_mask_rcnn_vitdet_h_75ep.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate
4 |
5 | from ..common.coco_schedule import lr_multiplier_75ep as lr_multiplier
6 | from .cascade_mask_rcnn_vitdet_b_100ep import dataloader, model, optimizer, train
7 |
8 | model.backbone.net.embed_dim = 1280
9 | model.backbone.net.depth = 32
10 | model.backbone.net.num_heads = 16
11 | model.backbone.net.drop_path_rate = 0.5
12 | # 7, 15, 23, 31 for global attention
13 | model.backbone.net.window_block_indexes = (
14 | list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31))
15 | )
16 |
17 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth?matching_heuristics=True"
18 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep
19 |
20 | optimizer.params.lr_factor_func = partial(
21 | get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32
22 | )
23 | optimizer.params.overrides = {}
24 | optimizer.params.weight_decay_norm = None
25 |
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/cascade_mask_rcnn_vitdet_l_100ep.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate
4 |
5 | from .cascade_mask_rcnn_vitdet_b_100ep import (
6 | dataloader,
7 | lr_multiplier,
8 | model,
9 | optimizer,
10 | train,
11 | )
12 |
13 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth?matching_heuristics=True"
14 |
15 | model.backbone.net.embed_dim = 1024
16 | model.backbone.net.depth = 24
17 | model.backbone.net.num_heads = 16
18 | model.backbone.net.drop_path_rate = 0.4
19 | # 5, 11, 17, 23 for global attention
20 | model.backbone.net.window_block_indexes = (
21 | list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23))
22 | )
23 |
24 | optimizer.params.lr_factor_func = partial(
25 | get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24
26 | )
27 |
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/mask_rcnn_vitdet_b_100ep.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate
4 |
5 | from ..common.coco_schedule import lr_multiplier_100ep as lr_multiplier
6 | from ..common.data.coco import dataloader
7 | from ..common.models.mask_rcnn_vitdet import model
8 | from ..common.optim import AdamW as optimizer
9 | from ..common.train import train
10 |
11 | # Initialization and trainer settings
12 | train.ddp.fp16_compression = True
13 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth?matching_heuristics=True"
14 |
15 |
16 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep
17 | train.max_iter = 184375
18 |
19 | # Layer-wise LR decay for ViT
20 | optimizer.params.lr_factor_func = partial(
21 | get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7
22 | )
23 | optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}}
24 |
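25 | # Rough intuition (hedged): `get_vit_lr_decay_rate` scales each ViT block's
26 | # learning rate geometrically with its depth, so with lr_decay_rate=0.7 the
27 | # earliest blocks (closest to the patch embedding) receive the smallest LR
28 | # multipliers and the MAE-pretrained low-level features change the least.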
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/mask_rcnn_vitdet_h_75ep.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate
4 |
5 | from ..common.coco_schedule import lr_multiplier_75ep as lr_multiplier
6 | from .mask_rcnn_vitdet_b_100ep import dataloader, model, optimizer, train
7 |
8 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth?matching_heuristics=True"
9 |
10 | model.backbone.net.embed_dim = 1280
11 | model.backbone.net.depth = 32
12 | model.backbone.net.num_heads = 16
13 | model.backbone.net.drop_path_rate = 0.5
14 | # 7, 15, 23, 31 for global attention
15 | model.backbone.net.window_block_indexes = (
16 | list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31))
17 | )
18 |
19 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep
20 |
21 | optimizer.params.lr_factor_func = partial(
22 | get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32
23 | )
24 | optimizer.params.overrides = {}
25 | optimizer.params.weight_decay_norm = None
26 |
--------------------------------------------------------------------------------
/coco_rem/configs/vitdet/mask_rcnn_vitdet_l_100ep.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate
4 |
5 | from .mask_rcnn_vitdet_b_100ep import dataloader, lr_multiplier, model, optimizer, train
6 |
7 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth?matching_heuristics=True"
8 |
9 | model.backbone.net.embed_dim = 1024
10 | model.backbone.net.depth = 24
11 | model.backbone.net.num_heads = 16
12 | model.backbone.net.drop_path_rate = 0.4
13 | # 5, 11, 17, 23 for global attention
14 | model.backbone.net.window_block_indexes = (
15 | list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23))
16 | )
17 |
18 | optimizer.params.lr_factor_func = partial(
19 | get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24
20 | )
21 |
--------------------------------------------------------------------------------
/coco_rem/data/builtin.py:
--------------------------------------------------------------------------------
1 | """
2 | Register COCO-ReM instances for training and evaluation.
3 | """
4 |
5 | import os
6 |
7 | from detectron2.data.datasets.coco import register_coco_instances
8 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
9 |
10 | _PREDEFINED_SPLITS_COCO_REM = {
11 | "coco_rem_train": ("coco/train2017", "coco_rem/instances_trainrem.json"),
12 | "coco_rem_val": ("coco/val2017", "coco_rem/instances_valrem.json"),
13 | }
14 |
15 |
16 | def register_all_coco_rem(root: str = "datasets"):
17 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_COCO_REM.items():
18 | # Assume pre-defined datasets live in `./datasets`.
19 | register_coco_instances(
20 | key,
21 | _get_builtin_metadata("coco"),
22 | os.path.join(root, json_file) if "://" not in json_file else json_file,
23 | os.path.join(root, image_root),
24 | )
25 |
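26 | # Minimal usage sketch (assumes the COCO-ReM JSONs are present under ./datasets):
27 | #
28 | #   from detectron2.data import DatasetCatalog
29 | #   register_all_coco_rem()
30 | #   dataset_dicts = DatasetCatalog.get("coco_rem_val")  # Detectron2-format dicts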
--------------------------------------------------------------------------------
/coco_rem/data/lvis.py:
--------------------------------------------------------------------------------
1 | from detectron2.data.datasets.lvis import (
2 | get_lvis_instances_meta,
3 | register_lvis_instances,
4 | )
5 |
6 | # This mapping is extracted from the official LVIS mapping:
7 | # https://github.com/lvis-dataset/lvis-api/blob/master/data/coco_to_synset.json
8 | COCO_CATEGORIES_IN_LVIS = [
9 | {"coco_id": 1, "lvis_id": 793, "synset": "person.n.01"},
10 | {"coco_id": 2, "lvis_id": 94, "synset": "bicycle.n.01"},
11 | {"coco_id": 3, "lvis_id": 207, "synset": "car.n.01"},
12 | {"coco_id": 4, "lvis_id": 703, "synset": "motorcycle.n.01"},
13 | {"coco_id": 5, "lvis_id": 3, "synset": "airplane.n.01"},
14 | {"coco_id": 6, "lvis_id": 173, "synset": "bus.n.01"},
15 | {"coco_id": 7, "lvis_id": 1115, "synset": "train.n.01"},
16 | {"coco_id": 8, "lvis_id": 1123, "synset": "truck.n.01"},
17 | {"coco_id": 9, "lvis_id": 118, "synset": "boat.n.01"},
18 | {"coco_id": 10, "lvis_id": 1112, "synset": "traffic_light.n.01"},
19 | {"coco_id": 11, "lvis_id": 445, "synset": "fireplug.n.01"},
20 | {"coco_id": 13, "lvis_id": 1019, "synset": "stop_sign.n.01"},
21 | {"coco_id": 14, "lvis_id": 766, "synset": "parking_meter.n.01"},
22 | {"coco_id": 15, "lvis_id": 90, "synset": "bench.n.01"},
23 | {"coco_id": 16, "lvis_id": 99, "synset": "bird.n.01"},
24 | {"coco_id": 17, "lvis_id": 225, "synset": "cat.n.01"},
25 | {"coco_id": 18, "lvis_id": 378, "synset": "dog.n.01"},
26 | {"coco_id": 19, "lvis_id": 569, "synset": "horse.n.01"},
27 | {"coco_id": 20, "lvis_id": 943, "synset": "sheep.n.01"},
28 | {"coco_id": 21, "lvis_id": 80, "synset": "beef.n.01"},
29 | {"coco_id": 22, "lvis_id": 422, "synset": "elephant.n.01"},
30 | {"coco_id": 23, "lvis_id": 76, "synset": "bear.n.01"},
31 | {"coco_id": 24, "lvis_id": 1202, "synset": "zebra.n.01"},
32 | {"coco_id": 25, "lvis_id": 496, "synset": "giraffe.n.01"},
33 | {"coco_id": 27, "lvis_id": 34, "synset": "backpack.n.01"},
34 | {"coco_id": 28, "lvis_id": 1133, "synset": "umbrella.n.01"},
35 | {"coco_id": 31, "lvis_id": 35, "synset": "bag.n.04"},
36 | {"coco_id": 32, "lvis_id": 716, "synset": "necktie.n.01"},
37 | {"coco_id": 33, "lvis_id": 36, "synset": "bag.n.06"},
38 | {"coco_id": 34, "lvis_id": 474, "synset": "frisbee.n.01"},
39 | {"coco_id": 35, "lvis_id": 964, "synset": "ski.n.01"},
40 | {"coco_id": 36, "lvis_id": 976, "synset": "snowboard.n.01"},
41 | {"coco_id": 37, "lvis_id": 41, "synset": "ball.n.06"},
42 | {"coco_id": 38, "lvis_id": 611, "synset": "kite.n.03"},
43 | {"coco_id": 39, "lvis_id": 58, "synset": "baseball_bat.n.01"},
44 | {"coco_id": 40, "lvis_id": 60, "synset": "baseball_glove.n.01"},
45 | {"coco_id": 41, "lvis_id": 962, "synset": "skateboard.n.01"},
46 | {"coco_id": 42, "lvis_id": 1037, "synset": "surfboard.n.01"},
47 | {"coco_id": 43, "lvis_id": 1079, "synset": "tennis_racket.n.01"},
48 | {"coco_id": 44, "lvis_id": 133, "synset": "bottle.n.01"},
49 | {"coco_id": 46, "lvis_id": 1190, "synset": "wineglass.n.01"},
50 | {"coco_id": 47, "lvis_id": 344, "synset": "cup.n.01"},
51 | {"coco_id": 48, "lvis_id": 469, "synset": "fork.n.01"},
52 | {"coco_id": 49, "lvis_id": 615, "synset": "knife.n.01"},
53 | {"coco_id": 50, "lvis_id": 1000, "synset": "spoon.n.01"},
54 | {"coco_id": 51, "lvis_id": 139, "synset": "bowl.n.03"},
55 | {"coco_id": 52, "lvis_id": 45, "synset": "banana.n.02"},
56 | {"coco_id": 53, "lvis_id": 12, "synset": "apple.n.01"},
57 | {"coco_id": 54, "lvis_id": 912, "synset": "sandwich.n.01"},
58 | {"coco_id": 55, "lvis_id": 735, "synset": "orange.n.01"},
59 | {"coco_id": 56, "lvis_id": 154, "synset": "broccoli.n.01"},
60 | {"coco_id": 57, "lvis_id": 217, "synset": "carrot.n.01"},
61 | {"coco_id": 59, "lvis_id": 816, "synset": "pizza.n.01"},
62 | {"coco_id": 60, "lvis_id": 387, "synset": "doughnut.n.02"},
63 | {"coco_id": 61, "lvis_id": 183, "synset": "cake.n.03"},
64 | {"coco_id": 62, "lvis_id": 232, "synset": "chair.n.01"},
65 | {"coco_id": 63, "lvis_id": 982, "synset": "sofa.n.01"},
66 | {"coco_id": 64, "lvis_id": 837, "synset": "pot.n.04"},
67 | {"coco_id": 65, "lvis_id": 77, "synset": "bed.n.01"},
68 | {"coco_id": 67, "lvis_id": 367, "synset": "dining_table.n.01"},
69 | {"coco_id": 70, "lvis_id": 1097, "synset": "toilet.n.02"},
70 | {"coco_id": 72, "lvis_id": 1077, "synset": "television_receiver.n.01"},
71 | {"coco_id": 73, "lvis_id": 631, "synset": "laptop.n.01"},
72 | {"coco_id": 74, "lvis_id": 705, "synset": "mouse.n.04"},
73 | {"coco_id": 75, "lvis_id": 881, "synset": "remote_control.n.01"},
74 | {"coco_id": 76, "lvis_id": 296, "synset": "computer_keyboard.n.01"},
75 | {"coco_id": 77, "lvis_id": 230, "synset": "cellular_telephone.n.01"},
76 | {"coco_id": 78, "lvis_id": 687, "synset": "microwave.n.02"},
77 | {"coco_id": 79, "lvis_id": 739, "synset": "oven.n.01"},
78 | {"coco_id": 80, "lvis_id": 1095, "synset": "toaster.n.02"},
79 | {"coco_id": 81, "lvis_id": 961, "synset": "sink.n.01"},
80 | {"coco_id": 82, "lvis_id": 421, "synset": "electric_refrigerator.n.01"},
81 | {"coco_id": 84, "lvis_id": 127, "synset": "book.n.01"},
82 | {"coco_id": 85, "lvis_id": 271, "synset": "clock.n.01"},
83 | {"coco_id": 86, "lvis_id": 1139, "synset": "vase.n.01"},
84 | {"coco_id": 87, "lvis_id": 923, "synset": "scissors.n.01"},
85 | {"coco_id": 88, "lvis_id": 1071, "synset": "teddy.n.01"},
86 | {"coco_id": 89, "lvis_id": 534, "synset": "hand_blower.n.01"},
87 | {"coco_id": 90, "lvis_id": 1102, "synset": "toothbrush.n.01"},
88 | ]
89 |
90 |
91 | def register_cocofied_lvis():
92 | # COCO-fied LVIS v1 val - instances for COCO classes, masks from LVIS.
93 | register_lvis_instances(
94 | "lvis_v1_val_cocofied",
95 | get_lvis_instances_meta("lvis_v1_val_cocofied"),
96 | json_file="datasets/lvis/lvis_v1_val_cocofied.json",
97 | image_root="datasets/coco/",
98 | )
99 |
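100 | # Illustrative helper (hypothetical, not used above): build a COCO -> LVIS
101 | # category id lookup from the mapping table, e.g. COCO "person" (1) -> LVIS 793.
102 | #
103 | #   COCO_TO_LVIS_ID = {c["coco_id"]: c["lvis_id"] for c in COCO_CATEGORIES_IN_LVIS}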
--------------------------------------------------------------------------------
/coco_rem/mask_visualizer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import itertools
4 | from typing import Callable, Optional
5 |
6 | import cv2
7 | import matplotlib.colors as mplc
8 | import numpy as np
9 | import pycocotools.mask as mask_util
10 | import torch
11 | from detectron2.utils.visualizer import VisImage
12 | from torch.nn.functional import max_pool2d
13 |
14 | # Nice colors, taken from the `colorblind` palette of the `seaborn` library with
15 | # very minor modifications for aesthetics.
16 | NICE_COLORS = [
17 | (0.0039, 0.4509, 0.6980), # blue
18 | (0.8905, 0.5607, 0.0196), # orange
19 | (0.0078, 0.6196, 0.4509), # green
20 | (0.9400, 0.2200, 0.1000), # red
21 | (0.6500, 0.3500, 0.9000), # purple
22 | (0.6980, 1.0000, 0.3500), # lime green
23 | (0.5019, 0.8705, 0.9176), # cyan
24 | (0.7921, 0.5686, 0.3803), # brown
25 | (0.9843, 0.6862, 0.8941), # pink
26 | (0.9254, 0.8823, 0.2001), # gold
27 | ]
28 |
29 |
30 | def binarize_mask(mask_or_polygons, height: int, width: int):
31 | """
32 | Convert input masks of any format to a binary mask (np.uint8 array with 1
33 | as foreground and 0 as background).
34 | """
35 | m = mask_or_polygons
36 | if isinstance(m, dict):
37 | # RLEs
38 | assert "counts" in m and "size" in m
39 | if isinstance(m["counts"], list): # uncompressed RLEs
40 | h, w = m["size"]
41 | assert h == height and w == width
42 | m = mask_util.frPyObjects(m, h, w)
43 | mask = mask_util.decode(m)[:, :]
44 |
45 | if isinstance(m, list): # list[ndarray]
46 | m = mask_util.frPyObjects(m, height, width)
47 | m = mask_util.merge(m)
48 | mask = mask_util.decode(m)[:, :]
49 |
50 | if isinstance(m, np.ndarray): # assumed to be a binary mask
51 | assert m.shape[1] != 2, m.shape
52 | assert m.shape == (
53 | height,
54 | width,
55 | ), f"mask shape: {m.shape}, target dims: {height}, {width}"
56 | mask = m.astype("uint8")
57 |
58 | return mask
59 |
60 |
61 | def _create_text_labels(classes, class_names, is_crowd=None):
62 | labels = None
63 | if classes is not None:
64 | if class_names is not None and len(class_names) > 0:
65 | labels = [class_names[i] for i in classes]
66 | else:
67 | labels = [str(i) for i in classes]
68 |
69 | if labels is not None and is_crowd is not None:
70 | labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
71 | return labels
72 |
73 |
74 | class MaskVisualizer:
75 | """Visualizer for labeled masks of COCO-format instance annotations."""
76 |
77 | def __init__(self, img_rgb: np.ndarray, class_names: list[str] | None = None):
78 | """
79 | Args:
80 | img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
81 | the height and width of the image respectively. C is the number of
82 | color channels. The image is required to be in RGB format since that
83 | is a requirement of the Matplotlib library. The image is also expected
84 | to be in the range [0, 255].
85 | class_names: List of names to associate with object class IDs of masks.
86 | """
87 | self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
88 | self.class_names = class_names
89 | self.output = VisImage(self.img)
90 | self.cpu_device = torch.device("cpu")
91 |
92 |         # Too-small text is useless, so clamp the font size to at least 12.
93 | self._default_font_size = max(
94 | np.sqrt(self.output.height * self.output.width) // 90, 12
95 | )
96 |
97 | def draw_dataset_dict(
98 | self,
99 | dic,
100 | draw_labels: bool = True,
101 | label_suffix_formatter: Optional[Callable] = None,
102 | ):
103 | """
104 | Draw annotations/segmentations in Detectron2 Dataset format.
105 |
106 | Args:
107 | dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
108 |
109 | Returns:
110 | output (VisImage): image object with visualizations.
111 | """
112 | annos = dic.get("annotations", None)
113 | if annos:
114 | if "segmentation" in annos[0]:
115 | masks = [x["segmentation"] for x in annos]
116 | else:
117 | masks = None
118 |
119 | if draw_labels:
120 | category_ids = [x["category_id"] for x in annos]
121 | labels = _create_text_labels(
122 | category_ids,
123 | class_names=self.class_names,
124 | is_crowd=[x.get("iscrowd", 0) for x in annos],
125 | )
126 |
127 | if label_suffix_formatter is not None:
128 | labels = label_suffix_formatter(dic, labels)
129 | else:
130 | labels = None
131 |
132 | self.overlay_instances(labels=labels, masks=masks)
133 |
134 | return self.output
135 |
136 | def overlay_instances(self, labels=None, masks=None, alpha=0.7):
137 | """
138 | Args:
139 | labels (list[str]): the text to be displayed for each instance.
140 | masks (masks-like object): Supported types are:
141 |
142 | * :class:`detectron2.structures.PolygonMasks`,
143 | :class:`detectron2.structures.BitMasks`.
144 | * list[list[ndarray]]: contains the segmentation masks for all objects in one image.
145 | The first level of the list corresponds to individual instances. The second
146 |                 level to all the polygons that compose the instance, and the third level
147 | to the polygon coordinates. The third level should have the format of
148 | [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
149 | * list[ndarray]: each ndarray is a binary mask of shape (H, W).
150 | * list[dict]: each dict is a COCO-style RLE.
151 |
152 | Returns:
153 | output (VisImage): image object with visualizations.
154 | """
155 | num_instances = 0
156 | if masks is not None:
157 | masks = [
158 | binarize_mask(x, self.output.height, self.output.width) for x in masks
159 | ]
160 | if num_instances:
161 | assert len(masks) == num_instances
162 | else:
163 | num_instances = len(masks)
164 |
165 | if labels is not None:
166 | assert len(labels) == num_instances
167 |
168 | assigned_colors = list(
169 | itertools.islice(itertools.cycle(NICE_COLORS), num_instances)
170 | )
171 |
172 | if num_instances == 0:
173 | return self.output
174 |
175 | # Display in largest to smallest order to reduce occlusion.
176 | areas = np.asarray([x.sum() for x in masks])
177 |
178 | sorted_idxs = np.argsort(-areas).tolist()
179 | # Re-order overlapped instances in descending order.
180 | labels = [labels[k] for k in sorted_idxs] if labels is not None else None
181 | masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
182 | assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
183 |
184 | for i in range(num_instances):
185 | color = assigned_colors[i]
186 | text = labels[i] if labels is not None else ""
187 | self.draw_binary_mask(masks[i], color, text=text, alpha=alpha)
188 |
189 | return self.output
190 |
191 | def draw_text(self, text: str, x: float, y: float) -> VisImage:
192 | # fmt: off
193 | self.output.ax.text(
194 | x, y, text, size=self._default_font_size, family="sans-serif",
195 | bbox={"facecolor": "white", "alpha": 1.0, "pad": 1.0, "edgecolor": "none"},
196 | verticalalignment="top", horizontalalignment="center",
197 | color="black", zorder=10,
198 | )
199 | # fmt: on
200 | return self.output
201 |
202 | def draw_binary_mask(self, binary_mask, color, text=None, alpha=0.7):
203 | """
204 | Args:
205 | binary_mask: numpy array of shape (H, W), where H is the image height
206 | and W is the image width. Each value in the array is either a 0
207 | or 1 value of uint8 type.
208 | color: color of the mask. Refer to `matplotlib.colors` for a full list
209 | of formats that are accepted. If None, will pick a random color.
210 | text: A string to draw on the object.
211 | alpha: blending co-efficient. Smaller values => more transparent masks.
212 |
213 | Returns:
214 | output (VisImage): image object with mask drawn.
215 | """
216 | color = mplc.to_rgb(color)
217 |
218 | mask = binary_mask.astype("uint8") # opencv needs uint8
219 | shape2d = (binary_mask.shape[0], binary_mask.shape[1])
220 |
221 | # TODO: Use Path/PathPatch to draw vector graphics:
222 | # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
223 | rgba = np.zeros(shape2d + (4,), dtype="float32")
224 | rgba[:, :, :3] = color
225 | rgba[:, :, 3] = (mask == 1).astype("float32") * alpha
226 | self.output.ax.imshow(
227 | rgba, extent=(0, self.output.width, self.output.height, 0)
228 | )
229 |
230 | # Find mask boundary using dilation, then visualize as a black border.
231 | mask_tensor = torch.from_numpy(mask).float().unsqueeze(0)
232 | dilated = max_pool2d(mask_tensor, kernel_size=3, stride=1, padding=1)
233 | boundary = (dilated - mask_tensor)[0].numpy()
234 | boundary_rgba = np.zeros(shape2d + (4,), dtype="float32")
235 | boundary_rgba[:, :, 3] = boundary
236 | self.output.ax.imshow(
237 | boundary_rgba, extent=(0, self.output.width, self.output.height, 0)
238 | )
239 |
240 | if text is not None:
241 |             # TODO: text is sometimes drawn on the wrong object; these heuristics could improve.
242 | _num_cc, cc_labels, stats, _ = cv2.connectedComponentsWithStats(
243 | binary_mask, 8
244 | )
245 | if stats[1:, -1].size == 0:
246 | return
247 | largest_component_id = np.argmax(stats[1:, -1]) + 1
248 |
249 | # draw text on the largest component, as well as other large components.
250 | for cid in range(1, _num_cc):
251 | if cid == largest_component_id or stats[cid, -1] > 100000:
252 | center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
253 | self.draw_text(text, *center)
254 |
255 | return self.output
256 |
257 | def get_output(self):
258 | return self.output
259 |
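260 | # Minimal usage sketch (illustrative; `image_rgb` is an HxWx3 uint8 array and
261 | # `dataset_dict` is one image record in Detectron2 Dataset format):
262 | #
263 | #   visualizer = MaskVisualizer(image_rgb, class_names=["person", "car"])
264 | #   vis_image = visualizer.draw_dataset_dict(dataset_dict, draw_labels=True)
265 | #   vis_image.save("masks_overlay.jpg")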
--------------------------------------------------------------------------------
/coco_rem/modeling/convnext.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from functools import partial
4 |
5 | import torch
6 | from detectron2.layers.batch_norm import LayerNorm as LayerNorm2d
7 | from detectron2.modeling.backbone import Backbone
8 | from timm.models.layers import DropPath, trunc_normal_
9 | from torch import nn
10 |
11 |
12 | class Block(nn.Module):
13 | def __init__(self, dim, drop_path=0.0, layer_scale_init_value=1e-6):
14 | super().__init__()
15 | self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
16 | self.norm = nn.LayerNorm(dim, eps=1e-6)
17 |
18 | self.pwconv1 = nn.Linear(dim, 4 * dim)
19 | self.act = nn.GELU()
20 |
21 | self.pwconv2 = nn.Linear(4 * dim, dim)
22 | self.gamma = (
23 | nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
24 | if layer_scale_init_value > 0
25 | else None
26 | )
27 | self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
28 |
29 | def forward(self, x):
30 | input = x
31 | x = self.dwconv(x)
32 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
33 | x = self.norm(x)
34 | x = self.pwconv1(x)
35 | x = self.act(x)
36 | x = self.pwconv2(x)
37 | if self.gamma is not None:
38 | x = self.gamma * x
39 | x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
40 |
41 | x = input + self.drop_path(x)
42 | return x
43 |
44 |
45 | class ConvNeXt(Backbone):
46 | """
47 | A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/abs/2201.03545
48 | """
49 |
50 | def __init__(
51 | self,
52 | in_chans: int = 3,
53 | depths: list[int] = [3, 3, 9, 3],
54 | dims: list[int] = [96, 192, 384, 768],
55 | drop_path_rate: float = 0.0,
56 | layer_scale_init_value: float = 1e-6,
57 | out_features: list[str] | None = None,
58 | ):
59 | """
60 | Args:
61 | in_chans: Number of input image channels.
62 | depths: Number of blocks at each stage.
63 | dims: Feature dimension at each stage.
64 | drop_path_rate: Stochastic depth rate.
65 | layer_scale_init_value: Init value for Layer Scale.
66 |             out_features: Names of the backbone feature maps (e.g. "res2") to output.
67 | """
68 | super().__init__()
69 |
70 | # stem and 3 intermediate downsampling conv layers
71 | self.downsample_layers = nn.ModuleList()
72 | stem = nn.Sequential(
73 | nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
74 | LayerNorm2d(dims[0], eps=1e-6),
75 | )
76 |
77 | self.downsample_layers.append(stem)
78 | for i in range(3):
79 | downsample_layer = nn.Sequential(
80 | LayerNorm2d(dims[i], eps=1e-6),
81 | nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
82 | )
83 | self.downsample_layers.append(downsample_layer)
84 |
85 | self.num_layers = len(depths)
86 | num_features = [int(dims[i] * 2**i) for i in range(self.num_layers)]
87 | self.num_features = num_features
88 | self._out_features = out_features
89 |
90 | self._out_feature_strides = {}
91 | self._out_feature_channels = {}
92 |
93 | # 4 feature resolution stages, each consisting of multiple residual blocks
94 | self.stages = nn.ModuleList()
95 | dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
96 | cur = 0
97 | strides = [4, 4, 4, 4]
98 | for i in range(4):
99 | stage = nn.Sequential(
100 | *[
101 | Block(
102 | dim=dims[i],
103 | drop_path=dp_rates[cur + j],
104 | layer_scale_init_value=layer_scale_init_value,
105 | )
106 | for j in range(depths[i])
107 | ]
108 | )
109 | self.stages.append(stage)
110 | cur += depths[i]
111 |
112 | self._out_feature_channels[f"res{i + 2}"] = dims[i]
113 | self._out_feature_strides[f"res{i + 2}"] = strides[i] * 2**i
114 |
115 | norm_layer = partial(LayerNorm2d, eps=1e-6)
116 | for i_layer in range(4):
117 | layer = norm_layer(dims[i_layer])
118 | layer_name = f"norm{i_layer}"
119 | self.add_module(layer_name, layer)
120 |
121 | self.apply(self._init_weights)
122 |
123 | def _init_weights(self, m):
124 | if isinstance(m, (nn.Conv2d, nn.Linear)):
125 | trunc_normal_(m.weight, std=0.02)
126 | nn.init.constant_(m.bias, 0)
127 |
128 | def init_weights(self, pretrained=None):
129 | """Initialize the weights in backbone.
130 | Args:
131 | pretrained (str, optional): Path to pre-trained weights.
132 | Defaults to None.
133 | """
134 |
135 | def _init_weights(m):
136 | if isinstance(m, nn.Linear):
137 | trunc_normal_(m.weight, std=0.02)
138 | if isinstance(m, nn.Linear) and m.bias is not None:
139 | nn.init.constant_(m.bias, 0)
140 | elif isinstance(m, nn.LayerNorm) or isinstance(m, LayerNorm2d):
141 | nn.init.constant_(m.bias, 0)
142 | nn.init.constant_(m.weight, 1.0)
143 |
144 | self.apply(_init_weights)
145 |
146 | def forward_features(self, x):
147 | outs = {}
148 | for i in range(4):
149 | x = self.downsample_layers[i](x)
150 | x = self.stages[i](x)
151 |
152 | if f"res{i + 2}" in self._out_features:
153 | norm_layer = getattr(self, f"norm{i}")
154 | x_out = norm_layer(x)
155 | out = x_out.contiguous()
156 | outs[f"res{i + 2}"] = out
157 |
158 | return outs
159 |
160 | def forward(self, x):
161 | x = self.forward_features(x)
162 | return x
163 |
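164 | # Quick shape check (illustrative): a ConvNeXt-T backbone returning all four
165 | # feature maps for a 224x224 input.
166 | #
167 | #   model = ConvNeXt(out_features=["res2", "res3", "res4", "res5"])
168 | #   feats = model(torch.zeros(1, 3, 224, 224))
169 | #   # feats["res2"]: (1, 96, 56, 56), ..., feats["res5"]: (1, 768, 7, 7)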
--------------------------------------------------------------------------------
/coco_rem/modeling/rcnn_refiner.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from __future__ import annotations
3 |
4 | import torch
5 |
6 | from detectron2.modeling.meta_arch.rcnn import GeneralizedRCNN
7 |
8 |
9 | class GeneralizedRCNNRefiner(GeneralizedRCNN):
10 | """
11 | An extension of R-CNN that produces masks conditioned on box prompts. This
12 | model skips the region proposal network and box ROI head, running only the
13 | mask head by cropping ROI features using input boxes.
14 | """
15 |
16 | def forward(self, batched_inputs: list[dict[str, torch.Tensor]]):
17 | assert not self.training, "`GeneralizedRCNNRefiner` only supports inference!"
18 |
19 | # Prepare `detected_instances: list[Instances]` for `inference()` method
20 | # to get mask predictions for ground-truth boxes.
21 | detected_instances = [x.pop("instances") for x in batched_inputs]
22 | for x in detected_instances:
23 | x.pred_classes = x.gt_classes
24 | x.pred_boxes = x.gt_boxes
25 | x.scores = torch.ones_like(x.pred_classes).float()
26 |
27 | return self.inference(batched_inputs, detected_instances)
28 |
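29 | # Usage sketch (illustrative): `refiner` is an instantiated GeneralizedRCNNRefiner
30 | # in eval mode, and `gt_instances` is a detectron2 `Instances` object carrying
31 | # `gt_boxes` and `gt_classes` for one image.
32 | #
33 | #   outputs = refiner([{"image": image_tensor, "instances": gt_instances,
34 | #                       "height": height, "width": width}])
35 | #   refined_masks = outputs[0]["instances"].pred_masks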
--------------------------------------------------------------------------------
/coco_rem/modeling/sam_refiner.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from __future__ import annotations
7 |
8 | import einops as E
9 | import torch
10 | from segment_anything import sam_model_registry
11 | from segment_anything.utils import amg
12 | from torch import nn
13 |
14 |
15 | class SamRefiner(nn.Module):
16 | """
17 |     SamRefiner: An extension of SAM that refines (low-quality) input masks by
18 |     iteratively prompting SAM with boxes and points.
19 | """
20 |
21 | def __init__(
22 | self,
23 | arch: str,
24 | checkpoint: str,
25 | num_extra_points: int = 2,
26 | num_trials: int = 10,
27 | box_only_ids: list[int] = [],
28 | min_mask_region_area: int = 100,
29 | ):
30 | """
31 | Args:
32 | arch: SAM image encoder architecture (vit_b, vit_l, vit_h).
33 | checkpoint: Path to .pth file containing pre-trained SAM weights.
34 | num_extra_points: Number of extra points to iteratively prompt SAM
35 | with, after the initial box prompt. Points are sampled from the
36 | error region (bitwise XOR) between SAM prediction and ground-truth.
37 | num_trials: Number of refinement trials per instance mask, to improve
38 | the overall mask quality by ensembling.
39 |         box_only_ids: Category IDs for which only box prompts will be used.
40 |         min_mask_region_area: If >0, postprocessing will be applied to remove
41 |             islands and holes in masks with area smaller than this value.
42 |             However, masks smaller than `10 * min_mask_region_area` will remain
43 |             unchanged, to avoid removing useful details in tiny masks.
44 | """
45 | super().__init__()
46 |
47 | # Initialize SAM, freeze parameters, and transfer them here.
48 | _sam = sam_model_registry[arch](checkpoint)
49 | for param in _sam.parameters():
50 | param.requires_grad = False
51 |
52 | self.image_encoder = _sam.image_encoder
53 | self.prompt_encoder = _sam.prompt_encoder
54 | self.mask_decoder = _sam.mask_decoder
55 | self.img_size = _sam.image_encoder.img_size # 1024 pixels
56 |
57 | self.register_buffer("pixel_mean", _sam.pixel_mean)
58 | self.register_buffer("pixel_std", _sam.pixel_std)
59 |
60 | self.num_extra_points = num_extra_points
61 | self.num_trials = num_trials
62 | self.box_only_ids = box_only_ids
63 | self.min_mask_region_area = min_mask_region_area
64 |
65 | @torch.no_grad()
66 | def forward(
67 | self,
68 | image: torch.Tensor,
69 | masks: torch.Tensor,
70 | category_ids: list[int],
71 | original_size: tuple[int, int],
72 | ) -> torch.Tensor:
73 | """
74 | Regenerate an input mask by iteratively prompting points to SAM, same as
75 |         the training procedure of SAM. This is done for multiple trials, and the
76 |         resulting masks are combined by averaging and thresholding to reduce variance.
77 | """
78 |
79 | # Normalize pixel values and pad to a square input.
80 | input_size = image.shape[-2:]
81 | image = (image[None, ...] - self.pixel_mean) / self.pixel_std
82 | padh = self.img_size - image.shape[-2]
83 | padw = self.img_size - image.shape[-1]
84 | image = nn.functional.pad(image, (0, padw, 0, padh))
85 |
86 | image_embeddings = self.image_encoder(image)
87 | all_masks = masks # Rename for convenience.
88 |
89 | all_refined_masks = []
90 | for src_mask, category_id in zip(all_masks, category_ids):
91 | xp = 0 if category_id in self.box_only_ids else self.num_extra_points
92 |
93 | # Repeat a single mask `num_trials` times to perform refinement trials
94 | # within the same batch.
95 | src_mask = E.repeat(src_mask, "h w -> n h w", n=self.num_trials)
96 |
97 | box_prompt = self._get_box_prompt(src_mask)
98 |
99 | # Iteratively prompt SAM with points sampled from error regions of
100 |             # predicted masks. This is the same as SAM's training procedure. The first
101 | # iteration will only use a box prompt.
102 | point_prompts, mask_prompt = None, None
103 |
104 | for _ in range(xp + 1):
105 | # Pass all prompts: points, initial box, logits from prev step.
106 | sparse_embeddings, dense_embeddings = self.prompt_encoder(
107 | point_prompts, box_prompt, mask_prompt
108 | )
109 |
110 | low_res_masks, _ = self.mask_decoder(
111 | image_embeddings=image_embeddings,
112 | image_pe=self.prompt_encoder.get_dense_pe(),
113 | sparse_prompt_embeddings=sparse_embeddings,
114 | dense_prompt_embeddings=dense_embeddings,
115 | multimask_output=False,
116 | )
117 | refined_masks = nn.functional.interpolate(
118 | low_res_masks,
119 | (self.img_size, self.img_size),
120 | mode="bilinear",
121 | align_corners=False,
122 | )
123 | refined_masks = refined_masks[..., : input_size[0], : input_size[1]]
124 |
125 | # Use source mask if SAM returned empty mask (happens for tiny boxes).
126 | if (refined_masks > 0).sum() == 0:
127 | refined_masks = src_mask[:, None, ...].float()
128 |
129 | # Update point prompts and mask prompt for next iteration.
130 | point_prompts = sample_point_from_error_region(
131 | src_mask, refined_masks[:, 0], point_prompts
132 | )
133 | mask_prompt = low_res_masks
134 |
135 |         # Resize the refined masks to the original size, then ensemble the trials
136 |         # by thresholding at zero and taking a majority vote.
137 | refined_masks = nn.functional.interpolate(
138 | refined_masks, original_size, mode="bilinear", align_corners=False
139 | )
140 | refined_masks = (refined_masks > 0).float()
141 | refined_mask = E.reduce(refined_masks, "n 1 h w -> h w", "mean")
142 | refined_mask = refined_mask > 0.5
143 |
144 | # Remove spurious islands/holes for large enough masks.
145 | _area = self.min_mask_region_area
146 | if _area > 0 and refined_mask.sum() > 10 * _area:
147 | _mask = refined_mask.cpu().numpy()
148 |
149 | _mask, _ = amg.remove_small_regions(_mask, _area, mode="holes")
150 | _mask, _ = amg.remove_small_regions(_mask, _area, mode="islands")
151 | refined_mask = torch.from_numpy(_mask).to(refined_mask.device)
152 |
153 | all_refined_masks.append(refined_mask)
154 |
155 | all_refined_masks = torch.stack(all_refined_masks)
156 | return all_refined_masks
157 |
158 | def _get_box_prompt(self, mask: torch.Tensor):
159 | """
160 | Make a box prompt for SAM: the bounding box of the mask, expanded using
161 | random noise, same as in SAM's training procedure.
162 | 
163 | Noise values are drawn from Gaussian distributions with zero mean and a
164 | standard deviation of 10% of the box edge size, capped at 10 pixels.
165 | """
166 | box_prompt = amg.batched_mask_to_box(mask.bool()).float()
167 |
168 | box_w = box_prompt[:, 2] - box_prompt[:, 0]
169 | box_h = box_prompt[:, 3] - box_prompt[:, 1]
170 | noise_std = torch.stack([box_w, box_h, box_w, box_h], dim=1)
171 | noise_std = torch.clamp(noise_std * 0.1, max=10.0)
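# Illustrative example: for a 200 x 100 pixel box, the per-coordinate noise stds
# are min(0.1 * [200, 100, 200, 100], 10) = [10, 10, 10, 10] pixels.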
172 | noise_mean = torch.zeros_like(box_prompt)
173 |
174 | random_noise = torch.normal(noise_mean, noise_std)
175 |
176 | box_prompt[:, :2] = box_prompt[:, :2] - random_noise[:, :2].abs()
177 | box_prompt[:, 2:] = box_prompt[:, 2:] + random_noise[:, 2:].abs()
178 | box_prompt = box_prompt.clamp(min=0.0, max=self.img_size - 1)
179 | return box_prompt
180 |
181 |
182 | def sample_point_from_error_region(
183 | reference_masks: torch.Tensor,
184 | predicted_masks: torch.Tensor | None = None,
185 | previous_prompts: tuple[torch.Tensor, torch.Tensor] | None = None,
186 | ) -> tuple[torch.Tensor, torch.Tensor]:
187 | """
188 | Sample random points from the error regions between reference masks
189 | (e.g. ground-truth) and masks predicted by SAM. Newly sampled points are
190 | labeled foreground (1) or background (0) depending on the pixel value in the
191 | reference mask. This function simulates the interactive segmentation setup used
192 | to train SAM, as described in the Segment Anything paper.
193 |
194 | Args:
195 | reference_masks: Batch of masks as a tensor of shape `(B, H, W)` containing
196 | pixel values in `{1, 0}` or `{True, False}` denoting foreground region.
197 | predicted_masks: Batch of masks predicted by SAM having same shape as the
198 | reference masks. This tensor may have real-valued logits, which will
199 | be internally binarized by thresholding at 0.
200 | previous_prompts: Optional tuple of `(point_coords, point_labels)` giving
201 | point prompts to SAM used in previous interactive iterations.
202 |
203 |     Returns:
204 |         next_prompts: Tuple of `(point_coords, point_labels)` with newly sampled
205 |             point coordinates and labels appended to `previous_prompts`.
206 | """
207 | # If predicted masks are not provided, assume that SAM predicted an empty mask.
208 | # This lets us sample a random point from anywhere inside the reference masks.
209 | if predicted_masks is None:
210 | predicted_masks = torch.zeros_like(reference_masks)
211 |
212 | points, point_labels = [], []
213 | for ref_mask, pr_mask in zip(reference_masks, predicted_masks):
214 | # Sample from the error region between given masks.
215 | error_region = torch.logical_xor(ref_mask > 0, pr_mask > 0)
216 | yx_choices = error_region.nonzero()
217 |
218 | # If there is no error region, sample from anywhere in GT mask.
219 | if len(yx_choices) == 0:
220 | yx_choices = ref_mask.nonzero()
221 |
222 | if len(yx_choices) == 0:
223 | yx_choices = torch.zeros((1, 2), device=ref_mask.device).long()
224 |
225 | idx = torch.randint(len(yx_choices), size=(1,)).item()
226 | point_xy = yx_choices[idx, [1, 0]]
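# The label is the reference-mask value at the sampled point: 1 asks SAM to add a
# missing foreground region, 0 asks SAM to carve away a wrongly predicted region.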
227 | point_label = ref_mask[point_xy[1], point_xy[0]]
228 |
229 | points.append(point_xy)
230 | point_labels.append(point_label)
231 |
232 | points = E.rearrange(torch.stack(points), "b xy -> b 1 xy")
233 | point_labels = E.rearrange(torch.stack(point_labels).long(), "b -> b 1")
234 |
235 | # Append currently sampled points to previous prompts.
236 | if previous_prompts is not None:
237 | previous_points, previous_labels = previous_prompts
238 | points = torch.cat([previous_points, points], dim=1)
239 | point_labels = torch.cat([previous_labels, point_labels], dim=1)
240 |
241 | return (points, point_labels)
242 |
--------------------------------------------------------------------------------
/coco_rem/trainer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from __future__ import annotations
3 |
4 | import time
5 | from contextlib import nullcontext
6 |
7 | import torch
8 | from detectron2.engine import SimpleTrainer
9 | from detectron2.utils.events import get_event_storage
10 | from torch.cuda.amp import GradScaler, autocast
11 | from torch.nn.parallel import DistributedDataParallel
12 |
13 |
14 | class AMPWithGradAccumTrainer(SimpleTrainer):
15 | """
16 | Like :class:`SimpleTrainer`, but uses PyTorch's native automatic mixed precision
17 | in the training loop and accumulates gradients over `N` batches per optimizer step.
18 | """
19 |
20 | def __init__(
21 | self,
22 | model,
23 | data_loader,
24 | optimizer,
25 | gather_metric_period: int = 1,
26 | grad_scaler: GradScaler | None = None,
27 | precision: torch.dtype = torch.float16,
28 | log_grad_scaler: bool = False,
29 | grad_accum_steps: int = 1,
30 | ):
31 | """
32 | Args:
33 | model, data_loader, optimizer, gather_metric_period:
34 | same as in :class:`SimpleTrainer`.
35 | grad_scaler: torch GradScaler to automatically scale gradients.
36 | precision: torch.dtype as the target precision to cast to in computations.
37 | grad_accum_steps: Number of gradient accumulation steps.
38 | """
39 | unsupported = (
40 | "AMPTrainer does not support single-process multi-device training!"
41 | )
42 | if isinstance(model, DistributedDataParallel):
43 | assert not (model.device_ids and len(model.device_ids) > 1), unsupported
44 |
45 | super().__init__(model, data_loader, optimizer, gather_metric_period)
46 |
47 | if grad_scaler is None:
48 | grad_scaler = GradScaler()
49 | self.grad_scaler = grad_scaler
50 | self.precision = precision
51 | self.log_grad_scaler = log_grad_scaler
52 |
53 | assert grad_accum_steps >= 1, "grad_accum_steps must be >= 1."
54 | self.grad_accum_steps = grad_accum_steps
55 | self.grad_sync_manager = _GradAccumSyncManager(model, grad_accum_steps)
56 |
57 | def run_step(self):
58 | """
59 | Implement the AMP training logic along with gradient accumulation.
60 | """
61 | assert self.model.training, "[AMPTrainer] model was changed to eval mode!"
62 |
63 | start = time.perf_counter()
64 | self.optimizer.zero_grad()
65 |
66 | # Record data loading time for all batches during gradient accumulation.
67 | total_data_time = 0.0
68 | prev_data_time = start
69 |
70 | for _ in range(self.grad_accum_steps):
71 | # Load batch and accumulate total time to load all batches throughout
72 | # all steps of gradient accumulation.
73 | data = next(self._data_loader_iter)
74 | current_data_time = time.perf_counter()
75 | total_data_time += current_data_time - prev_data_time
76 | prev_data_time = current_data_time
77 |
78 | with self.grad_sync_manager, autocast(dtype=self.precision):
79 | loss_dict = self.model(data)
80 | if isinstance(loss_dict, torch.Tensor):
81 | losses = loss_dict
82 | loss_dict = {"total_loss": loss_dict}
83 | else:
84 | losses = sum(loss_dict.values())
85 |
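# Dividing the loss by `grad_accum_steps` makes the gradients summed over the
# accumulation window match those of a single large batch with an averaged loss.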
86 | normalized_losses = losses / self.grad_accum_steps
87 | self.grad_scaler.scale(normalized_losses).backward()
88 |
89 | if self.log_grad_scaler:
90 | storage = get_event_storage()
91 | storage.put_scalar("[metric]grad_scaler", self.grad_scaler.get_scale())
92 |
93 | self.after_backward()
94 |
95 | if self.async_write_metrics:
96 | # write metrics asynchronically
97 | self.concurrent_executor.submit(
98 | self._write_metrics, loss_dict, total_data_time, iter=self.iter
99 | )
100 | else:
101 | self._write_metrics(loss_dict, total_data_time)
102 |
103 | self.grad_scaler.step(self.optimizer)
104 | self.grad_scaler.update()
105 |
106 | def state_dict(self):
107 | ret = super().state_dict()
108 | ret["grad_scaler"] = self.grad_scaler.state_dict()
109 | return ret
110 |
111 | def load_state_dict(self, state_dict):
112 | super().load_state_dict(state_dict)
113 | self.grad_scaler.load_state_dict(state_dict["grad_scaler"])
114 |
115 |
116 | class _GradAccumSyncManager:
117 | """
118 | Distributed training with gradient accumulation can cause huge slowdowns if
119 | gradient synchronization is not done properly. This context manager handles it:
120 | when using DDP and accumulating over `N` steps, gradients are not averaged
121 | across processes for the first `N - 1` steps. This context manager behaves as
122 | a no-op (`nullcontext`) when any of these conditions are true:
123 |
124 | - Training with single GPU or CPU only (`model` is not DDP object)
125 | - DDP with static graph (see https://github.com/pytorch/pytorch/issues/80832)
126 | - No gradient accumulation across multiple steps (`num_steps = 1`)
127 | """
128 |
129 | def __init__(self, model, num_steps: int):
130 | """
131 | Args:
132 | model: PyTorch module that is being trained with gradient accumulation.
133 | num_steps: Number of batches processed to accumulate gradients.
134 | """
135 | self.num_steps = num_steps
136 | self.step = 0
137 | self._model = model
138 | self._active = nullcontext()
139 | 
140 | # Gradient sync is only skipped when the model is wrapped in DDP without a
141 | # static graph (see the linked PyTorch issue above).
142 | self._use_no_sync = (
143 | isinstance(model, DistributedDataParallel) and not model.static_graph
144 | )
145 | 
146 | def __enter__(self):
147 | # `DistributedDataParallel.no_sync()` is a single-use context manager, so a
148 | # fresh one is created on every step and entered here. Gradients are only
149 | # synchronized on the last of every `num_steps` consecutive steps.
150 | if self._use_no_sync and self.step < self.num_steps - 1:
151 | self._active = self._model.no_sync()
152 | else:
153 | self._active = nullcontext()
154 | return self._active.__enter__()
155 | 
156 | def __exit__(self, *args, **kwargs):
157 | self._active.__exit__(*args, **kwargs)
158 | self.step = (self.step + 1) % self.num_steps
159 | 
--------------------------------------------------------------------------------
/images/coco_rem_example_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kdexd/coco-rem/73e38364a787b34cbcd846739f196f066430279b/images/coco_rem_example_1.jpg
--------------------------------------------------------------------------------
/images/coco_rem_example_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kdexd/coco-rem/73e38364a787b34cbcd846739f196f066430279b/images/coco_rem_example_2.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fvcore==0.1.5.post20221221
2 | hydra-core>=1.1
3 | numpy==1.24.1
4 | omegaconf>=2.1
5 | pycocotools>=2.0
6 | einops>=0.6
7 | wget>=3.0
8 |
--------------------------------------------------------------------------------
/scripts/correct_labeling_errors.py:
--------------------------------------------------------------------------------
1 | """
2 | Add and remove a few instances from a given COCO JSON, and mark a few other
3 | instances as 'crowd' objects because their masks cover multiple instances.
4 | """
5 |
6 | import argparse
7 | import json
8 |
9 | import torch
10 | from pycocotools import mask as mask_utils
11 | from segment_anything.utils import amg
12 |
13 | import coco_rem.data.manual_rem as inv
14 |
15 |
16 | parser = argparse.ArgumentParser(description=__doc__)
17 | parser.add_argument(
18 | "--input",
19 | default="datasets/coco_rem/instances_valrem_interim.json",
20 | help="COCO-ReM JSON file to apply all manual corrections.",
21 | )
22 | parser.add_argument(
23 | "--output", required=True, help="Path to save output annotations JSON."
24 | )
25 |
26 |
27 | def main(_A: argparse.Namespace):
28 | coco_json = json.load(open(_A.input))
29 | print(f"Number of instances in input JSON: {len(coco_json['annotations'])}")
30 |
31 | # ------------------------------------------------------------------------
32 | # Step 1: Remove some instances.
33 | remove_info_tuples = [
34 | (x["image_id"], x["source"], x["source_id"]) for x in inv.INSTANCES_TO_REMOVE
35 | ]
36 | coco_json["annotations"] = [
37 | x
38 | for x in coco_json["annotations"]
39 | if (x["image_id"], x["source"], x["source_id"]) not in remove_info_tuples
40 | ]
41 |
42 | num_instances = len(coco_json["annotations"])
43 | print(f"Removed few instances, updated JSON has {num_instances} instances.")
44 |
45 | # ------------------------------------------------------------------------
46 | # Step 2: Set 'iscrowd = 1' for a few instances.
47 | crowd_info_tuples = [
48 | (x["image_id"], x["source"], x["source_id"]) for x in inv.INSTANCES_TO_CROWD
49 | ]
50 | for ann in coco_json["annotations"]:
51 | if (ann["image_id"], ann["source"], ann["source_id"]) in crowd_info_tuples:
52 | ann["iscrowd"] = 1
53 |
54 | print(f"Set 'iscrowd = 1' for {len(inv.INSTANCES_TO_CROWD)} instances.")
55 |
56 | # ------------------------------------------------------------------------
57 | # Step 3: Add some instances.
58 | for idx, ann in enumerate(inv.INSTANCES_TO_ADD):
59 | # Convert compressed RLE to mask.
60 | binary_mask = mask_utils.decode(ann["segmentation"])
61 | binary_mask = torch.from_numpy(binary_mask)
62 |
63 | # Convert torch tensor to uncompressed RLE.
64 | ann["segmentation"] = amg.mask_to_rle_pytorch(binary_mask[None, ...])[0]
65 |
66 | # Fill other attributes for the annotation - a unique ID, source, bbox,
67 | # area, and `iscrowd = 0`.
68 | bbox_xyxy = amg.batched_mask_to_box(binary_mask[None, ...])[0]
69 |
70 | # Convert bounding box from XYXY to XYWH format.
71 | x1, y1, x2, y2 = bbox_xyxy.tolist()
72 | ann["bbox"] = [x1, y1, x2 - x1 + 1, y2 - y1 + 1]
73 |
74 | ann["area"] = amg.area_from_rle(ann["segmentation"])
75 | ann["id"] = 2024000000 + idx
76 | ann["source_id"] = ann["id"]
77 | ann["source"] = "manual"
78 | ann["iscrowd"] = 0
79 |
80 | coco_json["annotations"].append(ann)
81 |
82 | num_instances = len(coco_json["annotations"])
83 | print(f"Added few instances, updated JSON has {num_instances} instances.")
84 |
85 | json.dump(coco_json, open(_A.output, "w"))
86 | print(f"Saved the updated annotations JSON at {_A.output}!")
87 |
88 |
89 | if __name__ == "__main__":
90 | args = parser.parse_args()
91 | main(args)
92 |
--------------------------------------------------------------------------------
/scripts/merge_instances.py:
--------------------------------------------------------------------------------
1 | """
2 | Merge LVIS instance annotations for COCO categories (a.k.a "COCO-fied LVIS") into
3 | the original COCO instance annotations.
4 |
5 | COCO instance annotations are inconsistent, sometimes combining multiple objects
6 | into one mask. LVIS offers a much stronger guarantee that instances are labeled
7 | individually and exhaustively. For any `(image, category)` pair, if COCO-fied LVIS
8 | has instance annotations, then they replace the corresponding COCO annotations.
9 | """
10 |
11 | from __future__ import annotations
12 |
13 | import argparse
14 | import copy
15 | import json
16 | from collections import defaultdict
17 |
18 | from coco_rem.data.lvis import COCO_CATEGORIES_IN_LVIS
19 |
20 | parser = argparse.ArgumentParser(description=__doc__)
21 | _AA = parser.add_argument
22 | _AA("--coco-json", help="Path to COCO annotations JSON file.")
23 | _AA(
24 | "--lvis-json",
25 | nargs=2,
26 | help="Paths to LVIS train and val JSON files.",
27 | default=["datasets/lvis/lvis_v1_train.json", "datasets/lvis/lvis_v1_val.json"],
28 | )
29 | _AA("--split", choices=["train", "val"], help="Which dataset split to pre-process?")
30 | _AA("--output", required=True, help="Path to save the output annotations JSON.")
31 |
32 |
33 | def make_cocofied_lvis(lvis_json_paths: list[str], split: str):
34 | """
35 | Load LVIS instance annotations and filter them to keep instance annotations
36 | of the COCO categories for all images belonging to a COCO split (train/val).
37 | Category IDs in the output JSON are the same as COCO category IDs.
38 | """
39 |
40 | lvis_images, lvis_annos = [], []
41 | for _path in lvis_json_paths:
42 | lvis_json = json.load(open(_path))
43 | lvis_images.extend(lvis_json.pop("images"))
44 | lvis_annos.extend(lvis_json.pop("annotations"))
45 |
46 | # LVIS train/val splits are different from COCO's (but in total, they cover the
47 | # same set of images). So we load both train and val annotations, then retain
48 | # images and their instances of the desired COCO split.
49 | keep_ids = set([x["id"] for x in lvis_images if split in x["coco_url"]])
50 | lvis_images = [x for x in lvis_images if x["id"] in keep_ids]
51 | lvis_annos = [x for x in lvis_annos if x["image_id"] in keep_ids]
52 |
53 | # Replace the category ID in instance annotation (LVIS -> COCO), and remove
54 | # LVIS instances that do not represent COCO categories.
55 | lvis_to_coco_id = {x["lvis_id"]: x["coco_id"] for x in COCO_CATEGORIES_IN_LVIS}
56 | lvis_annos = [x for x in lvis_annos if x["category_id"] in lvis_to_coco_id]
57 | for ann in lvis_annos:
58 | ann["category_id"] = lvis_to_coco_id[ann["category_id"]]
59 |
60 | # Replace category IDs in the "negative categories" list per image, like above.
61 | for image in lvis_images:
62 | for key in ["not_exhaustive_category_ids", "neg_category_ids"]:
63 | image[key] = [x for x in image[key] if x in lvis_to_coco_id]
64 | image[key] = [lvis_to_coco_id[x] for x in image[key]]
65 |
66 | # Transfer metadata from original LVIS json to COCOfied LVIS json.
67 | cocofied_lvis = copy.deepcopy(lvis_json)
68 | cocofied_lvis["images"] = lvis_images
69 | cocofied_lvis["annotations"] = lvis_annos
70 |
71 | # Update category IDs of LVIS categories.
72 | cocofied_lvis["categories"] = [
73 | x for x in cocofied_lvis["categories"] if x["id"] in lvis_to_coco_id
74 | ]
75 | for ann in cocofied_lvis["categories"]:
76 | ann["id"] = lvis_to_coco_id[ann["id"]]
77 |
78 | print(f"COCO-fied LVIS stats for COCO {split} split:")
79 | print(f" - Number of images = {len(lvis_images)}")
80 | print(f" - Number of annotations = {len(lvis_annos)}")
81 |
82 | return cocofied_lvis
83 |
84 |
85 | def main(_A: argparse.Namespace):
86 | coco_json = json.load(open(_A.coco_json))
87 | lvis_json = make_cocofied_lvis(_A.lvis_json, _A.split)
88 |
89 | # Make a mapping from `(image_id, category_id) -> list[instances]` for both,
90 | # COCO and LVIS.
91 | coco_instances_dict = defaultdict(list)
92 | for ann in coco_json["annotations"]:
93 | # Mark the source of every annotation before merging.
94 | ann["source"] = "coco"
95 | ann["source_id"] = ann["id"]
96 | coco_instances_dict[(ann["image_id"], ann["category_id"])].append(ann)
97 |
98 | lvis_instances_dict = defaultdict(list)
99 | for ann in lvis_json["annotations"]:
100 | ann["source"] = "lvis"
101 | ann["source_id"] = ann["id"]
102 | lvis_instances_dict[(ann["image_id"], ann["category_id"])].append(ann)
103 |
104 | # ------------------------------------------------------------------------
105 | # For val set, remove all COCO-fied LVIS annotations for `(image, category)`
106 | # pair if instances are not annotated exhaustively.
107 | if _A.split == "val":
108 | _remove = [
109 | (image_info["id"], category_id)
110 | for image_info in lvis_json["images"]
111 | for category_id in image_info["not_exhaustive_category_ids"]
112 | ]
113 | lvis_instances_dict = {
114 | k: v for k, v in lvis_instances_dict.items() if k not in _remove
115 | }
116 |
117 | # ------------------------------------------------------------------------
118 | # If `(image, category)` tuple has more LVIS instances than COCO instances
119 | # then all COCO instances will be replaced by LVIS instances.
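# For example, if an image has 2 "person" instances in COCO but 5 in COCO-fied
# LVIS, all 5 LVIS instances are kept and the 2 COCO instances are dropped.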
120 | merged_annotations = []
121 | for (image_id, category_id), anns_in_coco in coco_instances_dict.items():
122 | anns_in_lvis = lvis_instances_dict.get((image_id, category_id), [])
123 |
124 | if len(anns_in_lvis) > len(anns_in_coco):
125 | merged_annotations.extend(anns_in_lvis)
126 | else:
127 | merged_annotations.extend(anns_in_coco)
128 |
129 | # Some `(image, category)` instances of LVIS are completely absent in COCO.
130 | # Add all of these while merging.
131 | for (image_id, category_id), anns_in_lvis in lvis_instances_dict.items():
132 | if (image_id, category_id) not in coco_instances_dict:
133 | merged_annotations.extend(anns_in_lvis)
134 |
135 | coco_json["annotations"] = merged_annotations
136 |
137 | # Re-assign annotation IDs after merging.
138 | image_id_to_anns_coco = defaultdict(list)
139 | for ann in coco_json["annotations"]:
140 | image_id_to_anns_coco[ann["image_id"]].append(ann)
141 |
142 | for image_id, anns_in_coco in image_id_to_anns_coco.items():
143 | for idx, ann in enumerate(anns_in_coco):
144 | ann["id"] = image_id * 1000 + idx
145 |
146 | # ------------------------------------------------------------------------
147 | # Calculate number of annotations sourced from COCO/LVIS.
148 | num_coco_src = len([x for x in merged_annotations if x["source"] == "coco"])
149 | num_lvis_src = len([x for x in merged_annotations if x["source"] == "lvis"])
150 |
151 | print(f"Final COCO {_A.split} split statistics after merging:")
152 | print(f" - Number of images = {len(coco_json['images'])}")
153 | print(f" - Number of annotations = {len(coco_json['annotations'])}")
154 | print(f" - Annotations from COCO = {num_coco_src}")
155 | print(f" - Annotations from LVIS = {num_lvis_src}")
156 |
157 | json.dump(coco_json, open(_A.output, "w"))
158 | print(f"Saved the merged annotations JSON to {_A.output}")
159 |
160 |
161 | if __name__ == "__main__":
162 | _A = parser.parse_args()
163 |
164 | # Log all command-line arguments.
165 | print("Running with arguments:")
166 | for key, value in vars(_A).items():
167 | print(f"{key:<10}: {value}")
168 |
169 | main(_A)
170 |
--------------------------------------------------------------------------------
/scripts/refine_boundaries.py:
--------------------------------------------------------------------------------
1 | """
2 | Refine mask boundaries of input COCO JSON to obtain COCO-ReM. Refinement is done
3 | using the `SamRefiner` module in this package.
4 | """
5 |
6 | from __future__ import annotations
7 |
8 | import argparse
9 | import json
10 | import os
11 | from collections import defaultdict
12 |
13 | import torch
14 | from detectron2 import engine
15 | from detectron2.data.detection_utils import read_image
16 | from detectron2.data.transforms import ResizeShortestEdge
17 | from detectron2.structures import polygons_to_bitmask
18 | from detectron2.utils import comm
19 | from segment_anything.utils import amg
20 | from tqdm import tqdm
21 |
22 | from coco_rem.modeling.sam_refiner import SamRefiner
23 |
24 | # Add the documentation of `SamRefiner` to this script's documentation, so that
25 | # argparse can display it with `--help`.
26 | __doc__ += f"\n\n{SamRefiner.__doc__}\n{SamRefiner.__init__.__doc__}"
27 |
28 | # fmt: off
29 | parser = argparse.ArgumentParser(
30 | description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
31 | )
32 | _AA = parser.add_argument
33 | _AA("--input-json", required=True, help="Path to COCO annotations JSON.")
34 | _AA("--image-dir", default="datasets/coco/val2017", help="COCO image directory.")
35 | _AA("--num-gpus", type=int, default=0, help="Number of GPUs for parallelization.")
36 | _AA("--output", required=True, help="Path to save output annotations JSON.")
37 |
38 | group = parser.add_argument_group("Input arguments to `SamRefiner`.")
39 | group.add_argument("--arch", default="vit_h", choices=["vit_b", "vit_l", "vit_h"])
40 | group.add_argument("--checkpoint", default="checkpoints/sam_vit_h_4b8939.pth")
41 | group.add_argument("--num-extra-points", type=int, default=2)
42 | group.add_argument("--num-trials", type=int, default=10)
43 | group.add_argument(
44 | "--box-only-names", nargs="+",
45 | default=["bed", "bicycle", "bowl", "dining table", "motorcycle", "scissors"],
46 | help="COCO category names for which we only use box prompts.",
47 | )
48 | # fmt: on
49 |
50 |
51 | def main(_A: argparse.Namespace):
52 | device = torch.device("cpu")
53 | if torch.cuda.is_available():
54 | device = torch.cuda.current_device()
55 |
56 | # ------------------------------------------------------------------------
57 | coco_json = json.load(open(_A.input_json))
58 |
59 | # Make a mapping between image ID and all instance annotations.
60 | image_id_annotations = defaultdict(list)
61 | for ann in coco_json["annotations"]:
62 | image_id_annotations[ann["image_id"]].append(ann)
63 |
64 | image_id_annotations = list(image_id_annotations.items())
65 |
66 | # Shard the dataset so each GPU only refines masks for a subset of images.
67 | WORLD_SIZE = comm.get_world_size()
68 | RANK = comm.get_rank()
69 |
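# Strided slicing assigns every WORLD_SIZE-th image to this rank; for example,
# with 4 GPUs, rank 1 processes the images at indices 1, 5, 9, and so on.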
70 | image_id_annotations = image_id_annotations[RANK::WORLD_SIZE]
71 | print(f"GPU {RANK}/{WORLD_SIZE} will process {len(image_id_annotations)} images.")
72 |
73 | # Get a list of category IDs for which only box prompts will be used.
74 | cat_id_map = {x["name"]: x["id"] for x in coco_json["categories"]}
75 | box_only_ids = [cat_id_map[x] for x in _A.box_only_names]
76 |
77 | # ------------------------------------------------------------------------
78 | # Instantiate the model and input transform (resize the longest side to 1024 pixels).
79 | refiner = SamRefiner(
80 | _A.arch, _A.checkpoint, _A.num_extra_points, _A.num_trials, box_only_ids
81 | )
82 | refiner = refiner.eval().to(device)
83 |
84 | preprocess = ResizeShortestEdge(refiner.img_size, max_size=refiner.img_size)
85 |
86 | # ------------------------------------------------------------------------
87 | for image_id, annotations in tqdm(image_id_annotations, "Refining masks"):
88 | image_path = os.path.join(_A.image_dir, f"{image_id:0>12d}.jpg")
89 | image = read_image(image_path, "RGB")
90 | original_hw = image.shape[:2]
91 |
92 | # Pre-process image and masks.
93 | transform = preprocess.get_transform(image)
94 | image = transform.apply_image(image)
95 |
96 | # Get image height/width before and after applying resize transform.
97 | resized_hw = image.shape[:2]
98 |
99 | # Convert image to a CHW format tensor (RGB values in 0-255).
100 | image = torch.as_tensor(image, device=device)
101 | image = image.permute(2, 0, 1).contiguous()
102 |
103 | # Make batches of source masks (NHW bool tensor).
104 | source_masks = [ann["segmentation"] for ann in annotations]
105 | for idx, segm in enumerate(source_masks):
106 | if isinstance(segm, list):
107 | # Polygons.
108 | polygons = [torch.as_tensor(p).view(-1, 2) for p in segm]
109 | polygons = [p.view(-1) for p in transform.apply_polygons(polygons)]
110 | segm = polygons_to_bitmask(polygons, *resized_hw)
111 | elif isinstance(segm, dict):
112 | # RLE.
113 | segm = amg.rle_to_mask(segm).astype("uint8")
114 | segm = transform.apply_segmentation(segm)
115 |
116 | source_masks[idx] = torch.as_tensor(segm).bool()
117 |
118 | source_masks = torch.stack(source_masks).to(device)
119 | # --------------------------------------------------------------------
120 |
121 | category_ids = [ann["category_id"] for ann in annotations]
122 | refined_masks = refiner(image, source_masks, category_ids, original_hw)
123 |
124 | # Get tight boxes enclosing refined masks, then convert masks to RLE.
125 | refined_boxes_xyxy = amg.batched_mask_to_box(refined_masks)
126 | refined_masks = amg.mask_to_rle_pytorch(refined_masks)
127 |
128 | # Replace the source masks with refined masks in COCO annotations.
129 | # NOTE: Keep "crowd" annotations unchanged as they don't participate in
130 | # the calculation of COCO AP.
131 | for idx, ann in enumerate(annotations):
132 | if ann.get("iscrowd", 0) != 1:
133 | ann["segmentation"] = refined_masks[idx]
134 | ann["area"] = amg.area_from_rle(refined_masks[idx])
135 |
136 | # Recompute box enclosing the refined mask.
137 | x1, y1, x2, y2 = refined_boxes_xyxy[idx].tolist()
138 | ann["bbox"] = [x1, y1, x2 - x1 + 1, y2 - y1 + 1]
139 |
140 | # ------------------------------------------------------------------------
141 |
142 | # Gather the refined annotations from all GPU processes onto the main process.
143 | all_refined_annotations = []
144 | for _, annotations in image_id_annotations:
145 | all_refined_annotations.extend(annotations)
146 |
147 | all_refined_annotations = comm.gather(all_refined_annotations, dst=0)
148 |
149 | # In main process, replace annotations in COCO JSON and save to output.
150 | if comm.is_main_process():
151 | coco_json["annotations"] = []
152 | for ann_list in all_refined_annotations:
153 | coco_json["annotations"].extend(ann_list)
154 |
155 | os.makedirs(os.path.dirname(_A.output) or ".", exist_ok=True)
156 | json.dump(coco_json, open(_A.output, "w"))
157 | print(f"Saved annotations JSON with refined masks to {_A.output}")
158 |
159 | comm.synchronize()
160 | print(f"GPU {RANK}/{WORLD_SIZE}: Refinement complete!")
161 |
162 |
163 | if __name__ == "__main__":
164 | _A = parser.parse_args()
165 |
166 | print("Running with arguments:")
167 | for key, value in vars(_A).items():
168 | print(f"{key:<30}: {value}")
169 |
170 | engine.launch(main, num_gpus_per_machine=_A.num_gpus, dist_url="auto", args=(_A,))
171 |
--------------------------------------------------------------------------------
/scripts/train_net.py:
--------------------------------------------------------------------------------
1 | """
2 | Train or evaluate a model using a Detectron2-style lazy config.
3 | """
4 |
5 | from __future__ import annotations
6 |
7 | import argparse
8 | import json
9 | import logging
10 | import warnings
11 |
12 | import torch
13 | from detectron2 import engine
14 | from detectron2.checkpoint import DetectionCheckpointer
15 | from detectron2.config import LazyConfig, instantiate
16 | from detectron2.engine import hooks
17 | from detectron2.engine.defaults import create_ddp_model
18 | from detectron2.evaluation import inference_on_dataset, print_csv_format
19 | from detectron2.evaluation.testing import flatten_results_dict
20 | from detectron2.utils import comm
21 |
22 | from coco_rem.data.builtin import register_all_coco_rem
23 | from coco_rem.trainer import AMPWithGradAccumTrainer
24 |
25 | warnings.filterwarnings("ignore")
26 | logger = logging.getLogger("detectron2")
27 |
28 |
29 | parser = engine.default_argument_parser(__doc__)
30 | _AA = parser.add_argument
31 | _AA("--checkpoint-period", type=int, default=5000, help="Checkpoint saving period.")
32 | _AA("--log-period", type=int, default=10, help="Log training progress periodically.")
33 |
34 |
35 | def do_test(_C, model):
36 | data_loader = instantiate(_C.dataloader.test)
37 | evaluator = instantiate(_C.dataloader.evaluator)
38 |
39 | results = inference_on_dataset(model, data_loader, evaluator)
40 | print_csv_format(results)
41 | return results
42 |
43 |
44 | def main(_A: argparse.Namespace):
45 | # Register COCO-ReM dataset splits before starting the training job.
46 | register_all_coco_rem()
47 |
48 | _C = LazyConfig.load(_A.config_file)
49 | _C = LazyConfig.apply_overrides(_C, _A.opts)
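# Overrides are dotlist-style "key=value" strings from the command line, e.g.
# `train.max_iter=90000` or `dataloader.train.total_batch_size=32` (values shown
# here are only illustrative).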
50 |
51 | engine.default_setup(_C, _A)
52 |
53 | device = torch.cuda.current_device() if _A.num_gpus != 0 else torch.device("cpu")
54 |
55 | model = instantiate(_C.model).to(device)
56 | logger.info("Model:\n{}".format(model))
57 |
58 | model = create_ddp_model(model)
59 | DetectionCheckpointer(model).load(_C.train.get("init_checkpoint", None))
60 |
61 | if _A.eval_only:
62 | results = do_test(_C, model)
63 | if comm.is_main_process():
64 | results = flatten_results_dict(results)
65 | json.dump(results, open(f"{_C.train.output_dir}/eval_results.json", "w"))
66 | return
67 |
68 | train_loader = instantiate(_C.dataloader.train)
69 |
70 | _C.optimizer.params.model = model
71 | optim = instantiate(_C.optimizer)
72 |
73 | # `SimpleTrainer` does not accept `grad_accum_steps`, so it is passed only with AMP.
74 | trainer_cls = AMPWithGradAccumTrainer if _C.train.amp else engine.SimpleTrainer
75 | extra = {"grad_accum_steps": _C.train.get("grad_accum_steps", 1)} if _C.train.amp else {}
76 | trainer = trainer_cls(model, train_loader, optim, **extra)
77 | checkpointer = DetectionCheckpointer(model, _C.train.output_dir, trainer=trainer)
78 |
79 | trainer.register_hooks(
80 | [
81 | hooks.IterationTimer(),
82 | hooks.LRScheduler(scheduler=instantiate(_C.lr_multiplier)),
83 | hooks.PeriodicCheckpointer(checkpointer, _A.checkpoint_period)
84 | if comm.is_main_process()
85 | else None,
86 | hooks.EvalHook(_A.checkpoint_period, lambda: do_test(_C, model)),
87 | hooks.PeriodicWriter(
88 | engine.default_writers(_C.train.output_dir, _C.train.max_iter),
89 | period=_A.log_period,
90 | )
91 | if comm.is_main_process()
92 | else None,
93 | ]
94 | )
95 |
96 | checkpointer.resume_or_load(_C.train.init_checkpoint, resume=_A.resume)
97 | if _A.resume and checkpointer.has_checkpoint():
98 | # The checkpoint stores the training iteration that just finished, thus we start
99 | # at the next iteration
100 | start_iter = trainer.iter + 1
101 | else:
102 | start_iter = 0
103 | trainer.train(start_iter, _C.train.max_iter)
104 |
105 |
106 | if __name__ == "__main__":
107 | _A = parser.parse_args()
108 | engine.launch(
109 | main,
110 | num_gpus_per_machine=_A.num_gpus,
111 | num_machines=_A.num_machines,
112 | machine_rank=_A.machine_rank,
113 | dist_url=_A.dist_url,
114 | args=(_A,),
115 | )
116 |
--------------------------------------------------------------------------------
/scripts/visualize_coco.py:
--------------------------------------------------------------------------------
1 | """
2 | Visualize instances from a COCO annotations JSON (COCO-2017 or COCO-ReM).
3 | """
4 |
5 | import argparse
6 | import logging
7 | import os
8 |
9 | import numpy as np
10 | from detectron2.data import DatasetCatalog, MetadataCatalog
11 | from detectron2.data import detection_utils as utils
12 | from detectron2.data.datasets import load_coco_json
13 | from tqdm import tqdm
14 |
15 | from coco_rem.mask_visualizer import MaskVisualizer
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 | # fmt: off
20 | parser = argparse.ArgumentParser(description=__doc__)
21 | _AA = parser.add_argument
22 | _AA(
23 | "--input-json", default="datasets/coco/annotations/instances_val2017.json",
24 | help="Path to JSON file containing COCO annotations."
25 | )
26 | _AA(
27 | "--image-dir", default="datasets/coco/val2017",
28 | help="Path to directory containing COCO images.",
29 | )
30 | _AA("--draw-labels", action="store_true", help="Whether to draw labels on masks.")
31 | _AA("--class-name", help="If provided, visualize masks of this class only.")
32 |
33 | _AA("--output", default="./viz", help="Path to output (saving) dir.")
34 | _AA("--filename-suffix", help="Add a suffix to saved image file name.")
35 | # fmt: on
36 |
37 |
38 | def add_id_to_labels(dic, labels):
39 | labels = [f"{lbl} ({x['id']})" for lbl, x in zip(labels, dic["annotations"])]
40 | return labels
41 |
42 |
43 | if __name__ == "__main__":
44 | _A = parser.parse_args()
45 | print("Arguments: " + str(_A))
46 |
47 | # Register the input COCO JSON file as a Detectron2 dataset to load nicely
48 | # formatted dataset dicts for visualization.
49 | # `extra_keys` lists all additional annotation keys that may appear in generated JSON files.
50 | name = "coco_or_lvis_v1_cocofied_to_visualize"
51 | extra_keys = ["source", "source_id", "id"]
52 |
53 | DatasetCatalog.register(
54 | name, lambda: load_coco_json(_A.input_json, _A.image_dir, name, extra_keys)
55 | )
56 | # ------------------------------------------------------------------------
57 | # Fix seed for reproducible colors.
58 | np.random.seed(0)
59 |
60 | dataset_dicts = DatasetCatalog.get(name)
61 | class_names = MetadataCatalog.get("coco_2017_val").thing_classes
62 | os.makedirs(_A.output, exist_ok=True)
63 |
64 | for ddict in tqdm(dataset_dicts):
65 | if _A.class_name is not None:
66 | ddict["annotations"] = [
67 | ann
68 | for ann in ddict["annotations"]
69 | if class_names[ann["category_id"]] == _A.class_name
70 | ]
71 |
72 | if len(ddict["annotations"]) > 0:
73 | img = utils.read_image(ddict["file_name"], "RGB")
74 | visualizer = MaskVisualizer(img, class_names)
75 | vis_image = visualizer.draw_dataset_dict(
76 | ddict, _A.draw_labels, label_suffix_formatter=add_id_to_labels
77 | )
78 |
79 | # Save the visualized image.
80 | filepath = os.path.join(_A.output, os.path.basename(ddict["file_name"]))
81 | if _A.class_name is not None:
82 | filepath = filepath.replace(".jpg", f"_{_A.class_name}.jpg")
83 |
84 | if _A.filename_suffix is not None:
85 | filepath = filepath.replace(".jpg", f"_{_A.filename_suffix}.jpg")
86 |
87 | vis_image.save(filepath)
88 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from setuptools import find_packages, setup
3 |
4 | setup(
5 | name="coco_rem",
6 | version="0.1",
7 | python_requires=">=3.8",
8 | zip_safe=True,
9 | packages=find_packages(include=["coco_rem"]),
10 | )
11 |
--------------------------------------------------------------------------------