├── .gitignore ├── LICENSE ├── README.md ├── coco_rem ├── coco_evaluator.py ├── configs │ ├── README.md │ ├── common │ │ ├── coco_schedule.py │ │ ├── data │ │ │ ├── coco.py │ │ │ └── constants.py │ │ ├── models │ │ │ ├── cascade_rcnn.py │ │ │ ├── mask2former.py │ │ │ ├── mask_rcnn_fpn.py │ │ │ └── mask_rcnn_vitdet.py │ │ ├── optim.py │ │ └── train.py │ ├── convnext │ │ ├── cascade_mask_rcnn_convnext_base_1k_3x.py │ │ ├── cascade_mask_rcnn_convnext_base_22k_3x.py │ │ ├── cascade_mask_rcnn_convnext_large_22k_3x.py │ │ ├── cascade_mask_rcnn_convnext_small_1k_3x.py │ │ ├── cascade_mask_rcnn_convnext_tiny_1k_3x.py │ │ ├── cascade_mask_rcnn_convnext_xlarge_22k_3x.py │ │ └── mask_rcnn_convnext_tiny_1k_3x.py │ ├── d2lsj │ │ ├── mask_rcnn_R_101_FPN_100ep.py │ │ ├── mask_rcnn_R_101_FPN_200ep.py │ │ ├── mask_rcnn_R_101_FPN_400ep.py │ │ ├── mask_rcnn_R_50_FPN_100ep.py │ │ ├── mask_rcnn_R_50_FPN_200ep.py │ │ ├── mask_rcnn_R_50_FPN_400ep.py │ │ ├── mask_rcnn_regnetx_4gf_dds_FPN_100ep.py │ │ ├── mask_rcnn_regnetx_4gf_dds_FPN_200ep.py │ │ ├── mask_rcnn_regnetx_4gf_dds_FPN_400ep.py │ │ ├── mask_rcnn_regnety_4gf_dds_FPN_100ep.py │ │ ├── mask_rcnn_regnety_4gf_dds_FPN_200ep.py │ │ └── mask_rcnn_regnety_4gf_dds_FPN_400ep.py │ ├── d2main │ │ ├── cascade_mask_rcnn_R_50_FPN_3x.py │ │ ├── mask_rcnn_R_50_FPN_3x.py │ │ ├── scratch_mask_rcnn_R_50_FPN_9x_gn.py │ │ └── scratch_mask_rcnn_R_50_FPN_9x_syncbn.py │ ├── mask2former │ │ ├── maskformer2_R101_bs16_50ep.py │ │ ├── maskformer2_R50_bs16_50ep.py │ │ ├── maskformer2_swin_base_384_bs16_50ep.py │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.py │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.py │ │ ├── maskformer2_swin_small_bs16_50ep.py │ │ └── maskformer2_swin_tiny_bs16_50ep.py │ ├── mvitv2 │ │ ├── cascade_mask_rcnn_mvitv2_b_3x.py │ │ ├── cascade_mask_rcnn_mvitv2_b_in21k_3x.py │ │ ├── cascade_mask_rcnn_mvitv2_h_in21k_lsj_3x.py │ │ ├── cascade_mask_rcnn_mvitv2_s_3x.py │ │ ├── cascade_mask_rcnn_mvitv2_t_3x.py │ │ └── mask_rcnn_mvitv2_t_3x.py │ └── vitdet │ │ ├── cascade_mask_rcnn_mvitv2_b_in21k_100ep.py │ │ ├── cascade_mask_rcnn_mvitv2_h_in21k_36ep.py │ │ ├── cascade_mask_rcnn_mvitv2_l_in21k_50ep.py │ │ ├── cascade_mask_rcnn_swin_b_in21k_50ep.py │ │ ├── cascade_mask_rcnn_swin_l_in21k_50ep.py │ │ ├── cascade_mask_rcnn_vitdet_b_100ep.py │ │ ├── cascade_mask_rcnn_vitdet_h_75ep.py │ │ ├── cascade_mask_rcnn_vitdet_l_100ep.py │ │ ├── mask_rcnn_vitdet_b_100ep.py │ │ ├── mask_rcnn_vitdet_h_75ep.py │ │ └── mask_rcnn_vitdet_l_100ep.py ├── data │ ├── builtin.py │ ├── lvis.py │ └── manual_rem.py ├── mask_visualizer.py ├── modeling │ ├── convnext.py │ ├── rcnn_refiner.py │ └── sam_refiner.py └── trainer.py ├── images ├── coco_rem_example_1.jpg └── coco_rem_example_2.jpg ├── requirements.txt ├── scripts ├── correct_labeling_errors.py ├── merge_instances.py ├── refine_boundaries.py ├── train_net.py └── visualize_coco.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.diff 2 | 3 | # compilation and distribution 4 | __pycache__ 5 | _ext 6 | *.pyc 7 | *.pyd 8 | *.so 9 | *.dll 10 | *.egg-info/ 11 | build/ 12 | dist/ 13 | wheels/ 14 | 15 | # Python virtual environments. 
16 | .env 17 | .venv 18 | env/ 19 | venv/ 20 | ENV/ 21 | env.bak/ 22 | venv.bak/ 23 | 24 | # Jupyter Notebook 25 | .ipynb_checkpoints 26 | /.virtual_documents 27 | 28 | # IPython 29 | profile_default/ 30 | ipython_config.py 31 | 32 | 33 | # pytorch/python/numpy formats 34 | *.pth 35 | *.pkl 36 | *.npy 37 | *.pt 38 | 39 | # Editor temporaries 40 | *.swn 41 | *.swo 42 | *.swp 43 | *~ 44 | 45 | # editor settings 46 | .idea 47 | .vscode 48 | _darcs 49 | pyrightconfig.json 50 | 51 | # project dirs 52 | datasets 53 | checkpoints 54 | output 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024, Karan Desai. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial 10 | portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 15 | OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # COCO-ReM (COCO with Refined Masks) 2 | 3 | [![Framework: PyTorch](https://img.shields.io/badge/Framework-PyTorch-orange.svg)](https://pytorch.org) [![HuggingFace Datasets](https://img.shields.io/badge/%F0%9F%A4%97-HuggingFace_Datasets-cyan.svg 4 | )](https://huggingface.co/datasets/kdexd/coco-rem) 5 | 6 | [Shweta Singh](https://www.linkedin.com/in/shweta-singh-460154284/), [Aayan Yadav](https://www.linkedin.com/in/aayanyadav09/), [Jitesh Jain](https://praeclarumjj3.github.io/), [Humphrey Shi](https://www.humphreyshi.com/home), [Justin Johnson](https://web.eecs.umich.edu/~justincj/), [Karan Desai](https://kdexd.xyz/) 7 | 8 | Equal Contribution 9 | 10 | [[`arxiv`](https://arxiv.org/abs/2403.18819)] [[`Dataset Website`](https://cocorem.xyz)] 11 | 12 | ![Random examples from COCO-ReM](./images/coco_rem_example_2.jpg) 13 | 14 | Introducing COCO-ReM, a set of high-quality instance annotations for COCO images. 15 | COCO-ReM improves on imperfections prevailing in COCO-2017 such as coarse mask boundaries, non-exhaustive annotations, 16 | inconsistent handling of occlusions, and duplicate masks. 17 | Masks in COCO-ReM have a visibly better quality than COCO-2017, as shown below. 18 | 19 | ![COCO and COCO-ReM](./images/coco_rem_example_1.jpg) 20 | 21 | ## Contents 22 | 23 | 1. [News](#news) 24 | 2. [Setup Instructions](#setup-instructions) 25 | 3. [Download COCO-ReM](#download-coco-rem) 26 | 4. [Mask Visualization](#mask-visualization) 27 | 5. [Evaluation using COCO-ReM](#evaluation-using-coco-rem) 28 | 6. 
[Training with COCO-ReM](#training-with-coco-rem) 29 | 7. [Annotation Pipeline](#annotation-pipeline) 30 | - [Stage 1: Mask Boundary Refinement (automatic step)](#stage-1-mask-boundary-refinement) 31 | - [Stage 2: Exhaustive Instance Annotation (automatic step)](#stage-2-exhaustive-instance-annotation) 32 | - [Stage 3: Correction of Labeling Errors](#stage-3-correction-of-labeling-errors) 33 | 8. [Citation](#citation) 34 | 35 | ## News 36 | 37 | - **[July 7, 2024]**: Dataset now available on [**HuggingFace**](https://huggingface.co/datasets/kdexd/coco-rem) and [**code**](https://github.com/kdexd/coco-rem) is public! 38 | - **[July 1, 2024]**: COCO-ReM is accepted to ECCV 2024! 39 | - **[March 27, 2024]**: [**Dataset website**](https://cocorem.xyz) and [**arXiv preprint**](https://arxiv.org/abs/2403.18819) are public! 40 | 41 | ## Setup Instructions 42 | 43 | Clone the repository, create a conda environment, and install all dependencies as follows: 44 | 45 | ```bash 46 | git clone https://github.com/kdexd/coco-rem.git && cd coco-rem 47 | conda create -n coco_rem python=3.10 48 | conda activate coco_rem 49 | ``` 50 | 51 | Install PyTorch and `torchvision` following the instructions on [pytorch.org](https://pytorch.org). 52 | Install Detectron2; [instructions are available here](https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md). 53 | Then, install the dependencies: 54 | 55 | ```bash 56 | pip install -r requirements.txt 57 | pip install git+https://github.com/facebookresearch/segment-anything.git 58 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git 59 | 60 | python setup.py develop 61 | ``` 62 | 63 | ## Download COCO-ReM 64 | 65 | COCO-ReM is hosted on Huggingface Datasets at [@kdexd/coco-rem](https://huggingface.co/datasets/kdexd/coco-rem). 66 | Download the annotation files: 67 | 68 | ```bash 69 | for name in trainrem valrem; do 70 | wget https://huggingface.co/datasets/kdexd/coco-rem/resolve/main/instances_$name.json.zip 71 | unzip instances_$name.json.zip 72 | done 73 | ``` 74 | 75 | **Dataset organization:** COCO and COCO-ReM must be organized inside the `datasets` directory as follows. 76 | 77 | ``` 78 | $PROJECT_ROOT/datasets 79 | - coco/ 80 | - train2017/ # Contains 118287 train images (.jpg files). 81 | - val2017/ # Contains 5000 val images (.jpg files). 82 | - annotations/ 83 | - instances_train2017.json 84 | - instances_val2017.json 85 | - coco_rem/ 86 | - instances_trainrem.json 87 | - instances_valrem.json 88 | - lvis/ 89 | - lvis_v1_val.json 90 | - lvis_v1_train.json 91 | ``` 92 | 93 | ----- 94 | 95 | ## Mask Visualization 96 | 97 | We include a lightweight script to quickly visualize masks of COCO-ReM and COCO-2017, 98 | for both the validation and training sets. For example, run the following command to visualize 99 | the masks of the COCO-ReM validation set: 100 | 101 | ```bash 102 | python scripts/visualize_coco.py \ 103 | --input-json datasets/coco_rem/instances_valrem.json \ 104 | --image-dir datasets/coco/val2017 \ 105 | --output visualization_output 106 | ``` 107 | 108 | Read the documentation (`python scripts/visualize_coco.py --help`) for details about other arguments. 109 | 110 | ----- 111 | 112 | ## Evaluation using COCO-ReM 113 | 114 | We support evaluating all fifty object detectors benchmarked in the paper. 115 | First, run `python checkpoints/download.py` to download all the pre-trained models 116 | from their official repositories and save them in `checkpoints/pretrained_weights`. 
117 | 118 | For example, to evaluate a [Mask R-CNN ViTDet-B model](https://arxiv.org/abs/2203.16527) using 8 GPUs 119 | and calculate average precision (AP) metrics, run the following command: 120 | 121 | ```bash 122 | python scripts/train_net.py --num-gpus 8 --eval-only \ 123 | --config coco_rem/configs/vitdet/mask_rcnn_vitdet_b_100ep.py \ 124 | train.init_checkpoint=checkpoints/pretrained_weights/vitdet/mask_rcnn_vitdet_b_100ep.pkl \ 125 | dataloader.test.dataset.names=coco_rem_val \ 126 | train.output_dir=evaluation_results 127 | ``` 128 | 129 | ## Training with COCO-ReM 130 | 131 | We also support training ViTDet baselines on COCO-ReM using the Detectron2 library. 132 | Run the following command to train using 8 GPUs (each with at least 32GB memory): 133 | 134 | ```bash 135 | python scripts/train_net.py --num-gpus 8 \ 136 | --config coco_rem/configs/vitdet/mask_rcnn_vitdet_b_100ep.py \ 137 | dataloader.train.dataset.names=coco_rem_train \ 138 | dataloader.test.dataset.names=coco_rem_val \ 139 | train.output_dir=training_output \ 140 | dataloader.train.total_batch_size=16 train.grad_accum_steps=4 141 | ``` 142 | 143 | For GPUs with less memory, update the parameters in the last line above: 144 | halve the batch size and double the gradient accumulation steps to obtain the same results. 145 | 146 | ## Annotation Pipeline 147 | 148 | 149 | ### Stage 1: Mask Boundary Refinement (automatic step) 150 | 151 | Download the SAM checkpoint from the [segment-anything repository](https://github.com/facebookresearch/segment-anything) and place it in the `checkpoint` folder. 152 | 153 | Run the following command to refine the boundaries of validation set masks using 8 GPUs: 154 | 155 | ```bash 156 | python scripts/refine_boundaries.py \ 157 | --input-json datasets/coco/annotations/instances_val2017.json \ 158 | --image-dir datasets/coco/val2017 \ 159 | --num-gpus 8 \ 160 | --output datasets/intermediate/cocoval_boundary_refined.json 161 | ``` 162 | 163 | Read the documentation (`python scripts/refine_boundaries.py --help`) for details about other arguments. 164 | 165 | Use the default values for the other optional arguments to follow the strategy used in the [paper](https://arxiv.org/abs/2403.18819). 166 | 167 | Run this stage for both the COCO and LVIS datasets before the merging stage. 168 | 169 | 170 | 171 | ### Stage 2: Exhaustive Instance Annotation (automatic step) 172 | 173 | Run the following command to merge LVIS annotations for the COCO validation set using the strategy described in the [paper](https://arxiv.org/abs/2403.18819): 174 | 175 | ```bash 176 | python scripts/merge_instances.py \ 177 | --coco-json datasets/intermediate/cocoval_boundary_refined.json \ 178 | --lvis-json datasets/intermediate/lvistrain_boundary_refined.json datasets/intermediate/lvisval_boundary_refined.json \ 179 | --split val \ 180 | --output datasets/intermediate/cocoval_lvis_merged.json 181 | ``` 182 | Read the documentation (`python scripts/merge_instances.py --help`) for details about the above arguments. 183 | 184 | Merging LVIS instances for the handpicked non-exhaustive `(image, category)` pairs in the validation set is done in the script of the next stage. 185 | 186 | 187 | 188 | ### Stage 3: Correction of Labeling Errors 189 | 190 | This stage is performed only for the validation set. 
191 | 192 | ``` 193 | python scripts/correct_labeling_errors.py \ 194 | --input datasets/intermediate/cocoval_lvis_merged.json \ 195 | --output datasets/cocoval_refined.json 196 | ``` 197 | **Note**: For the above json to be COCO-ReM we also have to perform the manual parts of Stage 1 and Stage 2. 198 | 199 | ## Citation 200 | 201 | If you found COCO-ReM useful in your research, please consider starring ⭐ us on GitHub and citing 📚 us in your research! 202 | 203 | ```bibtex 204 | @inproceedings{cocorem, 205 | title={Benchmarking Object Detectors with COCO: A New Path Forward}, 206 | author={Singh, Shweta and Yadav, Aayan and Jain, Jitesh and Shi, Humphrey and Johnson, Justin and Desai, Karan}, 207 | journal={ECCV}, 208 | year={2024} 209 | } 210 | ``` 211 | -------------------------------------------------------------------------------- /coco_rem/coco_evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from __future__ import annotations 3 | 4 | import contextlib 5 | import copy 6 | import io 7 | import itertools 8 | import json 9 | import logging 10 | import os 11 | from collections import OrderedDict 12 | 13 | import detectron2.utils.comm as comm 14 | import numpy as np 15 | import pycocotools.mask as mask_util 16 | import torch 17 | from boundary_iou.coco_instance_api.coco import COCO 18 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 19 | from detectron2.data import MetadataCatalog 20 | from detectron2.evaluation.evaluator import DatasetEvaluator 21 | from detectron2.structures import BoxMode, Instances 22 | from detectron2.utils.file_io import PathManager 23 | from detectron2.utils.logger import create_small_table 24 | from tabulate import tabulate 25 | 26 | 27 | class COCOReMEvaluator(DatasetEvaluator): 28 | """ 29 | Evaluate AP for COCO instance segmentation. The metrics range from 0 to 100 30 | (instead of 0 to 1), where a -1 or NaN means the metric cannot be computed 31 | (e.g. due to no predictions made). 32 | 33 | See http://cocodataset.org/#detection-eval 34 | 35 | This implementation is functionally same as the original COCO evaluator of 36 | Detectron2 (:class:`detectron2.evaluation.COCOEvaluator`) except a few API 37 | and behavioral differences: 38 | 39 | 1. Only `Mask AP` and `Boundary AP` are supported, other metrics like `Box AP` 40 | and `Keypoint AP` are neither supported, nor calculated. 41 | 42 | 2. Max detections per image are always `[1, 10, 100]` following official COCO 43 | evaluation protocol, these are not customizable. 44 | 45 | 3. The official COCO evaluation API is used for calculating metrics, unlike 46 | Detectron2 that also allows using a fast, yet unofficial implementation. 47 | Hence, the calculated AP is suitable to report in research papers. 48 | """ 49 | 50 | def __init__(self, dataset_name: str, distributed: bool = True, output_dir=None): 51 | """ 52 | Args: 53 | dataset_name: Name of the dataset to be evaluated. It must have either 54 | registered metadata with a field named `json_file` which is a path 55 | to the COCO format annotation file. 56 | distributed: If True, will collect results from all ranks and run 57 | evaluation in the main process. Otherwise, will only evaluate 58 | the results in the current process. 59 | output_dir: An optional path to output directory where all results 60 | will be dumped as two files: 61 | 62 | 1. 
"instances_predictions.pth" a file that can be loaded with 63 | `torch.load` and contains all the results in the format they 64 | are produced by the model. 65 | 2. "coco_instances_results.json" in COCO result format. 66 | """ 67 | self._logger = logging.getLogger(__name__) 68 | self._distributed = distributed 69 | self._output_dir = output_dir 70 | self._cpu_device = torch.device("cpu") 71 | 72 | self._metadata = MetadataCatalog.get(dataset_name) 73 | json_file = PathManager.get_local_path(self._metadata.json_file) 74 | 75 | with contextlib.redirect_stdout(io.StringIO()): 76 | self._coco_api = COCO(json_file) 77 | 78 | def reset(self): 79 | self._predictions = [] 80 | 81 | def process(self, inputs, outputs): 82 | """ 83 | Args: 84 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). 85 | It is a list of dict. Each dict corresponds to an image and 86 | contains keys like "height", "width", "file_name", "image_id". 87 | outputs: the outputs of a COCO model. It is a list of dicts with key 88 | "instances" that contains :class:`Instances`. 89 | """ 90 | for input, output in zip(inputs, outputs): 91 | prediction = {"image_id": input["image_id"]} 92 | 93 | if "instances" in output: 94 | instances = output["instances"].to(self._cpu_device) 95 | prediction["instances"] = instances_to_coco_json( 96 | instances, input["image_id"] 97 | ) 98 | if "proposals" in output: 99 | prediction["proposals"] = output["proposals"].to(self._cpu_device) 100 | if len(prediction) > 1: 101 | self._predictions.append(prediction) 102 | 103 | def evaluate(self, img_ids=None): 104 | """ 105 | Args: 106 | img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset 107 | """ 108 | if self._distributed: 109 | comm.synchronize() 110 | predictions = comm.gather(self._predictions, dst=0) 111 | predictions = list(itertools.chain(*predictions)) 112 | 113 | if not comm.is_main_process(): 114 | return {} 115 | else: 116 | predictions = self._predictions 117 | 118 | if len(predictions) == 0: 119 | self._logger.warning("[COCOEvaluator] Did not receive valid predictions.") 120 | return {} 121 | 122 | if self._output_dir: 123 | PathManager.mkdirs(self._output_dir) 124 | file_path = os.path.join(self._output_dir, "instances_predictions.pth") 125 | with PathManager.open(file_path, "wb") as f: 126 | torch.save(predictions, f) 127 | 128 | self._results = OrderedDict() 129 | if "instances" in predictions[0]: 130 | self._eval_predictions(predictions, img_ids=img_ids) 131 | # Copy so the caller can do whatever with results 132 | return copy.deepcopy(self._results) 133 | 134 | def _eval_predictions(self, predictions, img_ids=None): 135 | """ 136 | Evaluate predictions. Fill self._results with the metrics of the tasks. 
137 | """ 138 | self._logger.info("Preparing results for COCO format ...") 139 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 140 | 141 | # unmap the category ids for COCO 142 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 143 | dataset_id_to_contiguous_id = ( 144 | self._metadata.thing_dataset_id_to_contiguous_id 145 | ) 146 | all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 147 | num_classes = len(all_contiguous_ids) 148 | assert ( 149 | min(all_contiguous_ids) == 0 150 | and max(all_contiguous_ids) == num_classes - 1 151 | ) 152 | 153 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 154 | for result in coco_results: 155 | category_id = result["category_id"] 156 | assert category_id < num_classes, ( 157 | f"A prediction has class={category_id}, " 158 | f"but the dataset only has {num_classes} classes and " 159 | f"predicted class id should be in [0, {num_classes - 1}]." 160 | ) 161 | result["category_id"] = reverse_id_mapping[category_id] 162 | 163 | if self._output_dir: 164 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 165 | self._logger.info("Saving results to {}".format(file_path)) 166 | with PathManager.open(file_path, "w") as f: 167 | f.write(json.dumps(coco_results)) 168 | f.flush() 169 | 170 | self._logger.info("Evaluating predictions with official COCO API...") 171 | 172 | for task in ["segm", "boundary"]: 173 | coco_eval = ( 174 | _evaluate_predictions_on_coco( 175 | self._coco_api, coco_results, task, img_ids=img_ids 176 | ) 177 | if len(coco_results) > 0 178 | else None # cocoapi does not handle empty results very well 179 | ) 180 | 181 | res = self._derive_coco_results( 182 | coco_eval, task, class_names=self._metadata.get("thing_classes") 183 | ) 184 | self._results[task] = res 185 | 186 | def _derive_coco_results(self, coco_eval, iou_type, class_names=None): 187 | """ 188 | Derive the desired score numbers from summarized COCOeval. 
189 | """ 190 | 191 | metrics = [ 192 | "AP", 193 | "AP50", 194 | "AP75", 195 | "AP80", 196 | "AP85", 197 | "AP90", 198 | "AP95", 199 | "APs", 200 | "APm", 201 | "APl", 202 | ] 203 | if coco_eval is None: 204 | self._logger.warn("No predictions from the model!") 205 | return {metric: float("nan") for metric in metrics} 206 | 207 | # the standard metrics 208 | results = { 209 | metric: float( 210 | coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan" 211 | ) 212 | for idx, metric in enumerate(metrics) 213 | } 214 | self._logger.info( 215 | "Evaluation results for {}: \n".format(iou_type) 216 | + create_small_table(results) 217 | ) 218 | if not np.isfinite(sum(results.values())): 219 | self._logger.info("Some metrics cannot be computed and is shown as NaN.") 220 | 221 | if class_names is None or len(class_names) <= 1: 222 | return results 223 | 224 | # Compute per-category AP 225 | precisions = coco_eval.eval["precision"] 226 | # precision has dims (iou, recall, cls, area range, max dets) 227 | assert len(class_names) == precisions.shape[2] 228 | 229 | results_per_category = [] 230 | for idx, name in enumerate(class_names): 231 | # area range index 0: all area ranges 232 | # max dets index -1: typically 100 per image 233 | precision = precisions[:, :, idx, 0, -1] 234 | precision = precision[precision > -1] 235 | ap = np.mean(precision) if precision.size else float("nan") 236 | results_per_category.append(("{}".format(name), float(ap * 100))) 237 | 238 | # tabulate it 239 | N_COLS = min(6, len(results_per_category) * 2) 240 | results_flatten = list(itertools.chain(*results_per_category)) 241 | results_2d = itertools.zip_longest( 242 | *[results_flatten[i::N_COLS] for i in range(N_COLS)] 243 | ) 244 | table = tabulate( 245 | results_2d, 246 | tablefmt="pipe", 247 | floatfmt=".3f", 248 | headers=["category", "AP"] * (N_COLS // 2), 249 | numalign="left", 250 | ) 251 | self._logger.info("Per-category {} AP: \n".format(iou_type) + table) 252 | 253 | results.update({"AP-" + name: ap for name, ap in results_per_category}) 254 | return results 255 | 256 | 257 | def instances_to_coco_json(instances: Instances, img_id: int) -> list[dict]: 258 | """ 259 | Dump an "Instances" object to a COCO-format json that's used for evaluation. 260 | """ 261 | num_instance = len(instances) 262 | if num_instance == 0: 263 | return [] 264 | 265 | boxes = instances.pred_boxes.tensor.numpy() 266 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) 267 | boxes = boxes.tolist() 268 | scores = instances.scores.tolist() 269 | classes = instances.pred_classes.tolist() 270 | 271 | has_mask = instances.has("pred_masks") 272 | if has_mask: 273 | # use RLE to encode the masks, because they are too large and takes memory 274 | # since this evaluator stores outputs of the entire dataset 275 | rles = [ 276 | mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 277 | for mask in instances.pred_masks 278 | ] 279 | for rle in rles: 280 | # "counts" is an array encoded by mask_util as a byte-stream. Python3's 281 | # json writer which always produces strings cannot serialize a bytestream 282 | # unless you decode it. Thankfully, utf-8 works out (which is also what 283 | # the pycocotools/_mask.pyx does). 
284 | rle["counts"] = rle["counts"].decode("utf-8") 285 | 286 | results = [] 287 | for k in range(num_instance): 288 | result = { 289 | "image_id": img_id, 290 | "category_id": classes[k], 291 | "bbox": boxes[k], 292 | "score": scores[k], 293 | } 294 | if has_mask: 295 | result["segmentation"] = rles[k] 296 | results.append(result) 297 | return results 298 | 299 | 300 | class COCOevalHighIoU(COCOeval): 301 | def summarize(self): 302 | """ 303 | Compute and display summary metrics for evaluation results including AP 304 | with higher IOU thresholds (0.9 and 0.95). 305 | """ 306 | 307 | def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100): 308 | p = self.params 309 | p.iouThrs = np.array( 310 | [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] 311 | ) 312 | 313 | iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}" 314 | titleStr = "Average Precision" if ap == 1 else "Average Recall" 315 | typeStr = "(AP)" if ap == 1 else "(AR)" 316 | iouStr = ( 317 | "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1]) 318 | if iouThr is None 319 | else "{:0.2f}".format(iouThr) 320 | ) 321 | 322 | aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] 323 | mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] 324 | if ap == 1: 325 | # dimension of precision: [TxRxKxAxM] 326 | s = self.eval["precision"] 327 | # IoU 328 | if iouThr is not None: 329 | t = np.where(iouThr == p.iouThrs)[0] 330 | s = s[t] 331 | s = s[:, :, :, aind, mind] 332 | else: 333 | # dimension of recall: [TxKxAxM] 334 | s = self.eval["recall"] 335 | if iouThr is not None: 336 | t = np.where(iouThr == p.iouThrs)[0] 337 | s = s[t] 338 | s = s[:, :, aind, mind] 339 | if len(s[s > -1]) == 0: 340 | mean_s = -1 341 | else: 342 | mean_s = np.mean(s[s > -1]) 343 | print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) 344 | return mean_s 345 | 346 | def _summarizeDets(): 347 | stats = np.zeros((16,)) 348 | stats[0] = _summarize(1, maxDets=self.params.maxDets[2]) 349 | stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2]) 350 | stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2]) 351 | stats[3] = _summarize(1, iouThr=0.80, maxDets=self.params.maxDets[2]) 352 | stats[4] = _summarize(1, iouThr=0.85, maxDets=self.params.maxDets[2]) 353 | stats[5] = _summarize(1, iouThr=0.90, maxDets=self.params.maxDets[2]) 354 | stats[6] = _summarize(1, iouThr=0.95, maxDets=self.params.maxDets[2]) 355 | stats[7] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2]) 356 | stats[8] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2]) 357 | stats[9] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2]) 358 | stats[10] = _summarize(0, maxDets=self.params.maxDets[0]) 359 | stats[11] = _summarize(0, maxDets=self.params.maxDets[1]) 360 | stats[12] = _summarize(0, maxDets=self.params.maxDets[2]) 361 | stats[13] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2]) 362 | stats[14] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2]) 363 | stats[15] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2]) 364 | return stats 365 | 366 | if not self.eval: 367 | raise Exception("Please run accumulate() first") 368 | 369 | self.stats = _summarizeDets() 370 | 371 | def __str__(self): 372 | self.summarize() 373 | 374 | 375 | def _evaluate_predictions_on_coco(coco_gt, coco_results, iou_type, img_ids=None): 376 | """ 377 | Evaluate the coco results using COCOEval API. 
378 | """ 379 | assert len(coco_results) > 0 380 | 381 | if iou_type in {"segm", "boundary"}: 382 | coco_results = copy.deepcopy(coco_results) 383 | # When evaluating mask AP, if the results contain bbox, cocoapi will 384 | # use the box area as the area of the instance, instead of the mask area. 385 | # This leads to a different definition of small/medium/large. 386 | # We remove the bbox field to let mask AP use mask area. 387 | for c in coco_results: 388 | c.pop("bbox", None) 389 | 390 | coco_dt = coco_gt.loadRes(coco_results) 391 | coco_eval = COCOevalHighIoU(coco_gt, coco_dt, iou_type) 392 | 393 | if img_ids is not None: 394 | coco_eval.params.imgIds = img_ids 395 | 396 | coco_eval.evaluate() 397 | coco_eval.accumulate() 398 | coco_eval.summarize() 399 | 400 | return coco_eval 401 | -------------------------------------------------------------------------------- /coco_rem/configs/README.md: -------------------------------------------------------------------------------- 1 | # Model Configs for Benchmarking 2 | 3 | Each sub-directory contains Detectron2 config files (`LazyConfig` format) for 4 | all model checkpoints from public Github repos building with Detectron2. 5 | 6 | - `d2main`: Detectron2 model zoo (initial baselines). 7 | - `d2lsj`: Detectron2 model zoo (new LSJ baselines). 8 | - `vitdet`: https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet 9 | - `convnext`: https://github.com/facebookresearch/convnext 10 | - `mvitv2`: https://github.com/facebookresearch/detectron2/tree/main/projects/MViTv2 11 | - `mask2former`: https://github.com/facebookresearch/Mask2Former 12 | 13 | Additionally, `common` directory has config objects that are shared across many 14 | config files. 15 | 16 | ### Note on config structure 17 | 18 | Detectron2 lazy configs are described in the official Detectron2 documentation 19 | [here](https://detectron2.readthedocs.io/en/latest/tutorials/lazyconfigs.html). 20 | Each config file requires five objects: `dataloader`, `model`, `optimizer`, 21 | `lr_multiplier`, `train`. Some configs may exclude two objects that are not 22 | required for evaluation - `optimizer` and `lr_multiplier`. 23 | -------------------------------------------------------------------------------- /coco_rem/configs/common/coco_schedule.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.solver import WarmupParamScheduler 3 | from fvcore.common.param_scheduler import MultiStepParamScheduler 4 | 5 | 6 | def default_lsj_epoch_scheduler(epochs: int): 7 | """ 8 | Returns the config for a default multi-step LR scheduler that runs for fixed 9 | amount of COCO epochs, typically used with models using "LSJ" augmentations 10 | and training schedule (large-scale jittering augmentation and 50-400 epochs). 
11 | """ 12 | 13 | coco_100ep_iter = 184375 14 | coco_curr_iter = coco_100ep_iter * epochs // 100 15 | 16 | coco_100ep_milestones = [163889, 177546] 17 | coco_curr_milestones = [x * epochs // 100 for x in coco_100ep_milestones] 18 | 19 | lr_multiplier = L(WarmupParamScheduler)( 20 | scheduler=L(MultiStepParamScheduler)( 21 | values=[1.0, 0.1, 0.01], 22 | milestones=coco_curr_milestones, 23 | num_updates=coco_curr_iter, 24 | ), 25 | warmup_length=250 / coco_curr_iter, 26 | warmup_factor=0.001, 27 | ) 28 | return lr_multiplier 29 | 30 | 31 | lr_multiplier_75ep = default_lsj_epoch_scheduler(75) 32 | lr_multiplier_100ep = default_lsj_epoch_scheduler(100) 33 | lr_multiplier_200ep = default_lsj_epoch_scheduler(200) 34 | lr_multiplier_400ep = default_lsj_epoch_scheduler(400) 35 | -------------------------------------------------------------------------------- /coco_rem/configs/common/data/coco.py: -------------------------------------------------------------------------------- 1 | import detectron2.data.transforms as T 2 | from detectron2.config import LazyCall as L 3 | from detectron2.data import ( 4 | DatasetMapper, 5 | build_detection_test_loader, 6 | build_detection_train_loader, 7 | get_detection_dataset_dicts, 8 | ) 9 | from omegaconf import OmegaConf 10 | 11 | from coco_rem.coco_evaluator import COCOReMEvaluator 12 | 13 | dataloader = OmegaConf.create() 14 | 15 | # Mapper with large-scale jittering (LSJ) augmentation. 16 | image_size = 1024 17 | 18 | dataloader.train = L(build_detection_train_loader)( 19 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"), 20 | mapper=L(DatasetMapper)( 21 | is_train=True, 22 | augmentations=[ 23 | L(T.RandomFlip)(horizontal=True), # flip first 24 | L(T.ResizeScale)( 25 | min_scale=0.1, 26 | max_scale=2.0, 27 | target_height=image_size, 28 | target_width=image_size, 29 | ), 30 | L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), 31 | ], 32 | image_format="RGB", 33 | use_instance_mask=True, 34 | instance_mask_format="bitmask", 35 | recompute_boxes=True, 36 | ), 37 | total_batch_size=64, 38 | num_workers=4, 39 | ) 40 | 41 | # Resize shortest edge to 1024 pixels. 42 | dataloader.test = L(build_detection_test_loader)( 43 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False), 44 | mapper=L(DatasetMapper)( 45 | is_train=False, 46 | augmentations=[ 47 | L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), 48 | ], 49 | image_format="${...train.mapper.image_format}", 50 | ), 51 | num_workers=4, 52 | ) 53 | 54 | # Update: Custom COCO evaluator that returns exactly same results as default 55 | # evaluator, with additionally returning AP90, AP95, and Boundary AP. 56 | dataloader.evaluator = L(COCOReMEvaluator)( 57 | dataset_name="${..test.dataset.names}", 58 | output_dir="${...train.output_dir}", 59 | ) 60 | -------------------------------------------------------------------------------- /coco_rem/configs/common/data/constants.py: -------------------------------------------------------------------------------- 1 | constants = dict( 2 | imagenet_rgb256_mean=[123.675, 116.28, 103.53], 3 | imagenet_rgb256_std=[58.395, 57.12, 57.375], 4 | imagenet_bgr256_mean=[103.530, 116.280, 123.675], 5 | # When using pre-trained models in Detectron1 or any MSRA models, 6 | # std has been absorbed into its conv1 weights, so the std needs to be set 1. 
7 | # Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std) 8 | imagenet_bgr256_std=[1.0, 1.0, 1.0], 9 | ) 10 | -------------------------------------------------------------------------------- /coco_rem/configs/common/models/cascade_rcnn.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.box_regression import Box2BoxTransform 4 | from detectron2.modeling.matcher import Matcher 5 | from detectron2.modeling.roi_heads import ( 6 | CascadeROIHeads, 7 | FastRCNNConvFCHead, 8 | FastRCNNOutputLayers, 9 | ) 10 | 11 | from .mask_rcnn_fpn import model 12 | 13 | # arguments that don't exist for Cascade R-CNN 14 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] 15 | 16 | model.roi_heads.update( 17 | _target_=CascadeROIHeads, 18 | box_heads=[ 19 | L(FastRCNNConvFCHead)( 20 | input_shape=ShapeSpec(channels=256, height=7, width=7), 21 | conv_dims=[], 22 | fc_dims=[1024, 1024], 23 | ) 24 | for k in range(3) 25 | ], 26 | box_predictors=[ 27 | L(FastRCNNOutputLayers)( 28 | input_shape=ShapeSpec(channels=1024), 29 | test_score_thresh=0.05, 30 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 31 | cls_agnostic_bbox_reg=True, 32 | num_classes="${...num_classes}", 33 | ) 34 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 35 | ], 36 | proposal_matchers=[ 37 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) 38 | for th in [0.5, 0.6, 0.7] 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /coco_rem/configs/common/models/mask2former.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.backbone import BasicStem, ResNet 4 | from mask2former.maskformer_model import MaskFormer 5 | from mask2former.modeling.meta_arch.mask_former_head import MaskFormerHead 6 | from mask2former.modeling.pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 7 | from mask2former.modeling.transformer_decoder import MultiScaleMaskedTransformerDecoder 8 | 9 | from ..data.constants import constants 10 | 11 | model = L(MaskFormer)( 12 | backbone=L(ResNet)( 13 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), 14 | stages=L(ResNet.make_default_stages)( 15 | depth=50, 16 | stride_in_1x1=False, 17 | norm="FrozenBN", 18 | ), 19 | out_features=["res2", "res3", "res4", "res5"], 20 | ), 21 | sem_seg_head=L(MaskFormerHead)( 22 | input_shape={ 23 | "res2": L(ShapeSpec)(channels=256, stride=4), 24 | "res3": L(ShapeSpec)(channels=512, stride=8), 25 | "res4": L(ShapeSpec)(channels=1024, stride=16), 26 | "res5": L(ShapeSpec)(channels=2048, stride=32), 27 | }, 28 | num_classes=80, 29 | pixel_decoder=L(MSDeformAttnPixelDecoder)( 30 | input_shape="${..input_shape}", 31 | transformer_dropout=0.0, 32 | transformer_nheads=8, 33 | transformer_dim_feedforward=1024, 34 | transformer_enc_layers=6, 35 | conv_dim=256, 36 | mask_dim=256, 37 | norm="GN", 38 | transformer_in_features=["res3", "res4", "res5"], 39 | common_stride=4, 40 | ), 41 | loss_weight=1.0, 42 | ignore_value=255, 43 | transformer_predictor=L(MultiScaleMaskedTransformerDecoder)( 44 | in_channels="${..pixel_decoder.conv_dim}", 45 | mask_classification=True, 46 | num_classes="${..num_classes}", 47 | hidden_dim="${..pixel_decoder.conv_dim}", 48 
| num_queries="${...num_queries}", 49 | nheads=8, 50 | dim_feedforward=2048, 51 | dec_layers=9, 52 | pre_norm=False, 53 | mask_dim="${..pixel_decoder.mask_dim}", 54 | enforce_input_project=False, 55 | ), 56 | transformer_in_feature="multi_scale_pixel_decoder", 57 | ), 58 | criterion=None, 59 | num_queries=100, 60 | metadata=None, 61 | size_divisibility=32, 62 | sem_seg_postprocess_before_inference=True, 63 | object_mask_threshold=0.8, 64 | overlap_threshold=0.8, 65 | instance_on=True, 66 | semantic_on=False, 67 | panoptic_on=False, 68 | pixel_mean=constants.imagenet_rgb256_mean, 69 | pixel_std=constants.imagenet_rgb256_std, 70 | test_topk_per_image=100, 71 | ) 72 | -------------------------------------------------------------------------------- /coco_rem/configs/common/models/mask_rcnn_fpn.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator 4 | from detectron2.modeling.backbone import FPN, BasicStem, ResNet 5 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 6 | from detectron2.modeling.box_regression import Box2BoxTransform 7 | from detectron2.modeling.matcher import Matcher 8 | from detectron2.modeling.meta_arch import GeneralizedRCNN 9 | from detectron2.modeling.poolers import ROIPooler 10 | from detectron2.modeling.proposal_generator import RPN, StandardRPNHead 11 | from detectron2.modeling.roi_heads import ( 12 | FastRCNNConvFCHead, 13 | FastRCNNOutputLayers, 14 | MaskRCNNConvUpsampleHead, 15 | StandardROIHeads, 16 | ) 17 | 18 | from ..data.constants import constants 19 | 20 | model = L(GeneralizedRCNN)( 21 | backbone=L(FPN)( 22 | bottom_up=L(ResNet)( 23 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), 24 | stages=L(ResNet.make_default_stages)( 25 | depth=50, 26 | stride_in_1x1=True, 27 | norm="FrozenBN", 28 | ), 29 | out_features=["res2", "res3", "res4", "res5"], 30 | ), 31 | in_features="${.bottom_up.out_features}", 32 | out_channels=256, 33 | top_block=L(LastLevelMaxPool)(), 34 | ), 35 | proposal_generator=L(RPN)( 36 | in_features=["p2", "p3", "p4", "p5", "p6"], 37 | head=L(StandardRPNHead)(in_channels=256, num_anchors=3), 38 | anchor_generator=L(DefaultAnchorGenerator)( 39 | sizes=[[32], [64], [128], [256], [512]], 40 | aspect_ratios=[0.5, 1.0, 2.0], 41 | strides=[4, 8, 16, 32, 64], 42 | offset=0.0, 43 | ), 44 | anchor_matcher=L(Matcher)( 45 | thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True 46 | ), 47 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]), 48 | batch_size_per_image=256, 49 | positive_fraction=0.5, 50 | pre_nms_topk=(2000, 1000), 51 | post_nms_topk=(1000, 1000), 52 | nms_thresh=0.7, 53 | ), 54 | roi_heads=L(StandardROIHeads)( 55 | num_classes=80, 56 | batch_size_per_image=512, 57 | positive_fraction=0.25, 58 | proposal_matcher=L(Matcher)( 59 | thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False 60 | ), 61 | box_in_features=["p2", "p3", "p4", "p5"], 62 | box_pooler=L(ROIPooler)( 63 | output_size=7, 64 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), 65 | sampling_ratio=0, 66 | pooler_type="ROIAlignV2", 67 | ), 68 | box_head=L(FastRCNNConvFCHead)( 69 | input_shape=ShapeSpec(channels=256, height=7, width=7), 70 | conv_dims=[], 71 | fc_dims=[1024, 1024], 72 | ), 73 | box_predictor=L(FastRCNNOutputLayers)( 74 | input_shape=ShapeSpec(channels=1024), 75 | test_score_thresh=0.05, 76 | 
box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)), 77 | num_classes="${..num_classes}", 78 | ), 79 | mask_in_features=["p2", "p3", "p4", "p5"], 80 | mask_pooler=L(ROIPooler)( 81 | output_size=14, 82 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), 83 | sampling_ratio=0, 84 | pooler_type="ROIAlignV2", 85 | ), 86 | mask_head=L(MaskRCNNConvUpsampleHead)( 87 | input_shape=ShapeSpec(channels=256, width=14, height=14), 88 | num_classes="${..num_classes}", 89 | conv_dims=[256, 256, 256, 256, 256], 90 | ), 91 | ), 92 | pixel_mean=constants.imagenet_rgb256_mean, 93 | pixel_std=constants.imagenet_rgb256_std, 94 | input_format="RGB", 95 | ) 96 | -------------------------------------------------------------------------------- /coco_rem/configs/common/models/mask_rcnn_vitdet.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2.config import LazyCall as L 5 | from detectron2.modeling import SimpleFeaturePyramid, ViT 6 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 7 | 8 | from .mask_rcnn_fpn import model 9 | 10 | # Base 11 | embed_dim, depth, num_heads, dp = 768, 12, 12, 0.1 12 | # Creates Simple Feature Pyramid from ViT backbone 13 | model.backbone = L(SimpleFeaturePyramid)( 14 | net=L(ViT)( # Single-scale ViT backbone 15 | img_size=1024, 16 | patch_size=16, 17 | embed_dim=embed_dim, 18 | depth=depth, 19 | num_heads=num_heads, 20 | drop_path_rate=dp, 21 | window_size=14, 22 | mlp_ratio=4, 23 | qkv_bias=True, 24 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 25 | window_block_indexes=[ 26 | # 2, 5, 8 11 for global attention 27 | 0, 28 | 1, 29 | 3, 30 | 4, 31 | 6, 32 | 7, 33 | 9, 34 | 10, 35 | ], 36 | residual_block_indexes=[], 37 | use_rel_pos=True, 38 | out_feature="last_feat", 39 | ), 40 | in_feature="${.net.out_feature}", 41 | out_channels=256, 42 | scale_factors=(4.0, 2.0, 1.0, 0.5), 43 | top_block=L(LastLevelMaxPool)(), 44 | norm="LN", 45 | square_pad=1024, 46 | ) 47 | 48 | model.roi_heads.box_head.conv_norm = model.roi_heads.mask_head.conv_norm = "LN" 49 | 50 | # 2conv in RPN: 51 | model.proposal_generator.head.conv_dims = [-1, -1] 52 | 53 | # 4conv1fc box head 54 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256] 55 | model.roi_heads.box_head.fc_dims = [1024] 56 | -------------------------------------------------------------------------------- /coco_rem/configs/common/optim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from detectron2.config import LazyCall as L 3 | from detectron2.solver.build import get_default_optimizer_params 4 | 5 | SGD = L(torch.optim.SGD)( 6 | params=L(get_default_optimizer_params)( 7 | # params.model is meant to be set to the model object, before instantiating 8 | # the optimizer. 9 | weight_decay_norm=0.0 10 | ), 11 | lr=0.02, 12 | momentum=0.9, 13 | weight_decay=1e-4, 14 | ) 15 | 16 | 17 | AdamW = L(torch.optim.AdamW)( 18 | params=L(get_default_optimizer_params)( 19 | # params.model is meant to be set to the model object, before instantiating 20 | # the optimizer. 
21 | base_lr="${..lr}", 22 | weight_decay_norm=0.0, 23 | ), 24 | lr=1e-4, 25 | betas=(0.9, 0.999), 26 | weight_decay=0.1, 27 | ) 28 | -------------------------------------------------------------------------------- /coco_rem/configs/common/train.py: -------------------------------------------------------------------------------- 1 | # Common training-related configs that are designed for "scripts/evaluate.py" 2 | # You can use your own instead, together with your own train_net.py 3 | train = dict( 4 | output_dir="./output", 5 | init_checkpoint="", 6 | max_iter=90000, 7 | amp=dict(enabled=True), # options for Automatic Mixed Precision 8 | ddp=dict( # options for DistributedDataParallel 9 | broadcast_buffers=False, 10 | find_unused_parameters=False, 11 | fp16_compression=False, 12 | ), 13 | checkpointer=dict(period=5000, max_to_keep=100), # options for PeriodicCheckpointer 14 | eval_period=5000, 15 | log_period=20, 16 | device="cuda" 17 | # ... 18 | ) 19 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_base_1k_3x.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.box_regression import Box2BoxTransform 4 | from detectron2.modeling.roi_heads import FastRCNNConvFCHead, FastRCNNOutputLayers 5 | 6 | from coco_rem.modeling.convnext import ConvNeXt 7 | 8 | from ..common.data.coco import dataloader 9 | from ..common.data.constants import constants 10 | from ..common.models.cascade_rcnn import model 11 | from ..common.train import train 12 | 13 | model.backbone.bottom_up = L(ConvNeXt)( 14 | in_chans=3, 15 | depths=[3, 3, 27, 3], 16 | dims=[128, 256, 512, 1024], 17 | drop_path_rate=0.7, 18 | layer_scale_init_value=1.0, 19 | out_features=["res2", "res3", "res4", "res5"], 20 | ) 21 | 22 | model.roi_heads.update( 23 | # 4conv1fc box heads with BatchNorm 24 | box_heads=[ 25 | L(FastRCNNConvFCHead)( 26 | input_shape=ShapeSpec(channels=256, height=7, width=7), 27 | conv_dims=[256, 256, 256, 256], 28 | fc_dims=[1024], 29 | conv_norm="SyncBN", 30 | ) 31 | for k in range(3) 32 | ], 33 | box_predictors=[ 34 | L(FastRCNNOutputLayers)( 35 | input_shape=ShapeSpec(channels=1024), 36 | test_score_thresh=0.05, 37 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 38 | # 39 | # Cascade R-CNN implementation in Detectron2 has class-agnostic box reg 40 | # but checkpoints from ConvNext repo (MMDetection) use class-specific. 41 | cls_agnostic_bbox_reg=False, 42 | num_classes="${...num_classes}", 43 | ) 44 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 45 | ], 46 | ) 47 | 48 | train.init_checkpoint = None # Load externally. 49 | train.max_iter *= 3 50 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_base_22k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train 2 | 3 | # This config is IDENTICAL to ConvNeXt-B (ImageNet-1K) - the only difference 4 | # is pre-training dataset of backbone (ImageNet-22K vs 1K) but weights in, 5 | # `train.init_checkpoint` (to be provided externally) override everything. 
6 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_large_22k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.dims = [192, 384, 768, 1536] 4 | model.backbone.bottom_up.drop_path_rate = 0.7 5 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_small_1k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.dims = [96, 192, 384, 768] 4 | model.backbone.bottom_up.drop_path_rate = 0.6 5 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_tiny_1k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.depths = [3, 3, 9, 3] 4 | model.backbone.bottom_up.dims = [96, 192, 384, 768] 5 | model.backbone.bottom_up.drop_path_rate = 0.4 6 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_xlarge_22k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.dims = [256, 512, 1024, 2048] 4 | model.backbone.bottom_up.drop_path_rate = 0.8 5 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/mask_rcnn_convnext_tiny_1k_3x.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | 3 | from coco_rem.modeling.convnext import ConvNeXt 4 | 5 | from ..common.data.coco import dataloader 6 | from ..common.data.constants import constants 7 | from ..common.models.mask_rcnn_fpn import model 8 | from ..common.train import train 9 | 10 | model.backbone.bottom_up = L(ConvNeXt)( 11 | in_chans=3, 12 | depths=[3, 3, 9, 3], 13 | dims=[96, 192, 384, 768], 14 | drop_path_rate=0.4, 15 | layer_scale_init_value=1.0, 16 | out_features=["res2", "res3", "res4", "res5"], 17 | ) 18 | 19 | train.init_checkpoint = None # Load externally. 
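# 3x schedule: triples the 90k-iteration base schedule from common/train.py (270k iterations total).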
20 | train.max_iter *= 3 21 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_101_FPN_100ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train 2 | 3 | model.backbone.bottom_up.stages.depth = 101 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_101_FPN_200ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_R_101_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 2 # 100ep -> 200ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_101_FPN_400ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_R_101_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 4 # 100ep -> 400ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_50_FPN_100ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers.batch_norm import NaiveSyncBatchNorm 2 | 3 | from ..common.coco_schedule import lr_multiplier_100ep as lr_multiplier 4 | from ..common.data.coco import dataloader 5 | from ..common.data.constants import constants 6 | from ..common.models.mask_rcnn_fpn import model 7 | from ..common.optim import SGD as optimizer 8 | from ..common.train import train 9 | 10 | dataloader.train.mapper.image_format = "BGR" 11 | model.pixel_mean = constants.imagenet_bgr256_mean 12 | model.pixel_std = constants.imagenet_bgr256_std 13 | model.input_format = "BGR" 14 | 15 | # train from scratch 16 | train.init_checkpoint = "" 17 | train.amp.enabled = True 18 | train.ddp.fp16_compression = True 19 | model.backbone.bottom_up.freeze_at = 0 20 | 21 | # SyncBN 22 | model.backbone.bottom_up.stem.norm = "SyncBN" 23 | model.backbone.bottom_up.stages.norm = "SyncBN" 24 | model.backbone.norm = "SyncBN" 25 | 26 | # Using NaiveSyncBatchNorm because heads may have empty input. That is not supported by 27 | # torch.nn.SyncBatchNorm. We can remove this after 28 | # https://github.com/pytorch/pytorch/issues/36530 is fixed. 29 | model.roi_heads.box_head.conv_norm = lambda c: NaiveSyncBatchNorm(c, stats_mode="N") 30 | model.roi_heads.mask_head.conv_norm = lambda c: NaiveSyncBatchNorm(c, stats_mode="N") 31 | 32 | # 2conv in RPN: 33 | model.proposal_generator.head.conv_dims = [-1, -1] 34 | 35 | # 4conv1fc box head 36 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256] 37 | model.roi_heads.box_head.fc_dims = [1024] 38 | 39 | # Equivalent to 100 epochs. 
40 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 41 | train.max_iter = 184375 42 | 43 | optimizer.lr = 0.1 44 | optimizer.weight_decay = 4e-5 45 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_50_FPN_200ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 2 # 100ep -> 200ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_50_FPN_400ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 4 # 100ep -> 400ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnetx_4gf_dds_FPN_100ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.modeling.backbone import RegNet 3 | from detectron2.modeling.backbone.regnet import ResBottleneckBlock, SimpleStem 4 | 5 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train 6 | 7 | # Config source: 8 | model.backbone.bottom_up = L(RegNet)( 9 | stem_class=SimpleStem, 10 | stem_width=32, 11 | block_class=ResBottleneckBlock, 12 | depth=23, 13 | w_a=38.65, 14 | w_0=96, 15 | w_m=2.43, 16 | group_width=40, 17 | norm="SyncBN", 18 | out_features=["s1", "s2", "s3", "s4"], 19 | ) 20 | model.pixel_std = [57.375, 57.120, 58.395] 21 | 22 | # RegNets benefit from enabling cudnn benchmark mode 23 | train.cudnn_benchmark = True 24 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnetx_4gf_dds_FPN_200ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_regnetx_4gf_dds_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 2 # 100ep -> 200ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnetx_4gf_dds_FPN_400ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_regnetx_4gf_dds_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 4 # 100ep -> 400ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnety_4gf_dds_FPN_100ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.modeling.backbone import RegNet 3 | from detectron2.modeling.backbone.regnet import ResBottleneckBlock, SimpleStem 4 | 5 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train 6 | 7 | model.backbone.bottom_up = L(RegNet)( 8 | stem_class=SimpleStem, 9 | stem_width=32, 10 | block_class=ResBottleneckBlock, 11 | depth=22, 12 | w_a=31.41, 13 | w_0=96, 14 | w_m=2.24, 15 | group_width=64, 16 | se_ratio=0.25, 17 | norm="SyncBN", 18 | out_features=["s1", "s2", "s3", "s4"], 19 | ) 20 | model.pixel_std = [57.375, 57.120, 58.395] 21 | 22 | # RegNets benefit from enabling cudnn benchmark mode 23 | train.cudnn_benchmark = True 24 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnety_4gf_dds_FPN_200ep.py: 
-------------------------------------------------------------------------------- 1 | from .mask_rcnn_regnety_4gf_dds_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 2 # 100ep -> 200ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnety_4gf_dds_FPN_400ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_regnety_4gf_dds_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 4 # 100ep -> 400ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2main/cascade_mask_rcnn_R_50_FPN_3x.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from ..common.data.constants import constants 3 | from ..common.models.cascade_rcnn import model 4 | from ..common.train import train 5 | 6 | dataloader.train.mapper.image_format = "BGR" 7 | model.pixel_mean = constants.imagenet_bgr256_mean 8 | model.pixel_std = constants.imagenet_bgr256_std 9 | model.input_format = "BGR" 10 | 11 | train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 12 | train.max_iter *= 3 13 | -------------------------------------------------------------------------------- /coco_rem/configs/d2main/mask_rcnn_R_50_FPN_3x.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from ..common.data.constants import constants 3 | from ..common.models.mask_rcnn_fpn import model 4 | from ..common.train import train 5 | 6 | dataloader.train.mapper.image_format = "BGR" 7 | model.pixel_mean = constants.imagenet_bgr256_mean 8 | model.pixel_std = constants.imagenet_bgr256_std 9 | model.input_format = "BGR" 10 | 11 | model.backbone.bottom_up.freeze_at = 2 12 | train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 13 | train.max_iter *= 3 14 | -------------------------------------------------------------------------------- /coco_rem/configs/d2main/scratch_mask_rcnn_R_50_FPN_9x_gn.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from ..common.data.constants import constants 3 | from ..common.models.mask_rcnn_fpn import model 4 | from ..common.train import train 5 | 6 | dataloader.train.mapper.image_format = "BGR" 7 | model.pixel_mean = constants.imagenet_bgr256_mean 8 | model.pixel_std = constants.imagenet_bgr256_std 9 | model.input_format = "BGR" 10 | 11 | # Handle Caffe2 model specs: 12 | model.backbone.bottom_up.stages.stride_in_1x1 = False 13 | model.pixel_std = [57.375, 57.120, 58.395] 14 | 15 | model.backbone.bottom_up.stem.norm = "GN" 16 | model.backbone.bottom_up.stages.norm = "GN" 17 | model.backbone.norm = "GN" 18 | model.roi_heads.box_head.conv_norm = "GN" 19 | model.roi_heads.mask_head.conv_norm = "GN" 20 | 21 | # 4conv1fc box head 22 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256] 23 | model.roi_heads.box_head.fc_dims = [1024] 24 | 25 | dataloader.train.total_batch_size = 64 26 | train.max_iter *= 9 27 | -------------------------------------------------------------------------------- /coco_rem/configs/d2main/scratch_mask_rcnn_R_50_FPN_9x_syncbn.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from ..common.data.constants import constants 3 | from 
..common.models.mask_rcnn_fpn import model 4 | from ..common.train import train 5 | 6 | dataloader.train.mapper.image_format = "BGR" 7 | model.pixel_mean = constants.imagenet_bgr256_mean 8 | model.pixel_std = constants.imagenet_bgr256_std 9 | model.input_format = "BGR" 10 | 11 | # Handle Caffe2 model specs: 12 | model.backbone.bottom_up.stages.stride_in_1x1 = False 13 | model.pixel_std = [57.375, 57.120, 58.395] 14 | 15 | model.backbone.bottom_up.stem.norm = "SyncBN" 16 | model.backbone.bottom_up.stages.norm = "SyncBN" 17 | model.backbone.norm = "SyncBN" 18 | model.roi_heads.box_head.conv_norm = "SyncBN" 19 | model.roi_heads.mask_head.conv_norm = "SyncBN" 20 | 21 | # 4conv1fc box head 22 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256] 23 | model.roi_heads.box_head.fc_dims = [1024] 24 | 25 | dataloader.train.total_batch_size = 64 26 | train.max_iter *= 9 27 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_R101_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from .maskformer2_R50_bs16_50ep import dataloader, model, train 2 | 3 | model.backbone.stages.depth = 101 4 | train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 5 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_R50_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from ..common.models.mask2former import model 3 | from ..common.train import train 4 | 5 | # Initialization and trainer settings 6 | train.init_checkpoint = "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 7 | 8 | # Schedule 9 | # 50 ep = 368750 iters * 16 images/iter / 118000 images/ep 10 | dataloader.train.total_batch_size = 16 11 | train.max_iter = 368750 12 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_swin_base_384_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | 4 | from .maskformer2_swin_tiny_bs16_50ep import dataloader, model, train 5 | 6 | model.backbone.depths = [2, 2, 18, 2] 7 | model.backbone.num_heads = [4, 8, 16, 32] 8 | model.backbone.window_size = 12 9 | model.backbone.embed_dim = 128 10 | model.backbone.pretrain_img_size = 384 11 | 12 | model.sem_seg_head.pixel_decoder.input_shape = { 13 | "p0": L(ShapeSpec)(channels=128, stride=4), 14 | "p1": L(ShapeSpec)(channels=256, stride=8), 15 | "p2": L(ShapeSpec)(channels=512, stride=16), 16 | "p3": L(ShapeSpec)(channels=1024, stride=32), 17 | } 18 | 19 | train.init_checkpoint = ( 20 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window12_384.pth" 21 | ) 22 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_swin_base_IN21k_384_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from .maskformer2_swin_base_384_bs16_50ep import dataloader, model, train 2 | 3 | train.init_checkpoint = ( 4 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window12_384_22k.pth" 5 | ) 6 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_swin_large_IN21k_384_bs16_100ep.py: 
-------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | 4 | from .maskformer2_swin_base_384_bs16_50ep import dataloader, model, train 5 | 6 | model.num_queries = 200 7 | model.backbone.num_heads = [6, 12, 24, 48] 8 | model.backbone.embed_dim = 192 9 | 10 | model.sem_seg_head.pixel_decoder.input_shape = { 11 | "p0": L(ShapeSpec)(channels=192, stride=4), 12 | "p1": L(ShapeSpec)(channels=384, stride=8), 13 | "p2": L(ShapeSpec)(channels=768, stride=16), 14 | "p3": L(ShapeSpec)(channels=1536, stride=32), 15 | } 16 | 17 | train.max_iter *= 2 18 | train.init_checkpoint = ( 19 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window12_384_22k.pth" 20 | ) 21 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_swin_small_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from .maskformer2_swin_tiny_bs16_50ep import dataloader, model, train 2 | 3 | model.backbone.depths = [2, 2, 18, 2] 4 | 5 | train.init_checkpoint = ( 6 | "detectron2://ImageNetPretrained/swin/swin_small_patch4_window7_224.pth" 7 | ) 8 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_swin_tiny_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling import SwinTransformer 4 | 5 | from .maskformer2_R50_bs16_50ep import dataloader, model, train 6 | 7 | model.backbone = L(SwinTransformer)( 8 | depths=[2, 2, 6, 2], 9 | embed_dim=96, 10 | num_heads=[3, 6, 12, 24], 11 | drop_path_rate=0.3, 12 | ) 13 | 14 | model.sem_seg_head.pixel_decoder.input_shape = { 15 | "p0": L(ShapeSpec)(channels=96, stride=4), 16 | "p1": L(ShapeSpec)(channels=192, stride=8), 17 | "p2": L(ShapeSpec)(channels=384, stride=16), 18 | "p3": L(ShapeSpec)(channels=768, stride=32), 19 | } 20 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["p1", "p2", "p3"] 21 | 22 | train.init_checkpoint = ( 23 | "detectron2://ImageNetPretrained/swin/swin_tiny_patch4_window7_224.pth" 24 | ) 25 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_b_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_mvitv2_t_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.depth = 24 4 | model.backbone.bottom_up.last_block_indexes = (1, 4, 20, 23) 5 | model.backbone.bottom_up.drop_path_rate = 0.4 6 | 7 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in1k.pyth" 8 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_b_in21k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_mvitv2_b_3x import dataloader, model, train 2 | 3 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in21k.pyth" 4 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_h_in21k_lsj_3x.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from 
.cascade_mask_rcnn_mvitv2_b_3x import model, train 3 | 4 | model.backbone.bottom_up.embed_dim = 192 5 | model.backbone.bottom_up.depth = 80 6 | model.backbone.bottom_up.num_heads = 3 7 | model.backbone.bottom_up.last_block_indexes = (3, 11, 71, 79) 8 | model.backbone.bottom_up.drop_path_rate = 0.6 9 | model.backbone.bottom_up.use_act_checkpoint = True 10 | 11 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_H_in21k.pyth" 12 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_s_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_mvitv2_t_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.depth = 16 4 | model.backbone.bottom_up.last_block_indexes = (0, 2, 13, 15) 5 | 6 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_S_in1k.pyth" 7 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_t_3x.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.box_regression import Box2BoxTransform 4 | from detectron2.modeling.matcher import Matcher 5 | from detectron2.modeling.roi_heads import ( 6 | CascadeROIHeads, 7 | FastRCNNConvFCHead, 8 | FastRCNNOutputLayers, 9 | ) 10 | 11 | from .mask_rcnn_mvitv2_t_3x import dataloader, model, train 12 | 13 | # arguments that don't exist for Cascade R-CNN 14 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] 15 | 16 | model.roi_heads.update( 17 | _target_=CascadeROIHeads, 18 | box_heads=[ 19 | L(FastRCNNConvFCHead)( 20 | input_shape=ShapeSpec(channels=256, height=7, width=7), 21 | conv_dims=[256, 256, 256, 256], 22 | fc_dims=[1024], 23 | conv_norm="SyncBN", 24 | ) 25 | for _ in range(3) 26 | ], 27 | box_predictors=[ 28 | L(FastRCNNOutputLayers)( 29 | input_shape=ShapeSpec(channels=1024), 30 | test_score_thresh=0.05, 31 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 32 | cls_agnostic_bbox_reg=True, 33 | num_classes="${...num_classes}", 34 | ) 35 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 36 | ], 37 | proposal_matchers=[ 38 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) 39 | for th in [0.5, 0.6, 0.7] 40 | ], 41 | ) 42 | 43 | model.roi_heads.mask_head.conv_norm = "SyncBN" 44 | 45 | # 2conv in RPN: 46 | # https://github.com/tensorflow/tpu/blob/b24729de804fdb751b06467d3dce0637fa652060/models/official/detection/modeling/architecture/heads.py#L95-L97 # noqa: E501, B950 47 | model.proposal_generator.head.conv_dims = [-1, -1] 48 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/mask_rcnn_mvitv2_t_3x.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2.config import LazyCall as L 5 | from detectron2.modeling import MViT 6 | 7 | from ..common.data.coco import dataloader 8 | from ..common.data.constants import constants 9 | from ..common.models.mask_rcnn_fpn import model 10 | from ..common.train import train 11 | 12 | model.backbone.bottom_up = L(MViT)( 13 | embed_dim=96, 14 | depth=10, 15 | num_heads=1, 16 | last_block_indexes=(0, 2, 7, 9), 17 | residual_pooling=True, 18 | 
drop_path_rate=0.2, 19 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 20 | out_features=("scale2", "scale3", "scale4", "scale5"), 21 | ) 22 | model.backbone.in_features = "${.bottom_up.out_features}" 23 | 24 | # Initialization and trainer settings 25 | train.amp.enabled = True 26 | train.ddp.fp16_compression = True 27 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_T_in1k.pyth" 28 | 29 | # 36 epochs 30 | train.max_iter = 67500 31 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.config import LazyCall as L 4 | from detectron2.layers import ShapeSpec 5 | from detectron2.modeling import MViT 6 | from detectron2.modeling.box_regression import Box2BoxTransform 7 | from detectron2.modeling.matcher import Matcher 8 | from detectron2.modeling.roi_heads import ( 9 | CascadeROIHeads, 10 | FastRCNNConvFCHead, 11 | FastRCNNOutputLayers, 12 | ) 13 | from torch import nn 14 | 15 | from ..common.data.coco import dataloader 16 | from ..common.data.constants import constants 17 | from ..common.models.mask_rcnn_fpn import model 18 | from ..common.train import train 19 | 20 | model.backbone.bottom_up = L(MViT)( 21 | embed_dim=96, 22 | depth=24, 23 | num_heads=1, 24 | last_block_indexes=(1, 4, 20, 23), 25 | residual_pooling=True, 26 | drop_path_rate=0.4, 27 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 28 | out_features=("scale2", "scale3", "scale4", "scale5"), 29 | ) 30 | model.backbone.in_features = "${.bottom_up.out_features}" 31 | model.backbone.square_pad = 1024 32 | 33 | # New heads and LN 34 | model.backbone.norm = "LN" # Use LN in FPN 35 | model.roi_heads.box_head.conv_norm = model.roi_heads.mask_head.conv_norm = "LN" 36 | 37 | # 2conv in RPN: 38 | model.proposal_generator.head.conv_dims = [-1, -1] 39 | 40 | # arguments that don't exist for Cascade R-CNN 41 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] 42 | model.roi_heads.update( 43 | _target_=CascadeROIHeads, 44 | box_heads=[ 45 | L(FastRCNNConvFCHead)( 46 | input_shape=ShapeSpec(channels=256, height=7, width=7), 47 | conv_dims=[256, 256, 256, 256], 48 | fc_dims=[1024], 49 | conv_norm="LN", 50 | ) 51 | for _ in range(3) 52 | ], 53 | box_predictors=[ 54 | L(FastRCNNOutputLayers)( 55 | input_shape=ShapeSpec(channels=1024), 56 | test_score_thresh=0.05, 57 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 58 | cls_agnostic_bbox_reg=True, 59 | num_classes="${...num_classes}", 60 | ) 61 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 62 | ], 63 | proposal_matchers=[ 64 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) 65 | for th in [0.5, 0.6, 0.7] 66 | ], 67 | ) 68 | 69 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in21k.pyth" 70 | 71 | # Schedule 72 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 73 | train.max_iter = 184375 74 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_mvitv2_h_in21k_36ep.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import dataloader, model, train 2 | 3 | model.backbone.bottom_up.embed_dim = 192 4 | model.backbone.bottom_up.depth = 80 5 | model.backbone.bottom_up.num_heads = 3 6 | 
model.backbone.bottom_up.last_block_indexes = (3, 11, 71, 79) 7 | model.backbone.bottom_up.drop_path_rate = 0.6 8 | model.backbone.bottom_up.use_act_checkpoint = True 9 | 10 | 11 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_H_in21k.pyth" 12 | 13 | 14 | # 36 epochs 15 | train.max_iter = 67500 16 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import dataloader, model, train 2 | 3 | model.backbone.bottom_up.embed_dim = 144 4 | model.backbone.bottom_up.depth = 48 5 | model.backbone.bottom_up.num_heads = 2 6 | model.backbone.bottom_up.last_block_indexes = (1, 7, 43, 47) 7 | model.backbone.bottom_up.drop_path_rate = 0.5 8 | 9 | 10 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_L_in21k.pyth" 11 | 12 | train.max_iter = train.max_iter // 2 # 100ep -> 50ep 13 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_swin_b_in21k_50ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.modeling import SwinTransformer 3 | 4 | from ..common.data.coco import dataloader 5 | from ..common.train import train 6 | from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import model 7 | 8 | model.backbone.bottom_up = L(SwinTransformer)( 9 | depths=[2, 2, 18, 2], 10 | drop_path_rate=0.4, 11 | embed_dim=128, 12 | num_heads=[4, 8, 16, 32], 13 | ) 14 | model.backbone.in_features = ("p0", "p1", "p2", "p3") 15 | model.backbone.square_pad = 1024 16 | 17 | train.init_checkpoint = ( 18 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window7_224_22k.pth" 19 | ) 20 | # 50 ep = (184375 / 2) iters * 64 images/iter / 118000 images/ep 21 | train.max_iter = 184375 // 2 22 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_swin_l_in21k_50ep.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_swin_b_in21k_50ep import dataloader, model, train 2 | 3 | model.backbone.bottom_up.depths = [2, 2, 18, 2] 4 | model.backbone.bottom_up.drop_path_rate = 0.4 5 | model.backbone.bottom_up.embed_dim = 192 6 | model.backbone.bottom_up.num_heads = [6, 12, 24, 48] 7 | 8 | 9 | train.init_checkpoint = ( 10 | "detectron2://ImageNetPretrained/swin/swin_large_patch4_window7_224_22k.pth" 11 | ) 12 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_vitdet_b_100ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.box_regression import Box2BoxTransform 4 | from detectron2.modeling.matcher import Matcher 5 | from detectron2.modeling.roi_heads import ( 6 | CascadeROIHeads, 7 | FastRCNNConvFCHead, 8 | FastRCNNOutputLayers, 9 | ) 10 | 11 | from .mask_rcnn_vitdet_b_100ep import dataloader, lr_multiplier, model, optimizer, train 12 | 13 | # arguments that don't exist for Cascade R-CNN 14 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] 15 | 16 | model.roi_heads.update( 17 | _target_=CascadeROIHeads, 18 | box_heads=[ 19 
| L(FastRCNNConvFCHead)( 20 | input_shape=ShapeSpec(channels=256, height=7, width=7), 21 | conv_dims=[256, 256, 256, 256], 22 | fc_dims=[1024], 23 | conv_norm="LN", 24 | ) 25 | for _ in range(3) 26 | ], 27 | box_predictors=[ 28 | L(FastRCNNOutputLayers)( 29 | input_shape=ShapeSpec(channels=1024), 30 | test_score_thresh=0.05, 31 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 32 | cls_agnostic_bbox_reg=True, 33 | num_classes="${...num_classes}", 34 | ) 35 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 36 | ], 37 | proposal_matchers=[ 38 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) 39 | for th in [0.5, 0.6, 0.7] 40 | ], 41 | ) 42 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_vitdet_h_75ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 4 | 5 | from ..common.coco_schedule import lr_multiplier_75ep as lr_multiplier 6 | from .cascade_mask_rcnn_vitdet_b_100ep import dataloader, model, optimizer, train 7 | 8 | model.backbone.net.embed_dim = 1280 9 | model.backbone.net.depth = 32 10 | model.backbone.net.num_heads = 16 11 | model.backbone.net.drop_path_rate = 0.5 12 | # 7, 15, 23, 31 for global attention 13 | model.backbone.net.window_block_indexes = ( 14 | list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) 15 | ) 16 | 17 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth?matching_heuristics=True" 18 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 19 | 20 | optimizer.params.lr_factor_func = partial( 21 | get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32 22 | ) 23 | optimizer.params.overrides = {} 24 | optimizer.params.weight_decay_norm = None 25 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_vitdet_l_100ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 4 | 5 | from .cascade_mask_rcnn_vitdet_b_100ep import ( 6 | dataloader, 7 | lr_multiplier, 8 | model, 9 | optimizer, 10 | train, 11 | ) 12 | 13 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth?matching_heuristics=True" 14 | 15 | model.backbone.net.embed_dim = 1024 16 | model.backbone.net.depth = 24 17 | model.backbone.net.num_heads = 16 18 | model.backbone.net.drop_path_rate = 0.4 19 | # 5, 11, 17, 23 for global attention 20 | model.backbone.net.window_block_indexes = ( 21 | list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23)) 22 | ) 23 | 24 | optimizer.params.lr_factor_func = partial( 25 | get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24 26 | ) 27 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/mask_rcnn_vitdet_b_100ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 4 | 5 | from ..common.coco_schedule import lr_multiplier_100ep as lr_multiplier 6 | from ..common.data.coco import dataloader 7 | from ..common.models.mask_rcnn_vitdet import model 8 | from ..common.optim import AdamW as 
optimizer 9 | from ..common.train import train 10 | 11 | # Initialization and trainer settings 12 | train.ddp.fp16_compression = True 13 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth?matching_heuristics=True" 14 | 15 | 16 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 17 | train.max_iter = 184375 18 | 19 | # Layer-wise LR decay for ViT 20 | optimizer.params.lr_factor_func = partial( 21 | get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7 22 | ) 23 | optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} 24 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/mask_rcnn_vitdet_h_75ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 4 | 5 | from ..common.coco_schedule import lr_multiplier_75ep as lr_multiplier 6 | from .mask_rcnn_vitdet_b_100ep import dataloader, model, optimizer, train 7 | 8 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth?matching_heuristics=True" 9 | 10 | model.backbone.net.embed_dim = 1280 11 | model.backbone.net.depth = 32 12 | model.backbone.net.num_heads = 16 13 | model.backbone.net.drop_path_rate = 0.5 14 | # 7, 15, 23, 31 for global attention 15 | model.backbone.net.window_block_indexes = ( 16 | list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) 17 | ) 18 | 19 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 20 | 21 | optimizer.params.lr_factor_func = partial( 22 | get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32 23 | ) 24 | optimizer.params.overrides = {} 25 | optimizer.params.weight_decay_norm = None 26 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/mask_rcnn_vitdet_l_100ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 4 | 5 | from .mask_rcnn_vitdet_b_100ep import dataloader, lr_multiplier, model, optimizer, train 6 | 7 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth?matching_heuristics=True" 8 | 9 | model.backbone.net.embed_dim = 1024 10 | model.backbone.net.depth = 24 11 | model.backbone.net.num_heads = 16 12 | model.backbone.net.drop_path_rate = 0.4 13 | # 5, 11, 17, 23 for global attention 14 | model.backbone.net.window_block_indexes = ( 15 | list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23)) 16 | ) 17 | 18 | optimizer.params.lr_factor_func = partial( 19 | get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24 20 | ) 21 | -------------------------------------------------------------------------------- /coco_rem/data/builtin.py: -------------------------------------------------------------------------------- 1 | """ 2 | Register COCO-ReM instances for training and evaluation. 
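
Typical usage, as a minimal sketch (assuming the default `./datasets` layout
expected by `register_all_coco_rem` below): call `register_all_coco_rem()` once
at startup, after which the splits can be loaded by name, for example through
`detectron2.data.DatasetCatalog.get("coco_rem_val")`.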
3 | """ 4 | 5 | import os 6 | 7 | from detectron2.data.datasets.coco import register_coco_instances 8 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 9 | 10 | _PREDEFINED_SPLITS_COCO_REM = { 11 | "coco_rem_train": ("coco/train2017", "coco_rem/instances_trainrem.json"), 12 | "coco_rem_val": ("coco/val2017", "coco_rem/instances_valrem.json"), 13 | } 14 | 15 | 16 | def register_all_coco_rem(root: str = "datasets"): 17 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_COCO_REM.items(): 18 | # Assume pre-defined datasets live in `./datasets`. 19 | register_coco_instances( 20 | key, 21 | _get_builtin_metadata("coco"), 22 | os.path.join(root, json_file) if "://" not in json_file else json_file, 23 | os.path.join(root, image_root), 24 | ) 25 | -------------------------------------------------------------------------------- /coco_rem/data/lvis.py: -------------------------------------------------------------------------------- 1 | from detectron2.data.datasets.lvis import ( 2 | get_lvis_instances_meta, 3 | register_lvis_instances, 4 | ) 5 | 6 | # This mapping is extracted from the official LVIS mapping: 7 | # https://github.com/lvis-dataset/lvis-api/blob/master/data/coco_to_synset.json 8 | COCO_CATEGORIES_IN_LVIS = [ 9 | {"coco_id": 1, "lvis_id": 793, "synset": "person.n.01"}, 10 | {"coco_id": 2, "lvis_id": 94, "synset": "bicycle.n.01"}, 11 | {"coco_id": 3, "lvis_id": 207, "synset": "car.n.01"}, 12 | {"coco_id": 4, "lvis_id": 703, "synset": "motorcycle.n.01"}, 13 | {"coco_id": 5, "lvis_id": 3, "synset": "airplane.n.01"}, 14 | {"coco_id": 6, "lvis_id": 173, "synset": "bus.n.01"}, 15 | {"coco_id": 7, "lvis_id": 1115, "synset": "train.n.01"}, 16 | {"coco_id": 8, "lvis_id": 1123, "synset": "truck.n.01"}, 17 | {"coco_id": 9, "lvis_id": 118, "synset": "boat.n.01"}, 18 | {"coco_id": 10, "lvis_id": 1112, "synset": "traffic_light.n.01"}, 19 | {"coco_id": 11, "lvis_id": 445, "synset": "fireplug.n.01"}, 20 | {"coco_id": 13, "lvis_id": 1019, "synset": "stop_sign.n.01"}, 21 | {"coco_id": 14, "lvis_id": 766, "synset": "parking_meter.n.01"}, 22 | {"coco_id": 15, "lvis_id": 90, "synset": "bench.n.01"}, 23 | {"coco_id": 16, "lvis_id": 99, "synset": "bird.n.01"}, 24 | {"coco_id": 17, "lvis_id": 225, "synset": "cat.n.01"}, 25 | {"coco_id": 18, "lvis_id": 378, "synset": "dog.n.01"}, 26 | {"coco_id": 19, "lvis_id": 569, "synset": "horse.n.01"}, 27 | {"coco_id": 20, "lvis_id": 943, "synset": "sheep.n.01"}, 28 | {"coco_id": 21, "lvis_id": 80, "synset": "beef.n.01"}, 29 | {"coco_id": 22, "lvis_id": 422, "synset": "elephant.n.01"}, 30 | {"coco_id": 23, "lvis_id": 76, "synset": "bear.n.01"}, 31 | {"coco_id": 24, "lvis_id": 1202, "synset": "zebra.n.01"}, 32 | {"coco_id": 25, "lvis_id": 496, "synset": "giraffe.n.01"}, 33 | {"coco_id": 27, "lvis_id": 34, "synset": "backpack.n.01"}, 34 | {"coco_id": 28, "lvis_id": 1133, "synset": "umbrella.n.01"}, 35 | {"coco_id": 31, "lvis_id": 35, "synset": "bag.n.04"}, 36 | {"coco_id": 32, "lvis_id": 716, "synset": "necktie.n.01"}, 37 | {"coco_id": 33, "lvis_id": 36, "synset": "bag.n.06"}, 38 | {"coco_id": 34, "lvis_id": 474, "synset": "frisbee.n.01"}, 39 | {"coco_id": 35, "lvis_id": 964, "synset": "ski.n.01"}, 40 | {"coco_id": 36, "lvis_id": 976, "synset": "snowboard.n.01"}, 41 | {"coco_id": 37, "lvis_id": 41, "synset": "ball.n.06"}, 42 | {"coco_id": 38, "lvis_id": 611, "synset": "kite.n.03"}, 43 | {"coco_id": 39, "lvis_id": 58, "synset": "baseball_bat.n.01"}, 44 | {"coco_id": 40, "lvis_id": 60, "synset": "baseball_glove.n.01"}, 45 | {"coco_id": 41, 
"lvis_id": 962, "synset": "skateboard.n.01"}, 46 | {"coco_id": 42, "lvis_id": 1037, "synset": "surfboard.n.01"}, 47 | {"coco_id": 43, "lvis_id": 1079, "synset": "tennis_racket.n.01"}, 48 | {"coco_id": 44, "lvis_id": 133, "synset": "bottle.n.01"}, 49 | {"coco_id": 46, "lvis_id": 1190, "synset": "wineglass.n.01"}, 50 | {"coco_id": 47, "lvis_id": 344, "synset": "cup.n.01"}, 51 | {"coco_id": 48, "lvis_id": 469, "synset": "fork.n.01"}, 52 | {"coco_id": 49, "lvis_id": 615, "synset": "knife.n.01"}, 53 | {"coco_id": 50, "lvis_id": 1000, "synset": "spoon.n.01"}, 54 | {"coco_id": 51, "lvis_id": 139, "synset": "bowl.n.03"}, 55 | {"coco_id": 52, "lvis_id": 45, "synset": "banana.n.02"}, 56 | {"coco_id": 53, "lvis_id": 12, "synset": "apple.n.01"}, 57 | {"coco_id": 54, "lvis_id": 912, "synset": "sandwich.n.01"}, 58 | {"coco_id": 55, "lvis_id": 735, "synset": "orange.n.01"}, 59 | {"coco_id": 56, "lvis_id": 154, "synset": "broccoli.n.01"}, 60 | {"coco_id": 57, "lvis_id": 217, "synset": "carrot.n.01"}, 61 | {"coco_id": 59, "lvis_id": 816, "synset": "pizza.n.01"}, 62 | {"coco_id": 60, "lvis_id": 387, "synset": "doughnut.n.02"}, 63 | {"coco_id": 61, "lvis_id": 183, "synset": "cake.n.03"}, 64 | {"coco_id": 62, "lvis_id": 232, "synset": "chair.n.01"}, 65 | {"coco_id": 63, "lvis_id": 982, "synset": "sofa.n.01"}, 66 | {"coco_id": 64, "lvis_id": 837, "synset": "pot.n.04"}, 67 | {"coco_id": 65, "lvis_id": 77, "synset": "bed.n.01"}, 68 | {"coco_id": 67, "lvis_id": 367, "synset": "dining_table.n.01"}, 69 | {"coco_id": 70, "lvis_id": 1097, "synset": "toilet.n.02"}, 70 | {"coco_id": 72, "lvis_id": 1077, "synset": "television_receiver.n.01"}, 71 | {"coco_id": 73, "lvis_id": 631, "synset": "laptop.n.01"}, 72 | {"coco_id": 74, "lvis_id": 705, "synset": "mouse.n.04"}, 73 | {"coco_id": 75, "lvis_id": 881, "synset": "remote_control.n.01"}, 74 | {"coco_id": 76, "lvis_id": 296, "synset": "computer_keyboard.n.01"}, 75 | {"coco_id": 77, "lvis_id": 230, "synset": "cellular_telephone.n.01"}, 76 | {"coco_id": 78, "lvis_id": 687, "synset": "microwave.n.02"}, 77 | {"coco_id": 79, "lvis_id": 739, "synset": "oven.n.01"}, 78 | {"coco_id": 80, "lvis_id": 1095, "synset": "toaster.n.02"}, 79 | {"coco_id": 81, "lvis_id": 961, "synset": "sink.n.01"}, 80 | {"coco_id": 82, "lvis_id": 421, "synset": "electric_refrigerator.n.01"}, 81 | {"coco_id": 84, "lvis_id": 127, "synset": "book.n.01"}, 82 | {"coco_id": 85, "lvis_id": 271, "synset": "clock.n.01"}, 83 | {"coco_id": 86, "lvis_id": 1139, "synset": "vase.n.01"}, 84 | {"coco_id": 87, "lvis_id": 923, "synset": "scissors.n.01"}, 85 | {"coco_id": 88, "lvis_id": 1071, "synset": "teddy.n.01"}, 86 | {"coco_id": 89, "lvis_id": 534, "synset": "hand_blower.n.01"}, 87 | {"coco_id": 90, "lvis_id": 1102, "synset": "toothbrush.n.01"}, 88 | ] 89 | 90 | 91 | def register_cocofied_lvis(): 92 | # COCO-fied LVIS v1 val - instances for COCO classes, masks from LVIS. 
93 | register_lvis_instances( 94 | "lvis_v1_val_cocofied", 95 | get_lvis_instances_meta("lvis_v1_val_cocofied"), 96 | json_file="datasets/lvis/lvis_v1_val_cocofied.json", 97 | image_root="datasets/coco/", 98 | ) 99 | -------------------------------------------------------------------------------- /coco_rem/mask_visualizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import itertools 4 | from typing import Callable, Optional 5 | 6 | import cv2 7 | import matplotlib.colors as mplc 8 | import numpy as np 9 | import pycocotools.mask as mask_util 10 | import torch 11 | from detectron2.utils.visualizer import VisImage 12 | from torch.nn.functional import max_pool2d 13 | 14 | # Nice colors, taken from `colorblind` palette of the `seaborn` library with 15 | # very minor modifications for aesthetics. 16 | NICE_COLORS = [ 17 | (0.0039, 0.4509, 0.6980), # blue 18 | (0.8905, 0.5607, 0.0196), # orange 19 | (0.0078, 0.6196, 0.4509), # green 20 | (0.9400, 0.2200, 0.1000), # red 21 | (0.6500, 0.3500, 0.9000), # purple 22 | (0.6980, 1.0000, 0.3500), # lime green 23 | (0.5019, 0.8705, 0.9176), # cyan 24 | (0.7921, 0.5686, 0.3803), # brown 25 | (0.9843, 0.6862, 0.8941), # pink 26 | (0.9254, 0.8823, 0.2001), # gold 27 | ] 28 | 29 | 30 | def binarize_mask(mask_or_polygons, height: int, width: int): 31 | """ 32 | Convert input masks of any format to a binary mask (np.uint8 array with 1 33 | as foreground and 0 as background). 34 | """ 35 | m = mask_or_polygons 36 | if isinstance(m, dict): 37 | # RLEs 38 | assert "counts" in m and "size" in m 39 | if isinstance(m["counts"], list): # uncompressed RLEs 40 | h, w = m["size"] 41 | assert h == height and w == width 42 | m = mask_util.frPyObjects(m, h, w) 43 | mask = mask_util.decode(m)[:, :] 44 | 45 | if isinstance(m, list): # list[ndarray] 46 | m = mask_util.frPyObjects(m, height, width) 47 | m = mask_util.merge(m) 48 | mask = mask_util.decode(m)[:, :] 49 | 50 | if isinstance(m, np.ndarray): # assumed to be a binary mask 51 | assert m.shape[1] != 2, m.shape 52 | assert m.shape == ( 53 | height, 54 | width, 55 | ), f"mask shape: {m.shape}, target dims: {height}, {width}" 56 | mask = m.astype("uint8") 57 | 58 | return mask 59 | 60 | 61 | def _create_text_labels(classes, class_names, is_crowd=None): 62 | labels = None 63 | if classes is not None: 64 | if class_names is not None and len(class_names) > 0: 65 | labels = [class_names[i] for i in classes] 66 | else: 67 | labels = [str(i) for i in classes] 68 | 69 | if labels is not None and is_crowd is not None: 70 | labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)] 71 | return labels 72 | 73 | 74 | class MaskVisualizer: 75 | """Visualizer for labeled masks of COCO-format instance annotations.""" 76 | 77 | def __init__(self, img_rgb: np.ndarray, class_names: list[str] | None = None): 78 | """ 79 | Args: 80 | img_rgb: a numpy array of shape (H, W, C), where H and W correspond to 81 | the height and width of the image respectively. C is the number of 82 | color channels. The image is required to be in RGB format since that 83 | is a requirement of the Matplotlib library. The image is also expected 84 | to be in the range [0, 255]. 85 | class_names: List of names to associate with object class IDs of masks. 
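
        Example (a minimal usage sketch; assumes COCO-style metadata providing
        `thing_classes` and a Detectron2 dataset dict):

            >>> vis = MaskVisualizer(image_rgb, class_names=metadata.thing_classes)
            >>> out = vis.draw_dataset_dict(dataset_dict)
            >>> out.save("masks_vis.jpg")  # `out` is a detectron2 VisImage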
86 | """ 87 | self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8) 88 | self.class_names = class_names 89 | self.output = VisImage(self.img) 90 | self.cpu_device = torch.device("cpu") 91 | 92 | # too small texts are useless, therefore clamp to 12 93 | self._default_font_size = max( 94 | np.sqrt(self.output.height * self.output.width) // 90, 12 95 | ) 96 | 97 | def draw_dataset_dict( 98 | self, 99 | dic, 100 | draw_labels: bool = True, 101 | label_suffix_formatter: Optional[Callable] = None, 102 | ): 103 | """ 104 | Draw annotations/segmentations in Detectron2 Dataset format. 105 | 106 | Args: 107 | dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. 108 | 109 | Returns: 110 | output (VisImage): image object with visualizations. 111 | """ 112 | annos = dic.get("annotations", None) 113 | if annos: 114 | if "segmentation" in annos[0]: 115 | masks = [x["segmentation"] for x in annos] 116 | else: 117 | masks = None 118 | 119 | if draw_labels: 120 | category_ids = [x["category_id"] for x in annos] 121 | labels = _create_text_labels( 122 | category_ids, 123 | class_names=self.class_names, 124 | is_crowd=[x.get("iscrowd", 0) for x in annos], 125 | ) 126 | 127 | if label_suffix_formatter is not None: 128 | labels = label_suffix_formatter(dic, labels) 129 | else: 130 | labels = None 131 | 132 | self.overlay_instances(labels=labels, masks=masks) 133 | 134 | return self.output 135 | 136 | def overlay_instances(self, labels=None, masks=None, alpha=0.7): 137 | """ 138 | Args: 139 | labels (list[str]): the text to be displayed for each instance. 140 | masks (masks-like object): Supported types are: 141 | 142 | * :class:`detectron2.structures.PolygonMasks`, 143 | :class:`detectron2.structures.BitMasks`. 144 | * list[list[ndarray]]: contains the segmentation masks for all objects in one image. 145 | The first level of the list corresponds to individual instances. The second 146 | level to all the polygon that compose the instance, and the third level 147 | to the polygon coordinates. The third level should have the format of 148 | [x0, y0, x1, y1, ..., xn, yn] (n >= 3). 149 | * list[ndarray]: each ndarray is a binary mask of shape (H, W). 150 | * list[dict]: each dict is a COCO-style RLE. 151 | 152 | Returns: 153 | output (VisImage): image object with visualizations. 154 | """ 155 | num_instances = 0 156 | if masks is not None: 157 | masks = [ 158 | binarize_mask(x, self.output.height, self.output.width) for x in masks 159 | ] 160 | if num_instances: 161 | assert len(masks) == num_instances 162 | else: 163 | num_instances = len(masks) 164 | 165 | if labels is not None: 166 | assert len(labels) == num_instances 167 | 168 | assigned_colors = list( 169 | itertools.islice(itertools.cycle(NICE_COLORS), num_instances) 170 | ) 171 | 172 | if num_instances == 0: 173 | return self.output 174 | 175 | # Display in largest to smallest order to reduce occlusion. 176 | areas = np.asarray([x.sum() for x in masks]) 177 | 178 | sorted_idxs = np.argsort(-areas).tolist() 179 | # Re-order overlapped instances in descending order. 
180 | labels = [labels[k] for k in sorted_idxs] if labels is not None else None 181 | masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None 182 | assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] 183 | 184 | for i in range(num_instances): 185 | color = assigned_colors[i] 186 | text = labels[i] if labels is not None else "" 187 | self.draw_binary_mask(masks[i], color, text=text, alpha=alpha) 188 | 189 | return self.output 190 | 191 | def draw_text(self, text: str, x: float, y: float) -> VisImage: 192 | # fmt: off 193 | self.output.ax.text( 194 | x, y, text, size=self._default_font_size, family="sans-serif", 195 | bbox={"facecolor": "white", "alpha": 1.0, "pad": 1.0, "edgecolor": "none"}, 196 | verticalalignment="top", horizontalalignment="center", 197 | color="black", zorder=10, 198 | ) 199 | # fmt: on 200 | return self.output 201 | 202 | def draw_binary_mask(self, binary_mask, color, text=None, alpha=0.7): 203 | """ 204 | Args: 205 | binary_mask: numpy array of shape (H, W), where H is the image height 206 | and W is the image width. Each value in the array is either a 0 207 | or 1 value of uint8 type. 208 | color: color of the mask. Refer to `matplotlib.colors` for a full list 209 | of formats that are accepted. If None, will pick a random color. 210 | text: A string to draw on the object. 211 | alpha: blending co-efficient. Smaller values => more transparent masks. 212 | 213 | Returns: 214 | output (VisImage): image object with mask drawn. 215 | """ 216 | color = mplc.to_rgb(color) 217 | 218 | mask = binary_mask.astype("uint8") # opencv needs uint8 219 | shape2d = (binary_mask.shape[0], binary_mask.shape[1]) 220 | 221 | # TODO: Use Path/PathPatch to draw vector graphics: 222 | # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon 223 | rgba = np.zeros(shape2d + (4,), dtype="float32") 224 | rgba[:, :, :3] = color 225 | rgba[:, :, 3] = (mask == 1).astype("float32") * alpha 226 | self.output.ax.imshow( 227 | rgba, extent=(0, self.output.width, self.output.height, 0) 228 | ) 229 | 230 | # Find mask boundary using dilation, then visualize as a black border. 231 | mask_tensor = torch.from_numpy(mask).float().unsqueeze(0) 232 | dilated = max_pool2d(mask_tensor, kernel_size=3, stride=1, padding=1) 233 | boundary = (dilated - mask_tensor)[0].numpy() 234 | boundary_rgba = np.zeros(shape2d + (4,), dtype="float32") 235 | boundary_rgba[:, :, 3] = boundary 236 | self.output.ax.imshow( 237 | boundary_rgba, extent=(0, self.output.width, self.output.height, 0) 238 | ) 239 | 240 | if text is not None: 241 | # TODO sometimes drawn on wrong objects. the heuristics here can improve. 242 | _num_cc, cc_labels, stats, _ = cv2.connectedComponentsWithStats( 243 | binary_mask, 8 244 | ) 245 | if stats[1:, -1].size == 0: 246 | return 247 | largest_component_id = np.argmax(stats[1:, -1]) + 1 248 | 249 | # draw text on the largest component, as well as other large components. 
250 | for cid in range(1, _num_cc): 251 | if cid == largest_component_id or stats[cid, -1] > 100000: 252 | center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] 253 | self.draw_text(text, *center) 254 | 255 | return self.output 256 | 257 | def get_output(self): 258 | return self.output 259 | -------------------------------------------------------------------------------- /coco_rem/modeling/convnext.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import partial 4 | 5 | import torch 6 | from detectron2.layers.batch_norm import LayerNorm as LayerNorm2d 7 | from detectron2.modeling.backbone import Backbone 8 | from timm.models.layers import DropPath, trunc_normal_ 9 | from torch import nn 10 | 11 | 12 | class Block(nn.Module): 13 | def __init__(self, dim, drop_path=0.0, layer_scale_init_value=1e-6): 14 | super().__init__() 15 | self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) 16 | self.norm = nn.LayerNorm(dim, eps=1e-6) 17 | 18 | self.pwconv1 = nn.Linear(dim, 4 * dim) 19 | self.act = nn.GELU() 20 | 21 | self.pwconv2 = nn.Linear(4 * dim, dim) 22 | self.gamma = ( 23 | nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True) 24 | if layer_scale_init_value > 0 25 | else None 26 | ) 27 | self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() 28 | 29 | def forward(self, x): 30 | input = x 31 | x = self.dwconv(x) 32 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 33 | x = self.norm(x) 34 | x = self.pwconv1(x) 35 | x = self.act(x) 36 | x = self.pwconv2(x) 37 | if self.gamma is not None: 38 | x = self.gamma * x 39 | x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 40 | 41 | x = input + self.drop_path(x) 42 | return x 43 | 44 | 45 | class ConvNeXt(Backbone): 46 | """ 47 | A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/abs/2201.03545 48 | """ 49 | 50 | def __init__( 51 | self, 52 | in_chans: int = 3, 53 | depths: list[int] = [3, 3, 9, 3], 54 | dims: list[int] = [96, 192, 384, 768], 55 | drop_path_rate: float = 0.0, 56 | layer_scale_init_value: float = 1e-6, 57 | out_features: list[str] | None = None, 58 | ): 59 | """ 60 | Args: 61 | in_chans: Number of input image channels. 62 | depths: Number of blocks at each stage. 63 | dims: Feature dimension at each stage. 64 | drop_path_rate: Stochastic depth rate. 65 | layer_scale_init_value: Init value for Layer Scale. 66 | out_features: Stage numbers of the outputs given to the Neck. 
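
        Example (a minimal sketch with ConvNeXt-T sized hyperparameters):

            >>> backbone = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768],
            ...                     out_features=["res2", "res3", "res4", "res5"])
            >>> feats = backbone(torch.randn(1, 3, 224, 224))
            >>> sorted(feats.keys())  # feature maps at strides 4 / 8 / 16 / 32
            ['res2', 'res3', 'res4', 'res5']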
67 | """ 68 | super().__init__() 69 | 70 | # stem and 3 intermediate downsampling conv layers 71 | self.downsample_layers = nn.ModuleList() 72 | stem = nn.Sequential( 73 | nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), 74 | LayerNorm2d(dims[0], eps=1e-6), 75 | ) 76 | 77 | self.downsample_layers.append(stem) 78 | for i in range(3): 79 | downsample_layer = nn.Sequential( 80 | LayerNorm2d(dims[i], eps=1e-6), 81 | nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2), 82 | ) 83 | self.downsample_layers.append(downsample_layer) 84 | 85 | self.num_layers = len(depths) 86 | num_features = [int(dims[i] * 2**i) for i in range(self.num_layers)] 87 | self.num_features = num_features 88 | self._out_features = out_features 89 | 90 | self._out_feature_strides = {} 91 | self._out_feature_channels = {} 92 | 93 | # 4 feature resolution stages, each consisting of multiple residual blocks 94 | self.stages = nn.ModuleList() 95 | dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] 96 | cur = 0 97 | strides = [4, 4, 4, 4] 98 | for i in range(4): 99 | stage = nn.Sequential( 100 | *[ 101 | Block( 102 | dim=dims[i], 103 | drop_path=dp_rates[cur + j], 104 | layer_scale_init_value=layer_scale_init_value, 105 | ) 106 | for j in range(depths[i]) 107 | ] 108 | ) 109 | self.stages.append(stage) 110 | cur += depths[i] 111 | 112 | self._out_feature_channels[f"res{i + 2}"] = dims[i] 113 | self._out_feature_strides[f"res{i + 2}"] = strides[i] * 2**i 114 | 115 | norm_layer = partial(LayerNorm2d, eps=1e-6) 116 | for i_layer in range(4): 117 | layer = norm_layer(dims[i_layer]) 118 | layer_name = f"norm{i_layer}" 119 | self.add_module(layer_name, layer) 120 | 121 | self.apply(self._init_weights) 122 | 123 | def _init_weights(self, m): 124 | if isinstance(m, (nn.Conv2d, nn.Linear)): 125 | trunc_normal_(m.weight, std=0.02) 126 | nn.init.constant_(m.bias, 0) 127 | 128 | def init_weights(self, pretrained=None): 129 | """Initialize the weights in backbone. 130 | Args: 131 | pretrained (str, optional): Path to pre-trained weights. 132 | Defaults to None. 133 | """ 134 | 135 | def _init_weights(m): 136 | if isinstance(m, nn.Linear): 137 | trunc_normal_(m.weight, std=0.02) 138 | if isinstance(m, nn.Linear) and m.bias is not None: 139 | nn.init.constant_(m.bias, 0) 140 | elif isinstance(m, nn.LayerNorm) or isinstance(m, LayerNorm2d): 141 | nn.init.constant_(m.bias, 0) 142 | nn.init.constant_(m.weight, 1.0) 143 | 144 | self.apply(_init_weights) 145 | 146 | def forward_features(self, x): 147 | outs = {} 148 | for i in range(4): 149 | x = self.downsample_layers[i](x) 150 | x = self.stages[i](x) 151 | 152 | if f"res{i + 2}" in self._out_features: 153 | norm_layer = getattr(self, f"norm{i}") 154 | x_out = norm_layer(x) 155 | out = x_out.contiguous() 156 | outs[f"res{i + 2}"] = out 157 | 158 | return outs 159 | 160 | def forward(self, x): 161 | x = self.forward_features(x) 162 | return x 163 | -------------------------------------------------------------------------------- /coco_rem/modeling/rcnn_refiner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from __future__ import annotations 3 | 4 | import torch 5 | 6 | from detectron2.modeling.meta_arch.rcnn import GeneralizedRCNN 7 | 8 | 9 | class GeneralizedRCNNRefiner(GeneralizedRCNN): 10 | """ 11 | An extension of R-CNN that produces masks conditioned on box prompts. 
This 12 | model skips the region proposal network and box ROI head, running only the 13 | mask head by cropping ROI features using input boxes. 14 | """ 15 | 16 | def forward(self, batched_inputs: list[dict[str, torch.Tensor]]): 17 | assert not self.training, "`GeneralizedRCNNRefiner` only supports inference!" 18 | 19 | # Prepare `detected_instances: list[Instances]` for `inference()` method 20 | # to get mask predictions for ground-truth boxes. 21 | detected_instances = [x.pop("instances") for x in batched_inputs] 22 | for x in detected_instances: 23 | x.pred_classes = x.gt_classes 24 | x.pred_boxes = x.gt_boxes 25 | x.scores = torch.ones_like(x.pred_classes).float() 26 | 27 | return self.inference(batched_inputs, detected_instances) 28 | -------------------------------------------------------------------------------- /coco_rem/modeling/sam_refiner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from __future__ import annotations 7 | 8 | import einops as E 9 | import torch 10 | from segment_anything import sam_model_registry 11 | from segment_anything.utils import amg 12 | from torch import nn 13 | 14 | 15 | class SamRefiner(nn.Module): 16 | """ 17 | SamRefiner: An extension of SAM that refines (low-quality) input masks via 18 | iteratively prompting boxes and points. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | arch: str, 24 | checkpoint: str, 25 | num_extra_points: int = 2, 26 | num_trials: int = 10, 27 | box_only_ids: list[int] = [], 28 | min_mask_region_area: int = 100, 29 | ): 30 | """ 31 | Args: 32 | arch: SAM image encoder architecture (vit_b, vit_l, vit_h). 33 | checkpoint: Path to .pth file containing pre-trained SAM weights. 34 | num_extra_points: Number of extra points to iteratively prompt SAM 35 | with, after the initial box prompt. Points are sampled from the 36 | error region (bitwise XOR) between SAM prediction and ground-truth. 37 | num_trials: Number of refinement trials per instance mask, to improve 38 | the overall mask quality by ensembling. 39 | box_only_ids: Category IDs for which only box prompts will used. 40 | min_mask_region_area: If >0, postprocessing will be applied to remove 41 | islands and holes in masks with area smaller than this value. 42 | However, masks smaller than `10 * min_mask_region_area` will not 43 | remain unchanged to avoid removing useful details in tiny masks. 44 | """ 45 | super().__init__() 46 | 47 | # Initialize SAM, freeze parameters, and transfer them here. 
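        # (`sam_model_registry` is the builder dict from the `segment_anything`
        # package; keys such as "vit_b" / "vit_l" / "vit_h" map to functions that
        # construct SAM and load the given checkpoint.)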
48 | _sam = sam_model_registry[arch](checkpoint) 49 | for param in _sam.parameters(): 50 | param.requires_grad = False 51 | 52 | self.image_encoder = _sam.image_encoder 53 | self.prompt_encoder = _sam.prompt_encoder 54 | self.mask_decoder = _sam.mask_decoder 55 | self.img_size = _sam.image_encoder.img_size # 1024 pixels 56 | 57 | self.register_buffer("pixel_mean", _sam.pixel_mean) 58 | self.register_buffer("pixel_std", _sam.pixel_std) 59 | 60 | self.num_extra_points = num_extra_points 61 | self.num_trials = num_trials 62 | self.box_only_ids = box_only_ids 63 | self.min_mask_region_area = min_mask_region_area 64 | 65 | @torch.no_grad() 66 | def forward( 67 | self, 68 | image: torch.Tensor, 69 | masks: torch.Tensor, 70 | category_ids: list[int], 71 | original_size: tuple[int, int], 72 | ) -> torch.Tensor: 73 | """ 74 | Regenerate an input mask by iteratively prompting points to SAM, same as 75 | the training procedure of SAM. This is done for multiple trials and masks 76 | are combining by averaging and thresholding in order to reduce variance. 77 | """ 78 | 79 | # Normalize pixel values and pad to a square input. 80 | input_size = image.shape[-2:] 81 | image = (image[None, ...] - self.pixel_mean) / self.pixel_std 82 | padh = self.img_size - image.shape[-2] 83 | padw = self.img_size - image.shape[-1] 84 | image = nn.functional.pad(image, (0, padw, 0, padh)) 85 | 86 | image_embeddings = self.image_encoder(image) 87 | all_masks = masks # Rename for convenience. 88 | 89 | all_refined_masks = [] 90 | for src_mask, category_id in zip(all_masks, category_ids): 91 | xp = 0 if category_id in self.box_only_ids else self.num_extra_points 92 | 93 | # Repeat a single mask `num_trials` times to perform refinement trials 94 | # within the same batch. 95 | src_mask = E.repeat(src_mask, "h w -> n h w", n=self.num_trials) 96 | 97 | box_prompt = self._get_box_prompt(src_mask) 98 | 99 | # Iteratively prompt SAM with points sampled from error regions of 100 | # predicted masks. This is same as SAM's training procedure. The first 101 | # iteration will only use a box prompt. 102 | point_prompts, mask_prompt = None, None 103 | 104 | for _ in range(xp + 1): 105 | # Pass all prompts: points, initial box, logits from prev step. 106 | sparse_embeddings, dense_embeddings = self.prompt_encoder( 107 | point_prompts, box_prompt, mask_prompt 108 | ) 109 | 110 | low_res_masks, _ = self.mask_decoder( 111 | image_embeddings=image_embeddings, 112 | image_pe=self.prompt_encoder.get_dense_pe(), 113 | sparse_prompt_embeddings=sparse_embeddings, 114 | dense_prompt_embeddings=dense_embeddings, 115 | multimask_output=False, 116 | ) 117 | refined_masks = nn.functional.interpolate( 118 | low_res_masks, 119 | (self.img_size, self.img_size), 120 | mode="bilinear", 121 | align_corners=False, 122 | ) 123 | refined_masks = refined_masks[..., : input_size[0], : input_size[1]] 124 | 125 | # Use source mask if SAM returned empty mask (happens for tiny boxes). 126 | if (refined_masks > 0).sum() == 0: 127 | refined_masks = src_mask[:, None, ...].float() 128 | 129 | # Update point prompts and mask prompt for next iteration. 130 | point_prompts = sample_point_from_error_region( 131 | src_mask, refined_masks[:, 0], point_prompts 132 | ) 133 | mask_prompt = low_res_masks 134 | 135 | # Resize the refine masks to original size, then ensemble the trials 136 | # by thresholding at zero, then taking a majority vote. 
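            # (Concretely, with the default num_trials = 10 a pixel survives the
            # majority vote only if at least 6 of the 10 thresholded trial masks
            # mark it as foreground, since the mean over trials must exceed 0.5.)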
137 | refined_masks = nn.functional.interpolate( 138 | refined_masks, original_size, mode="bilinear", align_corners=False 139 | ) 140 | refined_masks = (refined_masks > 0).float() 141 | refined_mask = E.reduce(refined_masks, "n 1 h w -> h w", "mean") 142 | refined_mask = refined_mask > 0.5 143 | 144 | # Remove spurious islands/holes for large enough masks. 145 | _area = self.min_mask_region_area 146 | if _area > 0 and refined_mask.sum() > 10 * _area: 147 | _mask = refined_mask.cpu().numpy() 148 | 149 | _mask, _ = amg.remove_small_regions(_mask, _area, mode="holes") 150 | _mask, _ = amg.remove_small_regions(_mask, _area, mode="islands") 151 | refined_mask = torch.from_numpy(_mask).to(refined_mask.device) 152 | 153 | all_refined_masks.append(refined_mask) 154 | 155 | all_refined_masks = torch.stack(all_refined_masks) 156 | return all_refined_masks 157 | 158 | def _get_box_prompt(self, mask: torch.Tensor): 159 | """ 160 | Make a box prompt to SAM, which is a bounding box of mask that is expanded 161 | using random noise, same as SAM's training procedure. 162 | 163 | Noise values drawn from Gaussian distributions having zero mean and 164 | standard deviation equal to 10% of box edge size, up to maximum 10 pixels. 165 | """ 166 | box_prompt = amg.batched_mask_to_box(mask.bool()).float() 167 | 168 | box_w = box_prompt[:, 2] - box_prompt[:, 0] 169 | box_h = box_prompt[:, 3] - box_prompt[:, 1] 170 | noise_std = torch.stack([box_w, box_h, box_w, box_h], dim=1) 171 | noise_std = torch.clamp(noise_std * 0.1, max=10.0) 172 | noise_mean = torch.zeros_like(box_prompt) 173 | 174 | random_noise = torch.normal(noise_mean, noise_std) 175 | 176 | box_prompt[:, :2] = box_prompt[:, :2] - random_noise[:, :2].abs() 177 | box_prompt[:, 2:] = box_prompt[:, 2:] + random_noise[:, 2:].abs() 178 | box_prompt = box_prompt.clamp(min=0.0, max=self.img_size - 1) 179 | return box_prompt 180 | 181 | 182 | def sample_point_from_error_region( 183 | reference_masks: torch.Tensor, 184 | predicted_masks: torch.Tensor | None = None, 185 | previous_prompts: tuple[torch.Tensor, torch.Tensor] | None = None, 186 | ) -> tuple[torch.Tensor, torch.Tensor]: 187 | """ 188 | Sample random points from the error regions between some reference masks 189 | (e.g. ground-truth) and predicted masks by SAM. Newly sampled points are 190 | labeled foreground (1) or background (0) depending on the pixel value in 191 | reference mask. This function simulates interactive segmentation setup for 192 | training SAM, as described in Segment Anything paper. 193 | 194 | Args: 195 | reference_masks: Batch of masks as a tensor of shape `(B, H, W)` containing 196 | pixel values in `{1, 0}` or `{True, False}` denoting foreground region. 197 | predicted_masks: Batch of masks predicted by SAM having same shape as the 198 | reference masks. This tensor may have real-valued logits, which will 199 | be internally binarized by thresholding at 0. 200 | previous_prompts: Optional tuple of `(point_coords, point_labels)` giving 201 | point prompts to SAM used in previous interactive iterations. 202 | 203 | Return: 204 | next_prompts: Tuple of `(point_coords, point_labels)` with newly sampled 205 | point co-ordinates and labels appended to `previous_prompts`. 206 | """ 207 | # If predicted masks are not provided, assume that SAM predicted an empty mask. 208 | # This lets us sample a random point from anywhere inside the reference masks. 
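    # (One point per mask pair is drawn from the XOR region: a pixel SAM missed
    # has reference value 1 and is prompted as foreground, while a false-positive
    # pixel has reference value 0 and is prompted as background.)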
209 | if predicted_masks is None: 210 | predicted_masks = torch.zeros_like(reference_masks) 211 | 212 | points, point_labels = [], [] 213 | for ref_mask, pr_mask in zip(reference_masks, predicted_masks): 214 | # Sample from the error region between given masks. 215 | error_region = torch.logical_xor(ref_mask > 0, pr_mask > 0) 216 | yx_choices = error_region.nonzero() 217 | 218 | # If there is no error region, sample from anywhere in GT mask. 219 | if len(yx_choices) == 0: 220 | yx_choices = ref_mask.nonzero() 221 | 222 | if len(yx_choices) == 0: 223 | yx_choices = torch.zeros((1, 2), device=ref_mask.device).long() 224 | 225 | idx = torch.randint(len(yx_choices), size=(1,)).item() 226 | point_xy = yx_choices[idx, [1, 0]] 227 | point_label = ref_mask[point_xy[1], point_xy[0]] 228 | 229 | points.append(point_xy) 230 | point_labels.append(point_label) 231 | 232 | points = E.rearrange(torch.stack(points), "b xy -> b 1 xy") 233 | point_labels = E.rearrange(torch.stack(point_labels).long(), "b -> b 1") 234 | 235 | # Append currently sampled points to previous prompts. 236 | if previous_prompts is not None: 237 | previous_points, previous_labels = previous_prompts 238 | points = torch.cat([previous_points, points], dim=1) 239 | point_labels = torch.cat([previous_labels, point_labels], dim=1) 240 | 241 | return (points, point_labels) 242 | -------------------------------------------------------------------------------- /coco_rem/trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from __future__ import annotations 3 | 4 | import time 5 | from contextlib import nullcontext 6 | 7 | import torch 8 | from detectron2.engine import SimpleTrainer 9 | from detectron2.utils.events import get_event_storage 10 | from torch.cuda.amp import GradScaler, autocast 11 | from torch.nn.parallel import DistributedDataParallel 12 | 13 | 14 | class AMPWithGradAccumTrainer(SimpleTrainer): 15 | """ 16 | Like :class:`SimpleTrainer`, but uses PyTorch's native automatic mixed precision 17 | in the training loop and gradient accumulation after every `N` batches. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | model, 23 | data_loader, 24 | optimizer, 25 | gather_metric_period: int = 1, 26 | grad_scaler: GradScaler | None = None, 27 | precision: torch.dtype = torch.float16, 28 | log_grad_scaler: bool = False, 29 | grad_accum_steps: int = 1, 30 | ): 31 | """ 32 | Args: 33 | model, data_loader, optimizer, gather_metric_period: 34 | same as in :class:`SimpleTrainer`. 35 | grad_scaler: torch GradScaler to automatically scale gradients. 36 | precision: torch.dtype as the target precision to cast to in computations. 37 | grad_accum_steps: Number of gradient accumulation steps. 38 | """ 39 | unsupported = ( 40 | "AMPTrainer does not support single-process multi-device training!" 41 | ) 42 | if isinstance(model, DistributedDataParallel): 43 | assert not (model.device_ids and len(model.device_ids) > 1), unsupported 44 | 45 | super().__init__(model, data_loader, optimizer, gather_metric_period) 46 | 47 | if grad_scaler is None: 48 | grad_scaler = GradScaler() 49 | self.grad_scaler = grad_scaler 50 | self.precision = precision 51 | self.log_grad_scaler = log_grad_scaler 52 | 53 | assert grad_accum_steps >= 1, "grad_accum_steps must be >= 1." 
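The loss normalization used by `run_step` below (each micro-batch loss is divided by `grad_accum_steps` before `backward`) makes N accumulated backward passes match a single pass over the averaged loss; a small, hedged numeric sketch with made-up losses:

```python
import torch

x = torch.ones(4, requires_grad=True)
losses = [x.sum() * w for w in (1.0, 2.0, 3.0, 4.0)]  # pretend per-micro-batch losses

for loss in losses:          # accumulate over 4 micro-batches
    (loss / 4).backward()    # normalize each loss by the number of accumulation steps

# Gradient equals that of the averaged loss: (1 + 2 + 3 + 4) / 4 = 2.5 per element.
assert torch.allclose(x.grad, torch.full_like(x, 2.5))
```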
54 | self.grad_accum_steps = grad_accum_steps 55 | self.grad_sync_manager = _GradAccumSyncManager(model, grad_accum_steps) 56 | 57 | def run_step(self): 58 | """ 59 | Implement the AMP training logic along with gradient accumulation. 60 | """ 61 | assert self.model.training, "[AMPTrainer] model was changed to eval mode!" 62 | 63 | start = time.perf_counter() 64 | self.optimizer.zero_grad() 65 | 66 | # Record data loading time for all batches during gradient accumulation. 67 | total_data_time = 0.0 68 | prev_data_time = start 69 | 70 | for _ in range(self.grad_accum_steps): 71 | # Load batch and accumulate total time to load all batches throughout 72 | # all steps of gradient accumulation. 73 | data = next(self._data_loader_iter) 74 | current_data_time = time.perf_counter() 75 | total_data_time += current_data_time - prev_data_time 76 | prev_data_time = current_data_time 77 | 78 | with self.grad_sync_manager, autocast(dtype=self.precision): 79 | loss_dict = self.model(data) 80 | if isinstance(loss_dict, torch.Tensor): 81 | losses = loss_dict 82 | loss_dict = {"total_loss": loss_dict} 83 | else: 84 | losses = sum(loss_dict.values()) 85 | 86 | normalized_losses = losses / self.grad_accum_steps 87 | self.grad_scaler.scale(normalized_losses).backward() 88 | 89 | if self.log_grad_scaler: 90 | storage = get_event_storage() 91 | storage.put_scalar("[metric]grad_scaler", self.grad_scaler.get_scale()) 92 | 93 | self.after_backward() 94 | 95 | if self.async_write_metrics: 96 | # write metrics asynchronically 97 | self.concurrent_executor.submit( 98 | self._write_metrics, loss_dict, total_data_time, iter=self.iter 99 | ) 100 | else: 101 | self._write_metrics(loss_dict, total_data_time) 102 | 103 | self.grad_scaler.step(self.optimizer) 104 | self.grad_scaler.update() 105 | 106 | def state_dict(self): 107 | ret = super().state_dict() 108 | ret["grad_scaler"] = self.grad_scaler.state_dict() 109 | return ret 110 | 111 | def load_state_dict(self, state_dict): 112 | super().load_state_dict(state_dict) 113 | self.grad_scaler.load_state_dict(state_dict["grad_scaler"]) 114 | 115 | 116 | class _GradAccumSyncManager: 117 | """ 118 | Distributed training with gradient accumulation can cause huge slowdowns if 119 | gradient synchronization is not done properly. This context manager does it. 120 | When using DDP and accumulation for `N` steps, gradients are not averaged 121 | across process for first `N - 1` steps. This context manager behaves as 122 | a no-op (`nullcontext`) when any of these conditions are true: 123 | 124 | - Training with single GPU or CPU only (`model` is not DDP object) 125 | - DDP with static graph (see https://github.com/pytorch/pytorch/issues/80832) 126 | - No gradient accumulation across multiple steps (`num_steps = 1`) 127 | """ 128 | 129 | def __init__(self, model, num_steps: int): 130 | """ 131 | Args: 132 | model: PyTorch module that is being trained with gradient accumulation. 133 | num_steps: Number of batches processed to accumulate gradients. 
134 | """ 135 | self.num_steps = num_steps 136 | self.step = 0 137 | 138 | # DDP's `no_sync()` is a single-use context manager, so store a factory and 139 | # create a fresh context whenever gradient synchronization must be skipped. 140 | if isinstance(model, DistributedDataParallel) and not model.static_graph: 141 | self._no_sync_factory = model.no_sync 142 | else: 143 | self._no_sync_factory = nullcontext 144 | 145 | def __enter__(self): 146 | # Skip gradient sync for all but the last accumulation step. The inner 147 | # context must be entered explicitly for `no_sync` to take effect. 148 | if self.step < self.num_steps - 1: 149 | self._active = self._no_sync_factory() 150 | else: 151 | self._active = nullcontext() 152 | return self._active.__enter__() 153 | 154 | def __exit__(self, *args, **kwargs): 155 | self.step = (self.step + 1) % self.num_steps 156 | return self._active.__exit__(*args, **kwargs) 157 | -------------------------------------------------------------------------------- /images/coco_rem_example_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdexd/coco-rem/73e38364a787b34cbcd846739f196f066430279b/images/coco_rem_example_1.jpg -------------------------------------------------------------------------------- /images/coco_rem_example_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdexd/coco-rem/73e38364a787b34cbcd846739f196f066430279b/images/coco_rem_example_2.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fvcore==0.1.5.post20221221 2 | hydra-core>=1.1 3 | numpy==1.24.1 4 | omegaconf>=2.1 5 | pycocotools>=2.0 6 | einops>=0.6 7 | wget>=3.0 8 | -------------------------------------------------------------------------------- /scripts/correct_labeling_errors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add and remove a few instances from a given COCO JSON, and mark a few other 3 | instances as 'crowd' objects because their masks cover multiple instances. 4 | """ 5 | 6 | import argparse 7 | import json 8 | 9 | import torch 10 | from pycocotools import mask as mask_utils 11 | from segment_anything.utils import amg 12 | 13 | import coco_rem.data.manual_rem as inv 14 | 15 | 16 | parser = argparse.ArgumentParser(description=__doc__) 17 | parser.add_argument( 18 | "--input", 19 | default="datasets/coco_rem/instances_valrem_interim.json", 20 | help="COCO-ReM JSON file to apply all manual corrections.", 21 | ) 22 | parser.add_argument( 23 | "--output", required=True, help="Path to save output annotations JSON." 24 | ) 25 | 26 | 27 | def main(_A: argparse.Namespace): 28 | coco_json = json.load(open(_A.input)) 29 | print(f"Number of instances in input JSON: {len(coco_json['annotations'])}") 30 | 31 | # ------------------------------------------------------------------------ 32 | # Step 1: Remove some instances. 33 | remove_info_tuples = [ 34 | (x["image_id"], x["source"], x["source_id"]) for x in inv.INSTANCES_TO_REMOVE 35 | ] 36 | coco_json["annotations"] = [ 37 | x 38 | for x in coco_json["annotations"] 39 | if (x["image_id"], x["source"], x["source_id"]) not in remove_info_tuples 40 | ] 41 | 42 | num_instances = len(coco_json["annotations"]) 43 | print(f"Removed a few instances; updated JSON has {num_instances} instances.") 44 | 45 | # ------------------------------------------------------------------------ 46 | # Step 2: Set 'iscrowd = 1' for a few instances. 
47 | crowd_info_tuples = [ 48 | (x["image_id"], x["source"], x["source_id"]) for x in inv.INSTANCES_TO_CROWD 49 | ] 50 | for ann in coco_json["annotations"]: 51 | if (ann["image_id"], ann["source"], ann["source_id"]) in crowd_info_tuples: 52 | ann["iscrowd"] = 1 53 | 54 | print(f"Set 'iscrowd = 1' for {len(inv.INSTANCES_TO_CROWD)} instances.") 55 | 56 | # ------------------------------------------------------------------------ 57 | # Step 3: Add some instances. 58 | for idx, ann in enumerate(inv.INSTANCES_TO_ADD): 59 | # Convert compressed RLE to mask. 60 | binary_mask = mask_utils.decode(ann["segmentation"]) 61 | binary_mask = torch.from_numpy(binary_mask) 62 | 63 | # Convert torch tensor to uncompressed RLE. 64 | ann["segmentation"] = amg.mask_to_rle_pytorch(binary_mask[None, ...])[0] 65 | 66 | # Fill other attributes for the annotation - a unique ID, source, bbox, 67 | # area, and `iscrowd = 0`. 68 | bbox_xyxy = amg.batched_mask_to_box(binary_mask[None, ...])[0] 69 | 70 | # Convert bounding box from XYXY to XYWH format. 71 | x1, y1, x2, y2 = bbox_xyxy.tolist() 72 | ann["bbox"] = [x1, y1, x2 - x1 + 1, y2 - y1 + 1] 73 | 74 | ann["area"] = amg.area_from_rle(ann["segmentation"]) 75 | ann["id"] = 2024000000 + idx 76 | ann["source_id"] = ann["id"] 77 | ann["source"] = "manual" 78 | ann["iscrowd"] = 0 79 | 80 | coco_json["annotations"].append(ann) 81 | 82 | num_instances = len(coco_json["annotations"]) 83 | print(f"Added few instances, updated JSON has {num_instances} instances.") 84 | 85 | json.dump(coco_json, open(_A.output, "w")) 86 | print(f"Saved the updated annotations JSON at {_A.output}!") 87 | 88 | 89 | if __name__ == "__main__": 90 | args = parser.parse_args() 91 | main(args) 92 | -------------------------------------------------------------------------------- /scripts/merge_instances.py: -------------------------------------------------------------------------------- 1 | """ 2 | Merge LVIS instance annotations for COCO categories (a.k.a "COCO-fied LVIS") into 3 | the original COCO instance annotations. 4 | 5 | COCO instance annotations are inconsistent, sometimes covering multiple objects 6 | into one mask. LVIS offers a much stronger guarantee that instances are labeled 7 | individually and exhaustively. For any `(image, category)` pair, if COCO-fied LVIS 8 | has instance annotations, then they replace the corresponding COCO annotations. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import argparse 14 | import copy 15 | import json 16 | from collections import defaultdict 17 | 18 | from coco_rem.data.lvis import COCO_CATEGORIES_IN_LVIS 19 | 20 | parser = argparse.ArgumentParser(description=__doc__) 21 | _AA = parser.add_argument 22 | _AA("--coco-json", help="Path to COCO annotations JSON file.") 23 | _AA( 24 | "--lvis-json", 25 | nargs=2, 26 | help="Paths to LVIS train and val JSON files.", 27 | default=["datasets/lvis/lvis_v1_train.json", "datasets/lvis/lvis_v1_val.json"], 28 | ) 29 | _AA("--split", choices=["train", "val"], help="Which dataset split to pre-process?") 30 | _AA("--output", required=True, help="Path to save the output annotations JSON.") 31 | 32 | 33 | def make_cocofied_lvis(lvis_json_paths: list[str], split: str): 34 | """ 35 | Load LVIS instance annotations and filter them to keep instance annotations 36 | of the COCO categories for all images belonging to a COCO split (train/val). 37 | Category IDs in the output JSON are same as COCO IDs. 
38 | """ 39 | 40 | lvis_images, lvis_annos = [], [] 41 | for _path in lvis_json_paths: 42 | lvis_json = json.load(open(_path)) 43 | lvis_images.extend(lvis_json.pop("images")) 44 | lvis_annos.extend(lvis_json.pop("annotations")) 45 | 46 | # LVIS train/val splits are different than COCO (but in total, they cover the 47 | # same set of images). So we load both train and val annotations, then retain 48 | # images and their instances of the desired COCO split. 49 | keep_ids = set([x["id"] for x in lvis_images if split in x["coco_url"]]) 50 | lvis_images = [x for x in lvis_images if x["id"] in keep_ids] 51 | lvis_annos = [x for x in lvis_annos if x["image_id"] in keep_ids] 52 | 53 | # Replace the category ID in instance annotation (LVIS -> COCO), and remove 54 | # LVIS instances that do not represent COCO categories. 55 | lvis_to_coco_id = {x["lvis_id"]: x["coco_id"] for x in COCO_CATEGORIES_IN_LVIS} 56 | lvis_annos = [x for x in lvis_annos if x["category_id"] in lvis_to_coco_id] 57 | for ann in lvis_annos: 58 | ann["category_id"] = lvis_to_coco_id[ann["category_id"]] 59 | 60 | # Replace category IDs in the "negative categories" list per image, like above. 61 | for image in lvis_images: 62 | for key in ["not_exhaustive_category_ids", "neg_category_ids"]: 63 | image[key] = [x for x in image[key] if x in lvis_to_coco_id] 64 | image[key] = [lvis_to_coco_id[x] for x in image[key]] 65 | 66 | # Transfer metadata from original LVIS json to COCOfied LVIS json. 67 | cocofied_lvis = copy.deepcopy(lvis_json) 68 | cocofied_lvis["images"] = lvis_images 69 | cocofied_lvis["annotations"] = lvis_annos 70 | 71 | # Update category IDs of LVIS categories. 72 | cocofied_lvis["categories"] = [ 73 | x for x in cocofied_lvis["categories"] if x["id"] in lvis_to_coco_id 74 | ] 75 | for ann in cocofied_lvis["categories"]: 76 | ann["id"] = lvis_to_coco_id[ann["id"]] 77 | 78 | print(f"COCO-fied LVIS stats for COCO {split} split:") 79 | print(f" - Number of images = {len(lvis_images)}") 80 | print(f" - Number of annotations = {len(lvis_annos)}") 81 | 82 | return cocofied_lvis 83 | 84 | 85 | def main(_A: argparse.Namespace): 86 | coco_json = json.load(open(_A.coco_json)) 87 | lvis_json = make_cocofied_lvis(_A.lvis_json, _A.split) 88 | 89 | # Make a mapping from `(image_id, category_id) -> list[instances]` for both, 90 | # COCO and LVIS. 91 | coco_instances_dict = defaultdict(list) 92 | for ann in coco_json["annotations"]: 93 | # Mark the source of every annotation before merging. 94 | ann["source"] = "coco" 95 | ann["source_id"] = ann["id"] 96 | coco_instances_dict[(ann["image_id"], ann["category_id"])].append(ann) 97 | 98 | lvis_instances_dict = defaultdict(list) 99 | for ann in lvis_json["annotations"]: 100 | ann["source"] = "lvis" 101 | ann["source_id"] = ann["id"] 102 | lvis_instances_dict[(ann["image_id"], ann["category_id"])].append(ann) 103 | 104 | # ------------------------------------------------------------------------ 105 | # For val set, remove all COCO-fied LVIS annotations for `(image, category)` 106 | # pair if instances are not annotated exhaustively. 
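To make the replacement rule applied further below concrete, here is a toy, hedged example for one `(image, category)` pair (annotation dicts abbreviated; the scenario is invented): COCO drew a single mask over a group of three donuts, while COCO-fied LVIS annotated them individually, so the LVIS instances win.

```python
anns_in_coco = [{"id": 1, "source": "coco"}]
anns_in_lvis = [{"id": 7, "source": "lvis"}, {"id": 8, "source": "lvis"}, {"id": 9, "source": "lvis"}]

merged = anns_in_lvis if len(anns_in_lvis) > len(anns_in_coco) else anns_in_coco
assert [a["source"] for a in merged] == ["lvis", "lvis", "lvis"]
```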
107 | if _A.split == "val": 108 | _remove = [ 109 | (image_info["id"], category_id) 110 | for image_info in lvis_json["images"] 111 | for category_id in image_info["not_exhaustive_category_ids"] 112 | ] 113 | lvis_instances_dict = { 114 | k: v for k, v in lvis_instances_dict.items() if k not in _remove 115 | } 116 | 117 | # ------------------------------------------------------------------------ 118 | # If `(image, category)` tuple has more LVIS instances than COCO instances 119 | # then all COCO instances will be replaced by LVIS instances. 120 | merged_annotations = [] 121 | for (image_id, category_id), anns_in_coco in coco_instances_dict.items(): 122 | anns_in_lvis = lvis_instances_dict.get((image_id, category_id), []) 123 | 124 | if len(anns_in_lvis) > len(anns_in_coco): 125 | merged_annotations.extend(anns_in_lvis) 126 | else: 127 | merged_annotations.extend(anns_in_coco) 128 | 129 | # Some `(image, category)` instances of LVIS are completely absent in COCO. 130 | # Add all of these while merging. 131 | for (image_id, category_id), anns_in_lvis in lvis_instances_dict.items(): 132 | if (image_id, category_id) not in coco_instances_dict: 133 | merged_annotations.extend(anns_in_lvis) 134 | 135 | coco_json["annotations"] = merged_annotations 136 | 137 | # Re-assign annotation IDs after merging. 138 | image_id_to_anns_coco = defaultdict(list) 139 | for ann in coco_json["annotations"]: 140 | image_id_to_anns_coco[ann["image_id"]].append(ann) 141 | 142 | for image_id, anns_in_coco in image_id_to_anns_coco.items(): 143 | for idx, ann in enumerate(anns_in_coco): 144 | ann["id"] = image_id * 1000 + idx 145 | 146 | # ------------------------------------------------------------------------ 147 | # Calculate number of annotations sourced from COCO/LVIS. 148 | num_coco_src = len([x for x in merged_annotations if x["source"] == "coco"]) 149 | num_lvis_src = len([x for x in merged_annotations if x["source"] == "lvis"]) 150 | 151 | print(f"Final COCO {_A.split} split statistics after merging:") 152 | print(f" - Number of images = {len(coco_json['images'])}") 153 | print(f" - Number of annotations = {len(coco_json['annotations'])}") 154 | print(f" - Annotations from COCO = {num_coco_src}") 155 | print(f" - Annotations from LVIS = {num_lvis_src}") 156 | 157 | json.dump(coco_json, open(_A.output, "w")) 158 | print(f"Saved the merged annotations JSON to {_A.output}") 159 | 160 | 161 | if __name__ == "__main__": 162 | _A = parser.parse_args() 163 | 164 | # Log all command-line arguments. 165 | print("Running with arguments:") 166 | for key, value in vars(_A).items(): 167 | print(f"{key:<10}: {value}") 168 | 169 | main(_A) 170 | -------------------------------------------------------------------------------- /scripts/refine_boundaries.py: -------------------------------------------------------------------------------- 1 | """ 2 | Refine mask boundaries of input COCO JSON to obtain COCO-ReM. Refinement is done 3 | using the `SamRefiner` module in this package. 
4 | """ 5 | 6 | from __future__ import annotations 7 | 8 | import argparse 9 | import json 10 | import os 11 | from collections import defaultdict 12 | 13 | import torch 14 | from detectron2 import engine 15 | from detectron2.data.detection_utils import read_image 16 | from detectron2.data.transforms import ResizeShortestEdge 17 | from detectron2.structures import polygons_to_bitmask 18 | from detectron2.utils import comm 19 | from segment_anything.utils import amg 20 | from tqdm import tqdm 21 | 22 | from coco_rem.modeling.sam_refiner import SamRefiner 23 | 24 | # Add documentation of `SamRefiner` to this script documentation, so argparse can 25 | # display it with `--help`. 26 | __doc__ += f"\n\n{SamRefiner.__doc__}\n{SamRefiner.__init__.__doc__}" 27 | 28 | # fmt: off 29 | parser = argparse.ArgumentParser( 30 | description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter 31 | ) 32 | _AA = parser.add_argument 33 | _AA("--input-json", required=True, help="Path to COCO annotations JSON.") 34 | _AA("--image-dir", default="datasets/coco/val2017", help="COCO image directory.") 35 | _AA("--num-gpus", type=int, default=0, help="Number of GPUs for parallelization.") 36 | _AA("--output", required=True, help="Path to save output annotations JSON.") 37 | 38 | group = parser.add_argument_group("Input arguments to `SamRefiner`.") 39 | group.add_argument("--arch", default="vit_h", choices=["vit_b", "vit_l", "vit_h"]) 40 | group.add_argument("--checkpoint", default="checkpoints/sam_vit_h_4b8939.pth") 41 | group.add_argument("--num-extra-points", type=int, default=2) 42 | group.add_argument("--num-trials", type=int, default=10) 43 | group.add_argument( 44 | "--box-only-names", nargs="+", 45 | default=["bed", "bicycle", "bowl", "dining table", "motorcycle", "scissors"], 46 | help="COCO category names for which we only use box prompts.", 47 | ) 48 | # fmt: on 49 | 50 | 51 | def main(_A: argparse.Namespace): 52 | device = torch.device("cpu") 53 | if torch.cuda.is_available(): 54 | device = torch.cuda.current_device() 55 | 56 | # ------------------------------------------------------------------------ 57 | coco_json = json.load(open(_A.input_json)) 58 | 59 | # Make a mapping between image ID and all instance annotations. 60 | image_id_annotations = defaultdict(list) 61 | for ann in coco_json["annotations"]: 62 | image_id_annotations[ann["image_id"]].append(ann) 63 | 64 | image_id_annotations = list(image_id_annotations.items()) 65 | 66 | # Shard the dataset so each GPU only refines masks for a subset of images. 67 | WORLD_SIZE = comm.get_world_size() 68 | RANK = comm.get_rank() 69 | 70 | image_id_annotations = image_id_annotations[RANK::WORLD_SIZE] 71 | print(f"GPU {RANK}/{WORLD_SIZE} will process {len(image_id_annotations)} images.") 72 | 73 | # Get a list of category IDs for which only box prompts will be used. 74 | cat_id_map = {x["name"]: x["id"] for x in coco_json["categories"]} 75 | box_only_ids = [cat_id_map[x] for x in _A.box_only_names] 76 | 77 | # ------------------------------------------------------------------------ 78 | # Instantiate model and input tranform (resize longest side to 1024 pixels). 
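A quick, hedged illustration of the comment above: constructing `ResizeShortestEdge` with `size == max_size == 1024` (as `preprocess` is built below) effectively scales the longest image side to 1024 pixels; the 480x640 input shape here is arbitrary.

```python
import numpy as np
from detectron2.data.transforms import ResizeShortestEdge

aug = ResizeShortestEdge(1024, max_size=1024)
dummy = np.zeros((480, 640, 3), dtype=np.uint8)
tfm = aug.get_transform(dummy)
print(tfm.new_h, tfm.new_w)  # 768 1024 -> the longer (640 px) side is scaled up to 1024
```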
79 | refiner = SamRefiner( 80 | _A.arch, _A.checkpoint, _A.num_extra_points, _A.num_trials, box_only_ids 81 | ) 82 | refiner = refiner.eval().to(device) 83 | 84 | preprocess = ResizeShortestEdge(refiner.img_size, max_size=refiner.img_size) 85 | 86 | # ------------------------------------------------------------------------ 87 | for image_id, annotations in tqdm(image_id_annotations, "Refining masks"): 88 | image_path = os.path.join(_A.image_dir, f"{image_id:0>12d}.jpg") 89 | image = read_image(image_path, "RGB") 90 | original_hw = image.shape[:2] 91 | 92 | # Pre-process image and masks. 93 | transform = preprocess.get_transform(image) 94 | image = transform.apply_image(image) 95 | 96 | # Get image height/width before and after applying resize transform. 97 | resized_hw = image.shape[:2] 98 | 99 | # Convert image to NCHW format tensor, RGB values in 0-255). 100 | image = torch.as_tensor(image, device=device) 101 | image = image.permute(2, 0, 1).contiguous() 102 | 103 | # Make batches of source masks (NHW bool tensor). 104 | source_masks = [ann["segmentation"] for ann in annotations] 105 | for idx, segm in enumerate(source_masks): 106 | if isinstance(segm, list): 107 | # Polygons. 108 | polygons = [torch.as_tensor(p).view(-1, 2) for p in segm] 109 | polygons = [p.view(-1) for p in transform.apply_polygons(polygons)] 110 | segm = polygons_to_bitmask(polygons, *resized_hw) 111 | elif isinstance(segm, dict): 112 | # RLE. 113 | segm = amg.rle_to_mask(segm).astype("uint8") 114 | segm = transform.apply_segmentation(segm) 115 | 116 | source_masks[idx] = torch.as_tensor(segm).bool() 117 | 118 | source_masks = torch.stack(source_masks).to(device) 119 | # -------------------------------------------------------------------- 120 | 121 | category_ids = [ann["category_id"] for ann in annotations] 122 | refined_masks = refiner(image, source_masks, category_ids, original_hw) 123 | 124 | # Get tight boxes enclosing refined masks, then convert masks to RLE. 125 | refined_boxes_xyxy = amg.batched_mask_to_box(refined_masks) 126 | refined_masks = amg.mask_to_rle_pytorch(refined_masks) 127 | 128 | # Replace the source masks with refined masks in COCO annotations. 129 | # NOTE: Keep "crowd" annotations unchanged as they don't participate in 130 | # the calculation of COCO AP. 131 | for idx, ann in enumerate(annotations): 132 | if ann.get("iscrowd", 0) != 1: 133 | ann["segmentation"] = refined_masks[idx] 134 | ann["area"] = amg.area_from_rle(refined_masks[idx]) 135 | 136 | # Recompute box enclosing the refined mask. 137 | x1, y1, x2, y2 = refined_boxes_xyxy[idx].tolist() 138 | ann["bbox"] = [x1, y1, x2 - x1 + 1, y2 - y1 + 1] 139 | 140 | # ------------------------------------------------------------------------ 141 | 142 | # Combine the refined masks from all GPU processes to main process. 143 | all_refined_annotations = [] 144 | for _, annotations in image_id_annotations: 145 | all_refined_annotations.extend(annotations) 146 | 147 | all_refined_annotations = comm.gather(all_refined_annotations, dst=0) 148 | 149 | # In main process, replace annotations in COCO JSON and save to output. 
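For context on the gather call above and the flattening loop that follows (a hedged sketch of how this script relies on `detectron2.utils.comm.gather`): only the destination rank receives data, as a list with one entry per process, which is why the main process below extends from a list of per-rank lists.

```python
# With 2 GPUs, each having refined its own shard of images:
#   rank 0: gathered == [anns_from_rank_0, anns_from_rank_1]   (list of per-rank lists)
#   rank 1: gathered == []                                      (non-destination ranks get nothing)
gathered = comm.gather(all_refined_annotations, dst=0)
flat = [ann for per_rank in gathered for ann in per_rank]
```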
150 | if comm.is_main_process(): 151 | coco_json["annotations"] = [] 152 | for ann_list in all_refined_annotations: 153 | coco_json["annotations"].extend(ann_list) 154 | 155 | os.makedirs(os.path.dirname(_A.output), exist_ok=True) 156 | json.dump(coco_json, open(_A.output, "w")) 157 | print(f"Saved annotations JSON with refined masks to {_A.output}") 158 | 159 | comm.synchronize() 160 | print(f"GPU {RANK}/{WORLD_SIZE}: Refinement complete!") 161 | 162 | 163 | if __name__ == "__main__": 164 | _A = parser.parse_args() 165 | 166 | print("Running with arguments:") 167 | for key, value in vars(_A).items(): 168 | print(f"{key:<30}: {value}") 169 | 170 | engine.launch(main, num_gpus_per_machine=_A.num_gpus, dist_url="auto", args=(_A,)) 171 | -------------------------------------------------------------------------------- /scripts/train_net.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train or evaluation a model using Detectron2-style lazy config. 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | import argparse 8 | import json 9 | import logging 10 | import warnings 11 | 12 | import torch 13 | from detectron2 import engine 14 | from detectron2.checkpoint import DetectionCheckpointer 15 | from detectron2.config import LazyConfig, instantiate 16 | from detectron2.engine import hooks 17 | from detectron2.engine.defaults import create_ddp_model 18 | from detectron2.evaluation import inference_on_dataset, print_csv_format 19 | from detectron2.evaluation.testing import flatten_results_dict 20 | from detectron2.utils import comm 21 | 22 | from coco_rem.data.builtin import register_all_coco_rem 23 | from coco_rem.trainer import AMPWithGradAccumTrainer 24 | 25 | warnings.filterwarnings("ignore") 26 | logger = logging.getLogger("detectron2") 27 | 28 | 29 | parser = engine.default_argument_parser(__doc__) 30 | _AA = parser.add_argument 31 | _AA("--checkpoint-period", type=int, default=5000, help="Checkpoint saving period.") 32 | _AA("--log-period", type=int, default=10, help="Log training progress periodically.") 33 | 34 | 35 | def do_test(_C, model): 36 | data_loader = instantiate(_C.dataloader.test) 37 | evaluator = instantiate(_C.dataloader.evaluator) 38 | 39 | results = inference_on_dataset(model, data_loader, evaluator) 40 | print_csv_format(results) 41 | return results 42 | 43 | 44 | def main(_A: argparse.Namespace): 45 | # Register COCO-ReM dataset splits before starting the training job. 
46 | register_all_coco_rem() 47 | 48 | _C = LazyConfig.load(_A.config_file) 49 | _C = LazyConfig.apply_overrides(_C, _A.opts) 50 | 51 | engine.default_setup(_C, _A) 52 | 53 | device = torch.cuda.current_device() if _A.num_gpus != 0 else torch.device("cpu") 54 | 55 | model = instantiate(_C.model).to(device) 56 | logger.info("Model:\n{}".format(model)) 57 | 58 | model = create_ddp_model(model) 59 | DetectionCheckpointer(model).load(_C.train.get("init_checkpoint", None)) 60 | 61 | if _A.eval_only: 62 | results = do_test(_C, model) 63 | if comm.is_main_process(): 64 | results = flatten_results_dict(results) 65 | json.dump(results, open(f"{_C.train.output_dir}/eval_results.json", "w")) 66 | return 67 | 68 | train_loader = instantiate(_C.dataloader.train) 69 | 70 | _C.optimizer.params.model = model 71 | optim = instantiate(_C.optimizer) 72 | 73 | trainer_cls = AMPWithGradAccumTrainer if _C.train.amp else engine.SimpleTrainer 74 | trainer = trainer_cls( 75 | model, train_loader, optim, grad_accum_steps=_C.train.get("grad_accum_steps", 1) 76 | ) 77 | checkpointer = DetectionCheckpointer(model, _C.train.output_dir, trainer=trainer) 78 | 79 | trainer.register_hooks( 80 | [ 81 | hooks.IterationTimer(), 82 | hooks.LRScheduler(scheduler=instantiate(_C.lr_multiplier)), 83 | hooks.PeriodicCheckpointer(checkpointer, _A.checkpoint_period) 84 | if comm.is_main_process() 85 | else None, 86 | hooks.EvalHook(_A.checkpoint_period, lambda: do_test(_C, model)), 87 | hooks.PeriodicWriter( 88 | engine.default_writers(_C.train.output_dir, _C.train.max_iter), 89 | period=_A.log_period, 90 | ) 91 | if comm.is_main_process() 92 | else None, 93 | ] 94 | ) 95 | 96 | checkpointer.resume_or_load(_C.train.init_checkpoint, resume=_A.resume) 97 | if _A.resume and checkpointer.has_checkpoint(): 98 | # The checkpoint stores the training iteration that just finished, thus we start 99 | # at the next iteration 100 | start_iter = trainer.iter + 1 101 | else: 102 | start_iter = 0 103 | trainer.train(start_iter, _C.train.max_iter) 104 | 105 | 106 | if __name__ == "__main__": 107 | _A = parser.parse_args() 108 | engine.launch( 109 | main, 110 | num_gpus_per_machine=_A.num_gpus, 111 | num_machines=_A.num_machines, 112 | machine_rank=_A.machine_rank, 113 | dist_url=_A.dist_url, 114 | args=(_A,), 115 | ) 116 | -------------------------------------------------------------------------------- /scripts/visualize_coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualize instances from a COCO annotations JSON (COCO-2017 or COCO-ReM). 3 | """ 4 | 5 | import argparse 6 | import logging 7 | import os 8 | 9 | import numpy as np 10 | from detectron2.data import DatasetCatalog, MetadataCatalog 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data.datasets import load_coco_json 13 | from tqdm import tqdm 14 | 15 | from coco_rem.mask_visualizer import MaskVisualizer 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | # fmt: off 20 | parser = argparse.ArgumentParser(description=__doc__) 21 | _AA = parser.add_argument 22 | _AA( 23 | "--input-json", default="datasets/coco/annotations/instances_val2017.json", 24 | help="Path to JSON file containing COCO annotations." 
25 | ) 26 | _AA( 27 | "--image-dir", default="datasets/coco/val2017", 28 | help="Path to directory containing COCO images.", 29 | ) 30 | _AA("--draw-labels", action="store_true", help="Whether to draw labels on masks.") 31 | _AA("--class-name", help="If provided, visualize masks of this class only.") 32 | 33 | _AA("--output", default="./viz", help="Path to output (saving) dir.") 34 | _AA("--filename-suffix", help="Add a suffix to saved image file name.") 35 | # fmt: on 36 | 37 | 38 | def add_id_to_labels(dic, labels): 39 | labels = [f"{lbl} ({x['id']})" for lbl, x in zip(labels, dic["annotations"])] 40 | return labels 41 | 42 | 43 | if __name__ == "__main__": 44 | _A = parser.parse_args() 45 | print("Arguments: " + str(_A)) 46 | 47 | # Register the input COCO JSON file as a Detectron2 dataset to load nicely 48 | # formatted dataset dicts for visualization. 49 | # Extra annotation keys: all possible keys added in generated JSON files. 50 | name = "coco_or_lvis_v1_cocofied_to_visualize" 51 | extra_keys = ["source", "source_id", "id"] 52 | 53 | DatasetCatalog.register( 54 | name, lambda: load_coco_json(_A.input_json, _A.image_dir, name, extra_keys) 55 | ) 56 | # ------------------------------------------------------------------------ 57 | # Fix seed for reproducible colors. 58 | np.random.seed(0) 59 | 60 | dataset_dicts = DatasetCatalog.get(name) 61 | class_names = MetadataCatalog.get("coco_2017_val").thing_classes 62 | os.makedirs(_A.output, exist_ok=True) 63 | 64 | for ddict in tqdm(dataset_dicts): 65 | if _A.class_name is not None: 66 | ddict["annotations"] = [ 67 | ann 68 | for ann in ddict["annotations"] 69 | if class_names[ann["category_id"]] == _A.class_name 70 | ] 71 | 72 | if len(ddict["annotations"]) > 0: 73 | img = utils.read_image(ddict["file_name"], "RGB") 74 | visualizer = MaskVisualizer(img, class_names) 75 | vis_image = visualizer.draw_dataset_dict( 76 | ddict, _A.draw_labels, label_suffix_formatter=add_id_to_labels 77 | ) 78 | 79 | # Save the visualized image. 80 | filepath = os.path.join(_A.output, os.path.basename(ddict["file_name"])) 81 | if _A.class_name is not None: 82 | filepath = filepath.replace(".jpg", f"_{_A.class_name}.jpg") 83 | 84 | if _A.filename_suffix is not None: 85 | filepath = filepath.replace(".jpg", f"_{_A.filename_suffix}.jpg") 86 | 87 | vis_image.save(filepath) 88 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import find_packages, setup 3 | 4 | setup( 5 | name="coco_rem", 6 | version="0.1", 7 | python_requires=">=3.8", 8 | zip_safe=True, 9 | packages=find_packages(include=["coco_rem"]), 10 | ) 11 | --------------------------------------------------------------------------------