├── .gitignore ├── LICENSE ├── README.md ├── datasets └── .gitkeep ├── pretrained_ckpt └── .gitkeep ├── sas_det ├── __init__.py ├── checkpoint │ ├── __init__.py │ ├── c2_model_loading.py │ ├── catalog.py │ ├── clip_model_loading.py │ └── detection_checkpoint.py ├── config.py ├── configs │ ├── ovd_coco_R50_C4_ensemble.yaml │ ├── ovd_coco_R50_C4_ensemble_PLs.yaml │ ├── ovd_lvis_R50_C4_SAS_Det_3x.yaml │ ├── ovd_lvis_R50_C4_ensemble_PLs.yaml │ └── regionclip │ │ ├── Base-RCNN-C4.yaml │ │ ├── Base-RCNN-DilatedC5.yaml │ │ ├── Base-RCNN-FPN.yaml │ │ ├── Base-RetinaNet.yaml │ │ ├── COCO-Detection │ │ ├── fast_rcnn_R_50_FPN_1x.yaml │ │ ├── faster_rcnn_R_101_C4_3x.yaml │ │ ├── faster_rcnn_R_101_DC5_3x.yaml │ │ ├── faster_rcnn_R_101_FPN_3x.yaml │ │ ├── faster_rcnn_R_50_C4_1x.yaml │ │ ├── faster_rcnn_R_50_C4_3x.yaml │ │ ├── faster_rcnn_R_50_DC5_1x.yaml │ │ ├── faster_rcnn_R_50_DC5_3x.yaml │ │ ├── faster_rcnn_R_50_FPN_1x.yaml │ │ ├── faster_rcnn_R_50_FPN_3x.yaml │ │ ├── faster_rcnn_X_101_32x8d_FPN_3x.yaml │ │ ├── retinanet_R_101_FPN_3x.yaml │ │ ├── retinanet_R_50_FPN_1x.py │ │ ├── retinanet_R_50_FPN_1x.yaml │ │ ├── retinanet_R_50_FPN_3x.yaml │ │ ├── rpn_R_50_C4_1x.yaml │ │ └── rpn_R_50_FPN_1x.yaml │ │ ├── COCO-InstanceSegmentation │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_coco80.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_testb.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_testt.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_zsinf_clipWeights.yaml │ │ ├── customized │ │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml │ │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_PLs_3x.yaml │ │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_vldet.yaml │ │ │ ├── ovd_coco_2x_PLs_per4k_clsBoxConf.yaml │ │ │ ├── ovd_coco_fCLIP_PLs_clsBoxConf.yaml │ │ │ ├── ovd_coco_fCLIP_offline_PLs.yaml │ │ │ └── ovd_coco_frozen_CLIP_RPN.yaml │ │ ├── mask_rcnn_CLIP_R_50_C4_1x.yaml │ │ ├── mask_rcnn_CLIP_R_50_C4_1x_ovd_FSD.yaml │ │ ├── mask_rcnn_R_101_C4_3x.yaml │ │ ├── mask_rcnn_R_101_DC5_3x.yaml │ │ ├── mask_rcnn_R_101_FPN_3x.yaml │ │ ├── mask_rcnn_R_50_C4_1x.py │ │ ├── mask_rcnn_R_50_C4_1x.yaml │ │ ├── mask_rcnn_R_50_C4_1x_ovd_FSD.yaml │ │ ├── mask_rcnn_R_50_C4_1x_ovd_coco65.yaml │ │ ├── mask_rcnn_R_50_C4_3x.yaml │ │ ├── mask_rcnn_R_50_DC5_1x.yaml │ │ ├── mask_rcnn_R_50_DC5_3x.yaml │ │ ├── mask_rcnn_R_50_FPN_1x.py │ │ ├── mask_rcnn_R_50_FPN_1x.yaml │ │ ├── mask_rcnn_R_50_FPN_1x_giou.yaml │ │ ├── mask_rcnn_R_50_FPN_1x_ovd_FSD.yaml │ │ ├── mask_rcnn_R_50_FPN_1x_ovd_coco65.yaml │ │ ├── mask_rcnn_R_50_FPN_3x.yaml │ │ ├── mask_rcnn_X_101_32x8d_FPN_3x.yaml │ │ ├── mask_rcnn_regnetx_4gf_dds_fpn_1x.py │ │ └── mask_rcnn_regnety_4gf_dds_fpn_1x.py │ │ ├── LVISv0.5-InstanceSegmentation │ │ ├── mask_rcnn_R_101_FPN_1x.yaml │ │ ├── mask_rcnn_R_50_FPN_1x.yaml │ │ └── mask_rcnn_X_101_32x8d_FPN_1x.yaml │ │ ├── LVISv1-InstanceSegmentation │ │ ├── CLIP_fast_rcnn_R_50_C4.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_custom_img.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_zsinf.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_zsinf_clipWeights.yaml │ │ ├── customized │ │ │ ├── ovd_lvis_box_PLs_periodic_boxConf.yaml │ │ │ ├── ovd_lvis_fCLIP_PLs_clsBoxConf.yaml │ │ │ └── ovd_lvis_frozen_CLIP_RPN.yaml │ │ ├── mask_rcnn_CLIP_R_50_C4_1x.yaml │ │ ├── mask_rcnn_CLIP_R_50_FPN_1x.yaml │ │ ├── mask_rcnn_R_101_FPN_1x.yaml │ │ ├── mask_rcnn_R_50_C4_1x.yaml │ │ ├── mask_rcnn_R_50_FPN_1x.yaml │ │ ├── mask_rcnn_R_50_FPN_2x.yaml │ │ └── mask_rcnn_X_101_32x8d_FPN_1x.yaml │ │ ├── Misc │ │ ├── cascade_mask_rcnn_R_50_FPN_1x.yaml │ │ ├── cascade_mask_rcnn_R_50_FPN_3x.yaml │ │ 
├── cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml │ │ ├── mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml │ │ ├── mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml │ │ ├── mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml │ │ ├── mask_rcnn_R_50_FPN_3x_gn.yaml │ │ ├── mask_rcnn_R_50_FPN_3x_syncbn.yaml │ │ ├── mmdet_mask_rcnn_R_50_FPN_1x.py │ │ ├── panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml │ │ ├── scratch_mask_rcnn_R_50_FPN_3x_gn.yaml │ │ ├── scratch_mask_rcnn_R_50_FPN_9x_gn.yaml │ │ ├── scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml │ │ ├── semantic_R_50_FPN_1x.yaml │ │ └── torchvision_imagenet_R_50.py │ │ ├── common │ │ ├── README.md │ │ ├── coco_schedule.py │ │ ├── data │ │ │ ├── coco.py │ │ │ ├── coco_keypoint.py │ │ │ └── coco_panoptic_separated.py │ │ ├── models │ │ │ ├── cascade_rcnn.py │ │ │ ├── keypoint_rcnn_fpn.py │ │ │ ├── mask_rcnn_c4.py │ │ │ ├── mask_rcnn_fpn.py │ │ │ ├── panoptic_fpn.py │ │ │ └── retinanet.py │ │ ├── optim.py │ │ └── train.py │ │ └── pretrain │ │ ├── RegionCLIP_RN50.yaml │ │ ├── RegionCLIP_RN50_onlinePL.yaml │ │ ├── RegionCLIP_RN50_onlinePL_box_weak.yaml │ │ ├── RegionCLIP_RN50_onlinePL_box_weak_cc3m.yaml │ │ ├── RegionCLIP_RN50_onlinePL_box_weak_locNarr.yaml │ │ ├── RegionCLIP_RN50x4.yaml │ │ └── RegionCLIP_RN50x4_onlinePL_boxWeak.yaml ├── data │ ├── __init__.py │ ├── coco_zeroshot_categories.py │ ├── lvis.py │ ├── lvis_v0_5_categories.py │ ├── lvis_v1_categories.py │ └── ovd_register.py ├── evaluation │ ├── __init__.py │ ├── cityscapes_evaluation.py │ ├── coco_evaluation.py │ ├── evaluator.py │ ├── fast_eval_api.py │ ├── lvis_evaluation.py │ ├── panoptic_evaluation.py │ ├── pascal_voc_evaluation.py │ ├── rotated_coco_evaluation.py │ ├── sem_seg_evaluation.py │ └── testing.py └── modeling │ ├── __init__.py │ ├── backbone │ ├── __init__.py │ ├── batch_norm.py │ └── clip_backbone.py │ ├── ensemble_fast_rcnn.py │ ├── ensemble_roi_heads.py │ ├── meta_arch │ └── clip_rcnn.py │ └── roi_heads │ ├── __init__.py │ ├── clip_fast_rcnn.py │ ├── clip_roi_heads.py │ └── soft_nms.py ├── test_net.py └── tools ├── gen_cat_text_emb.py ├── offline_eval_onLVIS.py └── offline_eval_onO365.py /.gitignore: -------------------------------------------------------------------------------- 1 | # macOS stuff 2 | *.DS_Store 3 | 4 | # Python caches 5 | **/__pycache__ 6 | 7 | # Ignore all output directories and experiment scripts in the individual projects 8 | /output/ 9 | /datasets 10 | /pretrained_ckpt/ 11 | /pretrained_ckpt 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2023 NEC Laboratories America, Inc. ("NECLA") 2 | 3 | 4 | 5 | This software and any and all related files/code/information is provided by 6 | NECLA to for non-commercial evaluation or research purposes subject to terms in a License agreement the Recipient has agreed to by Recipient’s signature. 7 | 8 | 9 | 10 | The license restriction includes, among other limitations, the Recipient to only evaluate this software and redistribute information related to this software only in the form of technical publications/papers, with no rights to assign a license to third parties or redistribute the software to others. 11 | 12 | 13 | 14 | 15 | IN NO EVENT SHALL NEC BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, 16 | SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 17 | USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF NEC HAS BEEN 18 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
19 | 20 | 21 | 22 | NEC SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND NEC HAS NO OBLIGATION TO PROVIDE MAINTENANCE, 23 | SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 24 | 25 | 26 | 27 | THE LICENSE FROM NEC FOR THE SOFTWARE REQUIRES THAT LICENSEE 28 | COMPLY WITH ANY AND ALL UNDERLYING COPYRIGHTS AND LICENSE RIGHTS 29 | IN THE SOFTWARE BY THIRD PARTIES. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Taming Self-Training for Open-Vocabulary Object Detection 2 | 3 | Official implementation of online self-training and a split-and-fusion (SAF) head for Open-Vocabulary Object Detection (OVD), SAS-Det for short. 4 | This project was previously named Improving Pseudo Labels for Open-Vocabulary Object Detection. 5 | 6 | [arXiv](https://arxiv.org/abs/2308.06412) 7 | 8 | 9 | ## Installation 10 | - Our project is developed on Detectron2. Please follow the official installation [instructions](https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md), or use the instructions below. 11 | ``` 12 | # create new environment 13 | conda create -n sas_det python=3.8 14 | conda activate sas_det 15 | 16 | # install pytorch 17 | conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch 18 | 19 | # install Detectron2 from a local clone 20 | git clone https://github.com/facebookresearch/detectron2.git 21 | python -m pip install -e detectron2 22 | ``` 23 | 24 | - Install CLIP 25 | ``` 26 | # install CLIP 27 | pip install scipy 28 | pip install ftfy regex tqdm 29 | pip install git+https://github.com/openai/CLIP.git 30 | ``` 31 | 32 | 33 | ## Datasets 34 | 35 | - Please follow RegionCLIP's [dataset instructions](https://github.com/microsoft/RegionCLIP/blob/main/datasets/README.md) to prepare the COCO and LVIS datasets. 36 | 37 | - Download the [metadata](https://drive.google.com/drive/u/1/folders/1R72q0Wg26-PQGqbaK3P3pT2vmGm9uzKU) for the datasets and put it in the folder `datasets` (i.e., `$DETECTRON2_DATASETS` used in the previous step); it is used in our evaluation and training. 38 | 39 | 40 | ## Download pretrained weights 41 | - Download [RegionCLIP's pretrained weights](https://drive.google.com/drive/folders/1hzrJBvcCrahoRcqJRqzkIGFO_HUSJIii). Check [here](https://github.com/microsoft/RegionCLIP/blob/main/docs/MODEL_ZOO.md#model-downloading) for more details. 42 | Create a new folder `pretrained_ckpt` to hold those weights. In this repository, `regionclip`, `concept_emb` and `rpn` will be used. 43 | 44 | - Download [our pretrained weights](https://drive.google.com/drive/u/1/folders/1TAr7nZSvpB6nCZCC6nXBw6xgmMmlL0X9) and put them in the corresponding folders in `pretrained_ckpt` (the expected layout is sketched in the evaluation section below). 45 | Our pretrained weights include: 46 | - `r50_3x_pre_RegCLIP_cocoRPN_2`: RPN weights pretrained only with COCO Base categories. This is used for experiments on COCO to avoid potential data leakage. 47 | - `concept_emb`: Complementary to RegionCLIP's `concept_emb`. 48 | 49 | ## Evaluation with released weights 50 | 51 | ### Results on COCO-OVD 52 | 53 |
| Configs | Novel AP | Base AP | Overall AP |
| --- | --- | --- | --- |
| w/o SAF head | 31.4 | 55.7 | 49.4 |
| with SAF head | 37.4 | 58.5 | 53.0 |
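All commands in this section read the released weights and category metadata from fixed relative paths under `pretrained_ckpt` and `datasets`. As a quick sanity check before running them, you can verify that the referenced files are in place. The snippet below is only illustrative: the file names are copied from the evaluation commands in this README (the LVIS text embeddings follow the same pattern), so adjust it to whatever you actually downloaded.

```python
# Sanity check (illustrative): paths referenced by the evaluation commands below.
from pathlib import Path

expected = [
    "pretrained_ckpt/sas_det/sas_det_coco_no_saf_head_baseline.pth",
    "pretrained_ckpt/sas_det/sas_det_coco.pth",
    "pretrained_ckpt/sas_det/sas_det_lvis_r50.pth",
    "pretrained_ckpt/sas_det/sas_det_lvis_r50x4.pth",
    "pretrained_ckpt/rpn/rpn_coco_48.pth",
    "pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth",
    "pretrained_ckpt/concept_emb/coco_48_base_cls_emb.pth",
    "pretrained_ckpt/concept_emb/coco_65_cls_emb.pth",
    "pretrained_ckpt/concept_emb/my_coco_48_base_17_cls_emb.pth",
    "datasets/coco_ovd_continue_cat_ids.json",
    "datasets/lvis_ovd_continue_cat_ids.json",
]
missing = [p for p in expected if not Path(p).exists()]
print("missing files:", missing or "none")
```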
75 | 76 | Evaluation without the SAF Head (baseline in the paper), 77 | 78 | 79 | ```bash 80 | python3 ./test_net.py \ 81 | --num-gpus 8 \ 82 | --eval-only \ 83 | --config-file ./sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml \ 84 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_coco_no_saf_head_baseline.pth \ 85 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \ 86 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \ 87 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \ 88 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \ 89 | MODEL.ROI_HEADS.SOFT_NMS_ENABLED True \ 90 | OUTPUT_DIR output/eval 91 | ``` 92 |
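Both COCO commands set `MODEL.ROI_HEADS.SOFT_NMS_ENABLED True`; the implementation used by this repo lives in `sas_det/modeling/roi_heads/soft_nms.py`. Purely for intuition, the sketch below shows a minimal Gaussian soft-NMS (Bodla et al., 2017), where overlapping boxes have their scores decayed rather than being removed outright. It is an illustration, not the code used here.

```python
import torch
from torchvision.ops import box_iou

def soft_nms(boxes, scores, sigma=0.5, score_thresh=0.001):
    """Minimal Gaussian soft-NMS sketch: decay, rather than drop, overlapping boxes."""
    scores = scores.clone()
    keep = []
    idxs = torch.arange(len(scores))
    while idxs.numel() > 0:
        top = torch.argmax(scores[idxs])            # position of the best remaining box
        best = idxs[top]
        keep.append(int(best))
        idxs = torch.cat([idxs[:top], idxs[top + 1:]])
        if idxs.numel() == 0:
            break
        ious = box_iou(boxes[best].unsqueeze(0), boxes[idxs]).squeeze(0)
        scores[idxs] *= torch.exp(-(ious ** 2) / sigma)   # decay scores of overlapping boxes
        idxs = idxs[scores[idxs] > score_thresh]          # prune near-zero scores
    return keep
```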
93 | 94 |
95 | 96 | Evaluation with the SAF Head, 97 | 98 | 99 | ```bash 100 | python3 ./test_net.py \ 101 | --num-gpus 8 \ 102 | --eval-only \ 103 | --config-file ./sas_det/configs/ovd_coco_R50_C4_ensemble_PLs.yaml \ 104 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_coco.pth \ 105 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \ 106 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \ 107 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb.pth \ 108 | MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/my_coco_48_base_17_cls_emb.pth \ 109 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \ 110 | MODEL.ROI_HEADS.SOFT_NMS_ENABLED True \ 111 | MODEL.ENSEMBLE.TEST_CATEGORY_INFO "./datasets/coco_ovd_continue_cat_ids.json" \ 112 | MODEL.ENSEMBLE.ALPHA 0.3 MODEL.ENSEMBLE.BETA 0.7 \ 113 | OUTPUT_DIR output/eval 114 | ``` 115 |
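In the command above, `MODEL.ENSEMBLE.TEST_CATEGORY_INFO` points to a json describing which category ids are base vs. novel, and `MODEL.ENSEMBLE.ALPHA`/`MODEL.ENSEMBLE.BETA` weight how the SAF head fuses its closed-branch and open-branch scores at test time. The sketch below shows a typical geometric-mean fusion of this kind purely as an illustration; the variable names are placeholders and the exact formula may differ from the implementation in `sas_det/modeling/ensemble_fast_rcnn.py`.

```python
import torch

def ensemble_scores(closed_scores, open_scores, base_ids, novel_ids, alpha=0.3, beta=0.7):
    """Illustrative geometric-mean fusion of closed-branch and open-branch scores, shape (N, C)."""
    fused = closed_scores.clone()
    # base categories: rely mostly on the closed branch trained with box supervision
    fused[:, base_ids] = closed_scores[:, base_ids] ** (1 - alpha) * open_scores[:, base_ids] ** alpha
    # novel categories: rely mostly on the open (text-embedding) branch
    fused[:, novel_ids] = closed_scores[:, novel_ids] ** (1 - beta) * open_scores[:, novel_ids] ** beta
    return fused
```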
116 | 117 | 118 | ### Results on LVIS-OVD 119 | 120 |
| Configs | APr | APc | APf | AP |
| --- | --- | --- | --- | --- |
| RN50-C4 as backbone | 20.1 | 27.1 | 32.9 | 28.1 |
| RN50x4-C4 as backbone | 29.0 | 32.3 | 36.8 | 33.5 |
145 | 146 | Evaluation with RN50-C4 as the backbone, 147 | 148 | 149 | ```bash 150 | python3 ./test_net.py \ 151 | --num-gpus 8 \ 152 | --eval-only \ 153 | --config-file ./sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml \ 154 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_lvis_r50.pth \ 155 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \ 156 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \ 157 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_866_base_cls_emb.pth \ 158 | MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/my_lvis_866_base_337_cls_emb.pth \ 159 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \ 160 | MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \ 161 | MODEL.ENSEMBLE.TEST_CATEGORY_INFO "./datasets/lvis_ovd_continue_cat_ids.json" \ 162 | MODEL.ENSEMBLE.ALPHA 0.33 MODEL.ENSEMBLE.BETA 0.67 \ 163 | OUTPUT_DIR output/eval 164 | ``` 165 |
166 | 167 |
168 | 169 | Evaluation with RN50x4-C4 as the backbone, 170 | 171 | 172 | ```bash 173 | python3 ./test_net.py \ 174 | --num-gpus 8 \ 175 | --eval-only \ 176 | --config-file ./sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml \ 177 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_lvis_r50x4.pth \ 178 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \ 179 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \ 180 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_866_base_cls_emb_rn50x4.pth \ 181 | MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/my_lvis_866_base_337_cls_emb_rn50x4.pth \ 182 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \ 183 | MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \ 184 | MODEL.CLIP.TEXT_EMB_DIM 640 \ 185 | MODEL.RESNETS.DEPTH 200 \ 186 | MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \ 187 | MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION 18 \ 188 | MODEL.ENSEMBLE.TEST_CATEGORY_INFO "./datasets/lvis_ovd_continue_cat_ids.json" \ 189 | MODEL.ENSEMBLE.ALPHA 0.33 MODEL.ENSEMBLE.BETA 0.67 \ 190 | OUTPUT_DIR output/eval 191 | ``` 192 |
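Note that the RN50x4 run reuses the same config file as the RN50 run and only overrides a few keys: CLIP RN50x4 produces 640-d embeddings (RN50: 1024-d), and the larger backbone uses a different depth and RoI pooler resolution in this codebase. If you prefer setting these in Python rather than on the command line, a sketch is below; it assumes `add_sas_det_config` (exported by `sas_det/__init__.py`) registers the extra `MODEL.CLIP`/`MODEL.OVD`/`MODEL.ENSEMBLE` keys these configs rely on, and `./test_net.py` remains the actual entry point.

```python
# Sketch: the RN50x4 overrides applied via detectron2's config API instead of CLI flags.
from detectron2.config import get_cfg
from sas_det import add_sas_det_config  # assumed to register the MODEL.CLIP / MODEL.OVD / MODEL.ENSEMBLE keys

cfg = get_cfg()
add_sas_det_config(cfg)
cfg.merge_from_file("./sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml")
cfg.merge_from_list([
    "MODEL.CLIP.TEXT_EMB_DIM", 640,               # CLIP RN50x4 embeddings are 640-d (RN50: 1024-d)
    "MODEL.RESNETS.DEPTH", 200,                   # RN50x4 depth used by this codebase
    "MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION", 18,   # larger backbone -> larger RoI pooler resolution
    "MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION", 18,
])
```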
193 | 194 | 195 | 196 | ## Acknowledgement 197 | 198 | This repository was built on top of [Detectron2](https://github.com/facebookresearch/detectron2), [RegionCLIP](https://github.com/microsoft/RegionCLIP), and [VLDet](https://github.com/clin1223/VLDet). We thank the effort from our community. 199 | -------------------------------------------------------------------------------- /datasets/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaofeng94/SAS-Det/fd6ea7dc2ba07d3a836b5e65bdd0fd57bd60cb9c/datasets/.gitkeep -------------------------------------------------------------------------------- /pretrained_ckpt/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaofeng94/SAS-Det/fd6ea7dc2ba07d3a836b5e65bdd0fd57bd60cb9c/pretrained_ckpt/.gitkeep -------------------------------------------------------------------------------- /sas_det/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) NEC Laboratories America, Inc. 2 | from .modeling import ensemble_roi_heads as _ 3 | from .config import add_sas_det_config 4 | from .data import * 5 | 6 | -------------------------------------------------------------------------------- /sas_det/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # File: 4 | 5 | 6 | from . import catalog as _UNUSED # register the handler 7 | from .detection_checkpoint import DetectionCheckpointer 8 | from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer 9 | 10 | __all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"] 11 | -------------------------------------------------------------------------------- /sas_det/checkpoint/catalog.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | 4 | from detectron2.utils.file_io import PathHandler, PathManager 5 | 6 | 7 | class ModelCatalog(object): 8 | """ 9 | Store mappings from names to third-party models. 10 | """ 11 | 12 | S3_C2_DETECTRON_PREFIX = "https://dl.fbaipublicfiles.com/detectron" 13 | 14 | # MSRA models have STRIDE_IN_1X1=True. False otherwise. 15 | # NOTE: all BN models here have fused BN into an affine layer. 16 | # As a result, you should only load them to a model with "FrozenBN". 17 | # Loading them to a model with regular BN or SyncBN is wrong. 18 | # Even when loaded to FrozenBN, it is still different from affine by an epsilon, 19 | # which should be negligible for training. 20 | # NOTE: all models here uses PIXEL_STD=[1,1,1] 21 | # NOTE: Most of the BN models here are no longer used. We use the 22 | # re-converted pre-trained models under detectron2 model zoo instead. 
23 | C2_IMAGENET_MODELS = { 24 | "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl", 25 | "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl", 26 | "FAIR/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl", 27 | "FAIR/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl", 28 | "FAIR/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl", 29 | "FAIR/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl", 30 | "FAIR/X-152-32x8d-IN5k": "ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl", 31 | } 32 | 33 | C2_DETECTRON_PATH_FORMAT = ( 34 | "{prefix}/{url}/output/train/{dataset}/{type}/model_final.pkl" # noqa B950 35 | ) 36 | 37 | C2_DATASET_COCO = "coco_2014_train%3Acoco_2014_valminusminival" 38 | C2_DATASET_COCO_KEYPOINTS = "keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival" 39 | 40 | # format: {model_name} -> part of the url 41 | C2_DETECTRON_MODELS = { 42 | "35857197/e2e_faster_rcnn_R-50-C4_1x": "35857197/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml.01_33_49.iAX0mXvW", # noqa B950 43 | "35857345/e2e_faster_rcnn_R-50-FPN_1x": "35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I", # noqa B950 44 | "35857890/e2e_faster_rcnn_R-101-FPN_1x": "35857890/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml.01_38_50.sNxI7sX7", # noqa B950 45 | "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "36761737/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml.06_31_39.5MIHi1fZ", # noqa B950 46 | "35858791/e2e_mask_rcnn_R-50-C4_1x": "35858791/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml.01_45_57.ZgkA7hPB", # noqa B950 47 | "35858933/e2e_mask_rcnn_R-50-FPN_1x": "35858933/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml.01_48_14.DzEQe4wC", # noqa B950 48 | "35861795/e2e_mask_rcnn_R-101-FPN_1x": "35861795/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT", # noqa B950 49 | "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "36761843/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml.06_35_59.RZotkLKI", # noqa B950 50 | "48616381/e2e_mask_rcnn_R-50-FPN_2x_gn": "GN/48616381/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn_0416.13_23_38.bTlTI97Q", # noqa B950 51 | "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "37697547/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml.08_42_54.kdzV35ao", # noqa B950 52 | "35998355/rpn_R-50-C4_1x": "35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L", # noqa B950 53 | "35998814/rpn_R-50-FPN_1x": "35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179", # noqa B950 54 | "36225147/fast_R-50-FPN_1x": "36225147/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml.08_39_09.L3obSdQ2", # noqa B950 55 | } 56 | 57 | @staticmethod 58 | def get(name): 59 | if name.startswith("Caffe2Detectron/COCO"): 60 | return ModelCatalog._get_c2_detectron_baseline(name) 61 | if name.startswith("ImageNetPretrained/"): 62 | return ModelCatalog._get_c2_imagenet_pretrained(name) 63 | raise RuntimeError("model not present in the catalog: {}".format(name)) 64 | 65 | @staticmethod 66 | def _get_c2_imagenet_pretrained(name): 67 | prefix = ModelCatalog.S3_C2_DETECTRON_PREFIX 68 | name = name[len("ImageNetPretrained/") :] 69 | name = ModelCatalog.C2_IMAGENET_MODELS[name] 70 | url = "/".join([prefix, name]) 71 | return url 72 | 73 | @staticmethod 74 | def _get_c2_detectron_baseline(name): 75 | name = name[len("Caffe2Detectron/COCO/") :] 76 | url = ModelCatalog.C2_DETECTRON_MODELS[name] 77 | if "keypoint_rcnn" in name: 78 | dataset = ModelCatalog.C2_DATASET_COCO_KEYPOINTS 79 | else: 80 | dataset = ModelCatalog.C2_DATASET_COCO 81 | 
82 | if "35998355/rpn_R-50-C4_1x" in name: 83 | # this one model is somehow different from others .. 84 | type = "rpn" 85 | else: 86 | type = "generalized_rcnn" 87 | 88 | # Detectron C2 models are stored in the structure defined in `C2_DETECTRON_PATH_FORMAT`. 89 | url = ModelCatalog.C2_DETECTRON_PATH_FORMAT.format( 90 | prefix=ModelCatalog.S3_C2_DETECTRON_PREFIX, url=url, type=type, dataset=dataset 91 | ) 92 | return url 93 | 94 | 95 | class ModelCatalogHandler(PathHandler): 96 | """ 97 | Resolve URL like catalog://. 98 | """ 99 | 100 | PREFIX = "catalog://" 101 | 102 | def _get_supported_prefixes(self): 103 | return [self.PREFIX] 104 | 105 | def _get_local_path(self, path, **kwargs): 106 | logger = logging.getLogger(__name__) 107 | catalog_path = ModelCatalog.get(path[len(self.PREFIX) :]) 108 | logger.info("Catalog entry {} points to {}".format(path, catalog_path)) 109 | return PathManager.get_local_path(catalog_path, **kwargs) 110 | 111 | def _open(self, path, mode="r", **kwargs): 112 | return PathManager.open(self._get_local_path(path), mode, **kwargs) 113 | 114 | 115 | PathManager.register_handler(ModelCatalogHandler()) 116 | -------------------------------------------------------------------------------- /sas_det/checkpoint/detection_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | import os 4 | import pickle 5 | import torch 6 | from fvcore.common.checkpoint import Checkpointer 7 | from torch.nn.parallel import DistributedDataParallel 8 | 9 | import detectron2.utils.comm as comm 10 | from detectron2.utils.env import TORCH_VERSION 11 | from detectron2.utils.file_io import PathManager 12 | 13 | from .c2_model_loading import align_and_update_state_dicts 14 | from .clip_model_loading import align_and_update_state_dicts_for_CLIP 15 | 16 | class DetectionCheckpointer(Checkpointer): 17 | """ 18 | Same as :class:`Checkpointer`, but is able to: 19 | 1. handle models in detectron & detectron2 model zoo, and apply conversions for legacy models. 20 | 2. correctly load checkpoints that are only available on the master worker 21 | """ 22 | 23 | def __init__(self, model, save_dir="", *, save_to_disk=None, bb_rpn_weights=False, **checkpointables): 24 | is_main_process = comm.is_main_process() 25 | super().__init__( 26 | model, 27 | save_dir, 28 | save_to_disk=is_main_process if save_to_disk is None else save_to_disk, 29 | **checkpointables, 30 | ) 31 | self.path_manager = PathManager 32 | self.bb_rpn_weights = bb_rpn_weights 33 | 34 | def load(self, path, *args, **kwargs): 35 | need_sync = False 36 | 37 | if path and isinstance(self.model, DistributedDataParallel): 38 | logger = logging.getLogger(__name__) 39 | path = self.path_manager.get_local_path(path) 40 | has_file = os.path.isfile(path) 41 | all_has_file = comm.all_gather(has_file) 42 | if not all_has_file[0]: 43 | raise OSError(f"File {path} not found on main worker.") 44 | if not all(all_has_file): 45 | logger.warning( 46 | f"Not all workers can read checkpoint {path}. " 47 | "Training may fail to fully resume." 48 | ) 49 | # TODO: broadcast the checkpoint file contents from main 50 | # worker, and load from it instead. 
51 | need_sync = True 52 | if not has_file: 53 | path = None # don't load if not readable 54 | ret = super().load(path, *args, **kwargs) 55 | 56 | if need_sync: 57 | logger.info("Broadcasting model states from main worker ...") 58 | if TORCH_VERSION >= (1, 7): 59 | self.model._sync_params_and_buffers() 60 | return ret 61 | 62 | def _load_file(self, filename): 63 | if filename.endswith(".pkl"): 64 | with PathManager.open(filename, "rb") as f: 65 | data = pickle.load(f, encoding="latin1") 66 | if "model" in data and "__author__" in data: 67 | # file is in Detectron2 model zoo format 68 | self.logger.info("Reading a file from '{}'".format(data["__author__"])) 69 | return data 70 | else: 71 | # assume file is from Caffe2 / Detectron1 model zoo 72 | if "blobs" in data: 73 | # Detection models have "blobs", but ImageNet models don't 74 | data = data["blobs"] 75 | data = {k: v for k, v in data.items() if not k.endswith("_momentum")} 76 | return {"model": data, "__author__": "Caffe2", "matching_heuristics": True} 77 | elif filename.endswith(".pyth"): 78 | # assume file is from pycls; no one else seems to use the ".pyth" extension 79 | with PathManager.open(filename, "rb") as f: 80 | data = torch.load(f) 81 | assert ( 82 | "model_state" in data 83 | ), f"Cannot load .pyth file {filename}; pycls checkpoints must contain 'model_state'." 84 | model_state = { 85 | k: v 86 | for k, v in data["model_state"].items() 87 | if not k.endswith("num_batches_tracked") 88 | } 89 | return {"model": model_state, "__author__": "pycls", "matching_heuristics": True} 90 | elif "OAI_CLIP" in filename: 91 | # assume file is from OpenAI CLIP pre-trained model 92 | loaded = super()._load_file(filename) # load native pth checkpoint 93 | if "model" not in loaded: 94 | loaded = {"model": loaded} 95 | return {"model": loaded["model"], "__author__": "OAI_CLIP", "matching_heuristics": True} 96 | 97 | loaded = super()._load_file(filename) # load native pth checkpoint 98 | if "model" not in loaded: 99 | loaded = {"model": loaded} 100 | return loaded 101 | 102 | def _load_model(self, checkpoint): 103 | if checkpoint.get("matching_heuristics", False) or self.bb_rpn_weights: 104 | self._convert_ndarray_to_tensor(checkpoint["model"]) 105 | # convert weights by name-matching heuristics 106 | if checkpoint.get("__author__", "NA") == "OAI_CLIP" or self.bb_rpn_weights: # for OAI_CLIP or 2nd ckpt (offline modules) 107 | checkpoint["model"] = align_and_update_state_dicts_for_CLIP( 108 | self.model.state_dict(), 109 | checkpoint["model"], 110 | bb_rpn_weights=self.bb_rpn_weights, 111 | ) 112 | else: # default loading 113 | checkpoint["model"] = align_and_update_state_dicts( 114 | self.model.state_dict(), 115 | checkpoint["model"], 116 | c2_conversion=checkpoint.get("__author__", None) == "Caffe2", 117 | ) 118 | # for non-caffe2 models, use standard ways to load it 119 | incompatible = super()._load_model(checkpoint) 120 | del checkpoint # try saving memory 121 | 122 | model_buffers = dict(self.model.named_buffers(recurse=False)) 123 | for k in ["pixel_mean", "pixel_std"]: 124 | # Ignore missing key message about pixel_mean/std. 125 | # Though they may be missing in old checkpoints, they will be correctly 126 | # initialized from config anyway. 
127 | if k in model_buffers: 128 | try: 129 | incompatible.missing_keys.remove(k) 130 | except ValueError: 131 | pass 132 | return incompatible -------------------------------------------------------------------------------- /sas_det/configs/ovd_coco_R50_C4_ensemble.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./regionclip/Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" 8 | MASK_ON: False 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | # RPN: # not used 16 | # HEAD_NAME: StandardRPNHead 17 | # IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "EnsembleCLIPRes5ROIHeads" # 20 | IN_FEATURES: ["res4"] 21 | NUM_CLASSES: 48 # base categories only 22 | SCORE_THRESH_TEST: 0.001 23 | ROI_BOX_HEAD: 24 | NAME: "FastRCNNConvFCHead" # for text head 25 | NUM_FC: 2 26 | POOLER_RESOLUTION: 14 27 | CLS_AGNOSTIC_BBOX_REG: True 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 0 31 | POOLER_RESOLUTION: 14 32 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 33 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 34 | CLIP: 35 | CROP_REGION_TYPE: "RPN" 36 | USE_TEXT_EMB_CLASSIFIER: True 37 | CLSS_TEMP: 0.01 38 | NO_BOX_DELTA: False 39 | BG_CLS_LOSS_WEIGHT: 0.2 40 | FOCAL_SCALED_LOSS: 0.5 41 | INPUT: 42 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 43 | DATASETS: 44 | TRAIN: ("coco_2017_ovd_b_train",) 45 | TEST: ("coco_2017_ovd_all_test",) 46 | TEST: 47 | EVAL_PERIOD: 5000 48 | SOLVER: 49 | IMS_PER_BATCH: 16 50 | BASE_LR: 0.002 51 | STEPS: (60000, 80000) 52 | MAX_ITER: 90000 53 | WARMUP_ITERS: 5000 54 | CHECKPOINT_PERIOD: 10000 55 | INPUT: 56 | MIN_SIZE_TRAIN_SAMPLING: choice 57 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 58 | MAX_SIZE_TRAIN: 1333 59 | MIN_SIZE_TEST: 800 60 | MAX_SIZE_TEST: 1333 61 | FORMAT: "RGB" -------------------------------------------------------------------------------- /sas_det/configs/ovd_coco_R50_C4_ensemble_PLs.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./ovd_coco_R50_C4_ensemble.yaml" 2 | MODEL: 3 | ROI_BOX_HEAD: 4 | NAME: "CLIP_BOX_HEAD" # close-branch head 5 | OVD: 6 | WITH_PSEUDO_LABELS: True 7 | # 8 | USE_ADAPTIVE_THRES: True 9 | PL_THRESHOLD: 0.85 10 | PL_NMS_THRES: 0.5 11 | RPN_FUSION_METHOD: "avg_norm_scores" 12 | CATEGORY_INFO: None 13 | # periodic update 14 | USE_PERIODIC_UPDATE: True 15 | # box reg, cls loss 16 | BOX_CONFIDENCE_THRES: 1.0 17 | USE_CONFIDENCE_WEIGHT: True -------------------------------------------------------------------------------- /sas_det/configs/ovd_lvis_R50_C4_SAS_Det_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./ovd_lvis_R50_C4_ensemble_PLs.yaml" 2 | DATASETS: 3 | TRAIN: ('lvis_v1_train_SASDet_r50x4_PLs', 'lvis_v1_o365_SASDet_r50x4_PLs',) 4 | SOLVER: 5 | CHECKPOINT_PERIOD: 20000 6 | STEPS: (210000, 250000) 7 | MAX_ITER: 270000 8 | TEST: 9 | EVAL_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NAME: "EnsembleCLIPRes5ROIHeads" # 5 | ROI_BOX_HEAD: 6 | NAME: "CLIP_BOX_HEAD" # 
close-branch head 7 | # NUM_FC: 2 8 | POOLER_RESOLUTION: 14 9 | CLS_AGNOSTIC_BBOX_REG: True 10 | OVD: 11 | WITH_PSEUDO_LABELS: True 12 | # 13 | USE_ADAPTIVE_THRES: True 14 | PL_NMS_THRES: 0.5 15 | PL_THRESHOLD: 0.925 16 | MIN_AVG_PLS: 2.0 17 | MAX_AVG_PLS: 4.0 18 | ADAPTIVE_THRES_DELTA: 0.005 19 | RPN_FUSION_METHOD: "avg_logits" 20 | CATEGORY_INFO: None # if None, assume novel cat ids >= len(base_categories) 21 | # periodic update 22 | USE_PERIODIC_UPDATE: True 23 | PERIODIC_STEPS: (120000, 160000) 24 | # box reg, cls loss 25 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 26 | USE_CONFIDENCE_WEIGHT: False # False for LVIS 27 | ENSEMBLE: 28 | ALPHA: 0.33 29 | BETA: 0.67 30 | # TEST_CATEGORY_INFO: "datasets/lvis_ovd_continue_cat_ids.json" 31 | SOLVER: 32 | CHECKPOINT_PERIOD: 20000 33 | TEST: 34 | EVAL_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Base-RCNN-C4.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | ROI_HEADS: 7 | NAME: "Res5ROIHeads" 8 | DATASETS: 9 | TRAIN: ("coco_2017_train",) 10 | TEST: ("coco_2017_val",) 11 | SOLVER: 12 | IMS_PER_BATCH: 16 13 | BASE_LR: 0.02 14 | STEPS: (60000, 80000) 15 | MAX_ITER: 90000 16 | INPUT: 17 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 18 | VERSION: 2 19 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Base-RCNN-DilatedC5.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RESNETS: 4 | OUT_FEATURES: ["res5"] 5 | RES5_DILATION: 2 6 | RPN: 7 | IN_FEATURES: ["res5"] 8 | PRE_NMS_TOPK_TEST: 6000 9 | POST_NMS_TOPK_TEST: 1000 10 | ROI_HEADS: 11 | NAME: "StandardROIHeads" 12 | IN_FEATURES: ["res5"] 13 | ROI_BOX_HEAD: 14 | NAME: "FastRCNNConvFCHead" 15 | NUM_FC: 2 16 | POOLER_RESOLUTION: 7 17 | ROI_MASK_HEAD: 18 | NAME: "MaskRCNNConvUpsampleHead" 19 | NUM_CONV: 4 20 | POOLER_RESOLUTION: 14 21 | DATASETS: 22 | TRAIN: ("coco_2017_train",) 23 | TEST: ("coco_2017_val",) 24 | SOLVER: 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.02 27 | STEPS: (60000, 80000) 28 | MAX_ITER: 90000 29 | INPUT: 30 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 31 | VERSION: 2 32 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Base-RCNN-FPN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | BACKBONE: 4 | NAME: "build_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 7 | FPN: 8 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 9 | ANCHOR_GENERATOR: 10 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 11 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 12 | RPN: 13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 14 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 15 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 16 | # Detectron1 uses 2000 proposals per-batch, 17 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 18 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 
19 | POST_NMS_TOPK_TRAIN: 1000 20 | POST_NMS_TOPK_TEST: 1000 21 | ROI_HEADS: 22 | NAME: "StandardROIHeads" 23 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 24 | ROI_BOX_HEAD: 25 | NAME: "FastRCNNConvFCHead" 26 | NUM_FC: 2 27 | POOLER_RESOLUTION: 7 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 4 31 | POOLER_RESOLUTION: 14 32 | DATASETS: 33 | TRAIN: ("coco_2017_train",) 34 | TEST: ("coco_2017_val",) 35 | SOLVER: 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 38 | STEPS: (60000, 80000) 39 | MAX_ITER: 90000 40 | INPUT: 41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 42 | VERSION: 2 43 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Base-RetinaNet.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "RetinaNet" 3 | BACKBONE: 4 | NAME: "build_retinanet_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res3", "res4", "res5"] 7 | ANCHOR_GENERATOR: 8 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"] 9 | FPN: 10 | IN_FEATURES: ["res3", "res4", "res5"] 11 | RETINANET: 12 | IOU_THRESHOLDS: [0.4, 0.5] 13 | IOU_LABELS: [0, -1, 1] 14 | SMOOTH_L1_LOSS_BETA: 0.0 15 | DATASETS: 16 | TRAIN: ("coco_2017_train",) 17 | TEST: ("coco_2017_val",) 18 | SOLVER: 19 | IMS_PER_BATCH: 16 20 | BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate 21 | STEPS: (60000, 80000) 22 | MAX_ITER: 90000 23 | INPUT: 24 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 25 | VERSION: 2 26 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | LOAD_PROPOSALS: True 6 | RESNETS: 7 | DEPTH: 50 8 | PROPOSAL_GENERATOR: 9 | NAME: "PrecomputedProposals" 10 | DATASETS: 11 | TRAIN: ("coco_2017_train",) 12 | PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", ) 13 | TEST: ("coco_2017_val",) 14 | PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", ) 15 | DATALOADER: 16 | # proposals are part of the dataset_dicts, and take a lot of RAM 17 | NUM_WORKERS: 2 18 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- 
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 5 | PIXEL_STD: [57.375, 57.120, 58.395] 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | SOLVER: 12 | STEPS: (210000, 250000) 13 | MAX_ITER: 270000 14 | 
-------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/retinanet_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RetinaNet.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | SOLVER: 7 | STEPS: (210000, 250000) 8 | MAX_ITER: 270000 9 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/retinanet_R_50_FPN_1x.py: -------------------------------------------------------------------------------- 1 | from ..common.optim import SGD as optimizer 2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 3 | from ..common.data.coco import dataloader 4 | from ..common.models.retinanet import model 5 | from ..common.train import train 6 | 7 | dataloader.train.mapper.use_instance_mask = False 8 | model.backbone.bottom_up.freeze_at = 2 9 | optimizer.lr = 0.01 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/retinanet_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RetinaNet.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/retinanet_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RetinaNet.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | SOLVER: 7 | STEPS: (210000, 250000) 8 | MAX_ITER: 270000 9 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/rpn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "ProposalNetwork" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | MASK_ON: False 6 | RESNETS: 7 | DEPTH: 50 8 | RPN: 9 | PRE_NMS_TOPK_TEST: 12000 10 | POST_NMS_TOPK_TEST: 2000 11 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/rpn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "ProposalNetwork" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | MASK_ON: False 6 | RESNETS: 7 | DEPTH: 50 8 | RPN: 9 | POST_NMS_TOPK_TEST: 2000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" 8 | MASK_ON: False 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | RPN: 16 | HEAD_NAME: StandardRPNHead 17 | IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "CLIPRes5ROIHeads" 20 | IN_FEATURES: ["res4"] 21 | NUM_CLASSES: 48 # base 
categories 22 | SCORE_THRESH_TEST: 0.001 23 | ROI_BOX_HEAD: 24 | NAME: "" 25 | NUM_FC: 0 26 | POOLER_RESOLUTION: 14 27 | CLS_AGNOSTIC_BBOX_REG: True 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 0 31 | POOLER_RESOLUTION: 14 32 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 33 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 34 | CLIP: 35 | CROP_REGION_TYPE: "RPN" 36 | USE_TEXT_EMB_CLASSIFIER: True 37 | CLSS_TEMP: 0.01 38 | NO_BOX_DELTA: False 39 | BG_CLS_LOSS_WEIGHT: 0.2 40 | FOCAL_SCALED_LOSS: 0.5 41 | INPUT: 42 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 43 | DATASETS: 44 | TRAIN: ("coco_2017_ovd_b_train",) 45 | TEST: ("coco_2017_ovd_all_test",) 46 | TEST: 47 | EVAL_PERIOD: 25000 48 | SOLVER: 49 | IMS_PER_BATCH: 16 50 | BASE_LR: 0.002 51 | STEPS: (60000, 80000) 52 | MAX_ITER: 90000 53 | WARMUP_ITERS: 5000 54 | CHECKPOINT_PERIOD: 10000 55 | INPUT: 56 | MIN_SIZE_TRAIN_SAMPLING: choice 57 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 58 | MAX_SIZE_TRAIN: 1333 59 | MIN_SIZE_TEST: 800 60 | MAX_SIZE_TEST: 1333 61 | FORMAT: "RGB" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_coco80.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 80 5 | DATASETS: 6 | TRAIN: ("coco_2017_train",) 7 | TEST: ("coco_2017_val",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_testb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | DATASETS: 3 | TEST: ("coco_2017_ovd_b_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_testt.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | DATASETS: 3 | TEST: ("coco_2017_ovd_t_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 65 5 | NMS_THRESH_TEST: 0.5 6 | CLIP: 7 | NO_BOX_DELTA: True # no box refinement 8 | OFFLINE_RPN_NMS_THRESH: 0.7 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf_clipWeights.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_clip_resnet_backbone_from_pretrain" 5 | ROI_HEADS: 6 | NUM_CLASSES: 65 7 | NMS_THRESH_TEST: 0.5 8 | CLIP: 9 | NO_BOX_DELTA: True # no box refinement 10 | OFFLINE_RPN_NMS_THRESH: 0.9 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | 
NUM_CLASSES: 65 # base + novel categories 5 | OVD: 6 | WITH_PSEUDO_LABELS: True 7 | USE_ADAPTIVE_THRES: True 8 | PL_THRESHOLD: 0.8 # init pl threshold 9 | PL_NMS_THRES: 0.5 10 | RPN_FUSION_METHOD: "avg_norm_scores" 11 | USE_PERIODIC_UPDATE: True 12 | BOX_CONFIDENCE_THRES: 1.0 # only use pseudo boxes with confidence > BOX_CONFIDENCE_THRES. 1.0 means no pseudo boxes 13 | USE_CONFIDENCE_WEIGHT: True 14 | DATASETS: 15 | TRAIN: ("coco_2017_ovd_b_train_65cats",) 16 | TEST: ("coco_2017_ovd_all_test",) 17 | DATALOADER: 18 | FILTER_EMPTY_ANNOTATIONS: False # empty images may contain novel categories 19 | SOLVER: 20 | CHECKPOINT_PERIOD: 10000 21 | TEST: 22 | EVAL_PERIOD: 5000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_PLs_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml" 2 | SOLVER: 3 | STEPS: (210000, 250000) 4 | MAX_ITER: 270000 5 | CHECKPOINT_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_vldet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | CLIP: 4 | CROP_REGION_TYPE: "RPN" 5 | # OFFLINE_RPN_NMS_THRESH: 0.3 # will change offline_cfg.MODEL.RPN.NMS_THRESH, will affect the eval performance 6 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST 7 | PRETRAIN_SAMPLE_REGIONS: 32 # num_regions_per_img, topk in box selection 8 | # for inference 9 | NO_BOX_DELTA: False # check 10 | USE_TEXT_EMB_CLASSIFIER: True 11 | MULTIPLY_RPN_SCORE: False # check 12 | WEAK_LOSS: 13 | WEAK_LOSS_WEIGHT: 0.01 14 | BOX_SELECT_THRES: 0.97 # threshold in box selection 15 | NEG_CONCEPT_NUM: 10 16 | DATASETS: 17 | TRAIN: ("coco_2017_ovd_b_train", "coco_caption_nouns_train_4764tags",) # coco_2017_ovd_b_train with 48 cats 18 | TEST: ("coco_generalized_del_val",) 19 | INPUT: 20 | CUSTOM_AUG: ResizeShortestEdge 21 | MIN_SIZE_TRAIN_SAMPLING: range 22 | MIN_SIZE_TRAIN: (800, 800) 23 | DATALOADER: 24 | SAMPLER_TRAIN: "MultiDatasetSampler" 25 | DATASET_RATIO: [1, 4] 26 | USE_DIFF_BS_SIZE: True 27 | DATASET_BS: [2, 8] 28 | USE_RFS: [False, False] 29 | DATASET_MIN_SIZES: [[800, 800], [400, 400]] 30 | DATASET_MAX_SIZES: [1333, 667] 31 | FILTER_EMPTY_ANNOTATIONS: False 32 | MULTI_DATASET_GROUPING: True 33 | DATASET_ANN: ['box', 'caption'] 34 | NUM_WORKERS: 8 35 | TEST: 36 | EVAL_PERIOD: 10000 37 | FIND_UNUSED_PARAM: True 38 | WITH_IMAGE_LABELS: True 39 | OUTPUT_DIR: output/test -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_2x_PLs_per4k_clsBoxConf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml" 2 | MODEL: 3 | OVD: 4 | WITH_PSEUDO_LABELS: True 5 | USE_ADAPTIVE_THRES: True 6 | PL_THRESHOLD: 0.9 7 | MIN_AVG_PLS: 1.0 8 | MAX_AVG_PLS: 3.0 9 | PL_NMS_THRES: 0.5 10 | RPN_FUSION_METHOD: "avg_norm_scores" 11 | CATEGORY_INFO: "datasets/coco_ovd_continue_cat_ids.json" 12 | # periodic update 13 | USE_PERIODIC_UPDATE: True 14 | PERIODIC_STEPS: (40000, 80000, 120000, 160000) 15 | # box reg 16 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 17 | SOLVER: 18 | STEPS: 
(120000, 160000) 19 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 20 | CHECKPOINT_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_fCLIP_PLs_clsBoxConf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "MyCLIPFastRCNN" 4 | ROI_HEADS: 5 | NUM_CLASSES: 65 # base + novel categories 6 | OVD: 7 | WITH_PSEUDO_LABELS: True 8 | USE_ADAPTIVE_THRES: True 9 | PL_THRESHOLD: 0.9 10 | MIN_AVG_PLS: 1.0 11 | MAX_AVG_PLS: 3.0 12 | PL_NMS_THRES: 0.5 13 | RPN_FUSION_METHOD: "avg_norm_scores" 14 | CATEGORY_INFO: "datasets/coco_ovd_continue_cat_ids.json" 15 | # periodic update 16 | USE_PERIODIC_UPDATE: True 17 | PERIODIC_STEPS: (40000, 60000, 80000) 18 | # box reg 19 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 20 | DATASETS: 21 | TRAIN: ("coco_2017_ovd_b_train_65cats",) 22 | TEST: ("coco_2017_ovd_all_test",) 23 | SOLVER: 24 | CHECKPOINT_PERIOD: 10000 25 | TEST: 26 | EVAL_PERIOD: 5000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_fCLIP_offline_PLs.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "MyCLIPFastRCNN" 4 | ROI_HEADS: 5 | NUM_CLASSES: 65 # base + novel categories 6 | OVD: 7 | WITH_PSEUDO_LABELS: False # no online PLs 8 | # box reg 9 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 10 | DATASETS: 11 | TRAIN: ("",) 12 | TEST: ("coco_2017_ovd_all_test",) 13 | SOLVER: 14 | CHECKPOINT_PERIOD: 10000 15 | TEST: 16 | EVAL_PERIOD: 5000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_frozen_CLIP_RPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "MyCLIPFastRCNN" 4 | # IGNORE_CLS_LOSS: True 5 | CLIP: 6 | FREEZE_BACKBONE: True 7 | SOLVER: 8 | IMS_PER_BATCH: 16 9 | BASE_LR: 0.002 10 | STEPS: (60000, 80000) 11 | MAX_ITER: 90000 12 | WARMUP_ITERS: 5000 13 | CHECKPOINT_PERIOD: 10000 14 | TEST: 15 | EVAL_PERIOD: 5000 16 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_CLIP_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "GeneralizedRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" #"build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | RPN: 16 | HEAD_NAME: StandardRPNHead 17 | IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "CLIPRes5ROIHeads" # "Res5ROIHeads" # "StandardROIHeads" 20 | IN_FEATURES: ["res4"] 21 | ROI_BOX_HEAD: 22 | NAME: "" 23 | NUM_FC: 0 24 | POOLER_RESOLUTION: 14 25 | ROI_MASK_HEAD: 26 | NAME: "MaskRCNNConvUpsampleHead" 27 | NUM_CONV: 0 28 | POOLER_RESOLUTION: 14 29 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 
116.280, 123.675] # 30 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] # 31 | INPUT: 32 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 33 | TEST: 34 | EVAL_PERIOD: 50000 35 | SOLVER: 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 38 | STEPS: (60000, 80000) 39 | MAX_ITER: 90000 40 | INPUT: 41 | MIN_SIZE_TRAIN_SAMPLING: choice 42 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 43 | MAX_SIZE_TRAIN: 1333 44 | MIN_SIZE_TEST: 800 45 | MAX_SIZE_TEST: 1333 46 | FORMAT: "RGB" # "BGR" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_CLIP_R_50_C4_1x_ovd_FSD.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "GeneralizedRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" #"build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | RPN: 16 | HEAD_NAME: StandardRPNHead 17 | IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "CLIPRes5ROIHeads" # "Res5ROIHeads" # "StandardROIHeads" 20 | IN_FEATURES: ["res4"] 21 | NUM_CLASSES: 48 22 | ROI_BOX_HEAD: 23 | NAME: "" 24 | NUM_FC: 0 25 | POOLER_RESOLUTION: 14 26 | ROI_MASK_HEAD: 27 | NAME: "MaskRCNNConvUpsampleHead" 28 | NUM_CONV: 0 29 | POOLER_RESOLUTION: 14 30 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 116.280, 123.675] # 31 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] # 32 | INPUT: 33 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 34 | DATASETS: 35 | TRAIN: ("coco_2017_ovd_b_train",) 36 | TEST: ("coco_2017_ovd_b_test",) 37 | TEST: 38 | EVAL_PERIOD: 50000 39 | SOLVER: 40 | IMS_PER_BATCH: 16 41 | BASE_LR: 0.02 42 | STEPS: (60000, 80000) 43 | MAX_ITER: 90000 44 | INPUT: 45 | MIN_SIZE_TRAIN_SAMPLING: choice 46 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 47 | MAX_SIZE_TRAIN: 1333 48 | MIN_SIZE_TEST: 800 49 | MAX_SIZE_TEST: 1333 50 | FORMAT: "RGB" # "BGR" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 
101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.py: -------------------------------------------------------------------------------- 1 | from ..common.train import train 2 | from ..common.optim import SGD as optimizer 3 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 4 | from ..common.data.coco import dataloader 5 | from ..common.models.mask_rcnn_c4 import model 6 | 7 | model.backbone.freeze_at = 2 8 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 48 9 | DATASETS: 10 | TRAIN: ("coco_2017_ovd_b_train",) 11 | TEST: ("coco_2017_ovd_all_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_coco65.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 65 9 | DATASETS: 10 | TRAIN: ("coco_2017_ovd_all_train",) 11 | TEST: ("coco_2017_ovd_all_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- 
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.py: -------------------------------------------------------------------------------- 1 | from ..common.optim import SGD as optimizer 2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 3 | from ..common.data.coco import dataloader 4 | from ..common.models.mask_rcnn_fpn import model 5 | from ..common.train import train 6 | 7 | model.backbone.bottom_up.freeze_at = 2 8 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x_giou.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | RPN: 8 | BBOX_REG_LOSS_TYPE: "giou" 9 | BBOX_REG_LOSS_WEIGHT: 2.0 10 | ROI_BOX_HEAD: 11 | BBOX_REG_LOSS_TYPE: "giou" 12 | BBOX_REG_LOSS_WEIGHT: 10.0 13 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x_ovd_FSD.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 48 9 | DATASETS: 10 | TRAIN: ("coco_2017_ovd_b_train",) 11 | TEST: ("coco_2017_ovd_b_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x_ovd_coco65.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 65 9 | DATASETS: 10 | TRAIN: ("coco_2017_ovd_all_train",) 11 | TEST: ("coco_2017_ovd_all_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | MASK_ON: True 4 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 5 | PIXEL_STD: [57.375, 57.120, 58.395] 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | SOLVER: 12 | STEPS: (210000, 250000) 13 | 
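# [Annotation added for this document; not part of the original file.]
# This is the standard "3x" schedule: MAX_ITER below is 270000 = 3 x 90000
# iterations, where 90000 iterations at 16 images per batch is the "1x"
# schedule (~12 COCO epochs; see common/coco_schedule.py further below).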
MAX_ITER: 270000 14 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_regnetx_4gf_dds_fpn_1x.py: -------------------------------------------------------------------------------- 1 | from ..common.optim import SGD as optimizer 2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 3 | from ..common.data.coco import dataloader 4 | from ..common.models.mask_rcnn_fpn import model 5 | from ..common.train import train 6 | 7 | from detectron2.config import LazyCall as L 8 | from detectron2.modeling.backbone import RegNet 9 | from detectron2.modeling.backbone.regnet import SimpleStem, ResBottleneckBlock 10 | 11 | 12 | # Replace default ResNet with RegNetX-4GF from the DDS paper. Config source: 13 | # https://github.com/facebookresearch/pycls/blob/2c152a6e5d913e898cca4f0a758f41e6b976714d/configs/dds_baselines/regnetx/RegNetX-4.0GF_dds_8gpu.yaml#L4-L9 # noqa 14 | model.backbone.bottom_up = L(RegNet)( 15 | stem_class=SimpleStem, 16 | stem_width=32, 17 | block_class=ResBottleneckBlock, 18 | depth=23, 19 | w_a=38.65, 20 | w_0=96, 21 | w_m=2.43, 22 | group_width=40, 23 | freeze_at=2, 24 | norm="FrozenBN", 25 | out_features=["s1", "s2", "s3", "s4"], 26 | ) 27 | model.pixel_std = [57.375, 57.120, 58.395] 28 | 29 | optimizer.weight_decay = 5e-5 30 | train.init_checkpoint = ( 31 | "https://dl.fbaipublicfiles.com/pycls/dds_baselines/160906383/RegNetX-4.0GF_dds_8gpu.pyth" 32 | ) 33 | # RegNets benefit from enabling cudnn benchmark mode 34 | train.cudnn_benchmark = True 35 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_regnety_4gf_dds_fpn_1x.py: -------------------------------------------------------------------------------- 1 | from ..common.optim import SGD as optimizer 2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 3 | from ..common.data.coco import dataloader 4 | from ..common.models.mask_rcnn_fpn import model 5 | from ..common.train import train 6 | 7 | from detectron2.config import LazyCall as L 8 | from detectron2.modeling.backbone import RegNet 9 | from detectron2.modeling.backbone.regnet import SimpleStem, ResBottleneckBlock 10 | 11 | 12 | # Replace default ResNet with RegNetY-4GF from the DDS paper. 
Config source: 13 | # https://github.com/facebookresearch/pycls/blob/2c152a6e5d913e898cca4f0a758f41e6b976714d/configs/dds_baselines/regnety/RegNetY-4.0GF_dds_8gpu.yaml#L4-L10 # noqa 14 | model.backbone.bottom_up = L(RegNet)( 15 | stem_class=SimpleStem, 16 | stem_width=32, 17 | block_class=ResBottleneckBlock, 18 | depth=22, 19 | w_a=31.41, 20 | w_0=96, 21 | w_m=2.24, 22 | group_width=64, 23 | se_ratio=0.25, 24 | freeze_at=2, 25 | norm="FrozenBN", 26 | out_features=["s1", "s2", "s3", "s4"], 27 | ) 28 | model.pixel_std = [57.375, 57.120, 58.395] 29 | 30 | optimizer.weight_decay = 5e-5 31 | train.init_checkpoint = ( 32 | "https://dl.fbaipublicfiles.com/pycls/dds_baselines/160906838/RegNetY-4.0GF_dds_8gpu.pyth" 33 | ) 34 | # RegNets benefit from enabling cudnn benchmark mode 35 | train.cudnn_benchmark = True 36 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv0.5-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 1230 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v0.5_train",) 14 | TEST: ("lvis_v0.5_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | DATALOADER: 18 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 19 | REPEAT_THRESHOLD: 0.001 20 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv0.5-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 1230 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v0.5_train",) 14 | TEST: ("lvis_v0.5_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | DATALOADER: 18 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 19 | REPEAT_THRESHOLD: 0.001 20 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 4 | PIXEL_STD: [57.375, 57.120, 58.395] 5 | MASK_ON: True 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | ROI_HEADS: 12 | NUM_CLASSES: 1230 13 | SCORE_THRESH_TEST: 0.0001 14 | INPUT: 15 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 16 | DATASETS: 17 | TRAIN: ("lvis_v0.5_train",) 18 | TEST: ("lvis_v0.5_val",) 19 | TEST: 20 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 21 | DATALOADER: 22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 23 | REPEAT_THRESHOLD: 0.001 24 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4.yaml: -------------------------------------------------------------------------------- 1 | 
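# [Annotation added for this document; not part of the original file.]
# RegionCLIP-style open-vocabulary Fast R-CNN baseline on LVIS v1: a CLIP
# ResNet-50 backbone, CLIPRes5ROIHeads with CLIP text embeddings as the
# classifier (USE_TEXT_EMB_CLASSIFIER: True, temperature CLSS_TEMP: 0.01),
# and 866 base categories out of the full 1203 LVIS classes. The customized
# LVIS configs further below inherit from this file via _BASE_.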
_BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | RPN: 16 | HEAD_NAME: StandardRPNHead 17 | IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "CLIPRes5ROIHeads" 20 | IN_FEATURES: ["res4"] 21 | NUM_CLASSES: 866 # 1203 22 | SCORE_THRESH_TEST: 0.02 23 | ROI_BOX_HEAD: 24 | NAME: "" 25 | NUM_FC: 0 26 | POOLER_RESOLUTION: 14 27 | CLS_AGNOSTIC_BBOX_REG: True 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 0 31 | POOLER_RESOLUTION: 14 32 | CLS_AGNOSTIC_MASK: True 33 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 34 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 35 | CLIP: 36 | CROP_REGION_TYPE: "RPN" 37 | USE_TEXT_EMB_CLASSIFIER: True 38 | CLSS_TEMP: 0.01 39 | NO_BOX_DELTA: False 40 | BG_CLS_LOSS_WEIGHT: 0.8 41 | MULTIPLY_RPN_SCORE: True 42 | INPUT: 43 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 44 | DATASETS: 45 | TRAIN: ("lvis_v1_train",) 46 | TEST: ("lvis_v1_val",) 47 | TEST: 48 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 49 | EVAL_PERIOD: 25000 50 | SOLVER: 51 | IMS_PER_BATCH: 16 52 | BASE_LR: 0.002 53 | STEPS: (120000, 160000) 54 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 55 | WARMUP_ITERS: 5000 56 | DATALOADER: 57 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 58 | REPEAT_THRESHOLD: 0.001 59 | INPUT: 60 | MIN_SIZE_TRAIN_SAMPLING: choice 61 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 62 | MAX_SIZE_TRAIN: 1333 63 | MIN_SIZE_TEST: 800 64 | MAX_SIZE_TEST: 1333 65 | FORMAT: "RGB" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_custom_img.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | ROI_HEADS: 5 | NUM_CLASSES: 1203 6 | NMS_THRESH_TEST: 0.3 7 | CLIP: 8 | NO_BOX_DELTA: True 9 | OFFLINE_RPN_NMS_THRESH: 0.9 10 | VIS: True # Note: visualize the scores before multiplying RPN scores, if any 11 | DATASETS: 12 | TRAIN: ("lvis_v1_train_custom_img",) 13 | TEST: ("lvis_v1_val_custom_img",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | ROI_HEADS: 5 | NUM_CLASSES: 1203 6 | NMS_THRESH_TEST: 0.5 7 | CLIP: 8 | NO_BOX_DELTA: True 9 | OFFLINE_RPN_NMS_THRESH: 0.9 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf_clipWeights.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone_from_pretrain" 6 | ROI_HEADS: 7 | NUM_CLASSES: 1203 8 | NMS_THRESH_TEST: 0.5 9 | CLIP: 10 | NO_BOX_DELTA: True 11 | OFFLINE_RPN_NMS_THRESH: 0.9 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/customized/ovd_lvis_box_PLs_periodic_boxConf.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 1203 # base + novel categories 5 | OVD: 6 | WITH_PSEUDO_LABELS: True 7 | # 8 | USE_ADAPTIVE_THRES: True 9 | PL_NMS_THRES: 0.5 10 | PL_THRESHOLD: 0.925 11 | MIN_AVG_PLS: 2.0 12 | MAX_AVG_PLS: 3.0 13 | ADAPTIVE_THRES_DELTA: 0.005 14 | RPN_FUSION_METHOD: "avg_logits" 15 | # CATEGORY_INFO: "datasets/lvis_ovd_continue_cat_ids.json" 16 | # periodic update 17 | USE_PERIODIC_UPDATE: True 18 | PERIODIC_STEPS: (120000, 160000) 19 | # box reg 20 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 21 | DATASETS: 22 | TRAIN: ("lvis_v1_train_base_1203cats",) 23 | TEST: ("lvis_v1_val",) 24 | SOLVER: 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.002 27 | STEPS: (120000, 160000) 28 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 29 | WARMUP_ITERS: 5000 30 | CHECKPOINT_PERIOD: 20000 31 | TEST: 32 | EVAL_PERIOD: 20000 33 | OUTPUT_DIR: output/ovd_lvis_ft_PLs_per4kUpdate_boxConf 34 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/customized/ovd_lvis_fCLIP_PLs_clsBoxConf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "MyCLIPFastRCNN" 4 | ROI_HEADS: 5 | NUM_CLASSES: 1203 # base + novel categories 6 | OVD: 7 | WITH_PSEUDO_LABELS: True 8 | # 9 | USE_ADAPTIVE_THRES: True 10 | PL_NMS_THRES: 0.5 11 | PL_THRESHOLD: 0.925 12 | MIN_AVG_PLS: 1.0 13 | MAX_AVG_PLS: 3.0 14 | ADAPTIVE_THRES_DELTA: 0.005 15 | RPN_FUSION_METHOD: "avg_logits" 16 | CATEGORY_INFO: "datasets/lvis_ovd_continue_cat_ids.json" 17 | # periodic update 18 | USE_PERIODIC_UPDATE: True 19 | PERIODIC_STEPS: (40000, 80000, 120000, 160000) 20 | # box reg 21 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 22 | DATASETS: 23 | TRAIN: ("lvis_v1_train_base_1203cats",) 24 | TEST: ("lvis_v1_val",) 25 | SOLVER: 26 | IMS_PER_BATCH: 16 27 | BASE_LR: 0.002 28 | STEPS: (120000, 160000) 29 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 30 | WARMUP_ITERS: 5000 31 | CHECKPOINT_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/customized/ovd_lvis_frozen_CLIP_RPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "MyCLIPFastRCNN" 4 | # IGNORE_CLS_LOSS: True 5 | CLIP: 6 | FREEZE_BACKBONE: True 7 | SOLVER: 8 | IMS_PER_BATCH: 16 9 | BASE_LR: 0.002 10 | STEPS: (60000, 80000) 11 | MAX_ITER: 90000 12 | WARMUP_ITERS: 5000 13 | CHECKPOINT_PERIOD: 10000 14 | TEST: 15 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 16 | EVAL_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_CLIP_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "GeneralizedRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" #"build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 
64 14 | RES2_OUT_CHANNELS: 256 15 | RPN: 16 | HEAD_NAME: StandardRPNHead 17 | IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "CLIPRes5ROIHeads" # "Res5ROIHeads" # "StandardROIHeads" 20 | IN_FEATURES: ["res4"] 21 | NUM_CLASSES: 1203 22 | SCORE_THRESH_TEST: 0.0001 23 | ROI_BOX_HEAD: 24 | NAME: "" 25 | NUM_FC: 0 26 | POOLER_RESOLUTION: 14 27 | ROI_MASK_HEAD: 28 | NAME: "MaskRCNNConvUpsampleHead" 29 | NUM_CONV: 0 30 | POOLER_RESOLUTION: 14 31 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 116.280, 123.675] # 32 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] # 33 | INPUT: 34 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 35 | DATASETS: 36 | TRAIN: ("lvis_v1_train",) 37 | TEST: ("lvis_v1_val",) 38 | TEST: 39 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 40 | EVAL_PERIOD: 25000 41 | SOLVER: 42 | IMS_PER_BATCH: 16 43 | BASE_LR: 0.02 44 | STEPS: (120000, 160000) # (140000,) # 45 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 46 | DATALOADER: 47 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 48 | REPEAT_THRESHOLD: 0.001 49 | INPUT: 50 | MIN_SIZE_TRAIN_SAMPLING: choice 51 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 52 | MAX_SIZE_TRAIN: 1333 53 | MIN_SIZE_TEST: 800 54 | MAX_SIZE_TEST: 1333 55 | FORMAT: "RGB" # "BGR" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_CLIP_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "GeneralizedRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | FPN: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | OUT_CHANNELS: 256 18 | FUSE_TYPE: sum 19 | RPN: 20 | HEAD_NAME: StandardRPNHead 21 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 22 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 23 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 24 | # Detectron1 uses 2000 proposals per-batch, 25 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 26 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 
27 | POST_NMS_TOPK_TRAIN: 1000 28 | POST_NMS_TOPK_TEST: 1000 29 | ROI_HEADS: 30 | NAME: "StandardROIHeads" 31 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 32 | NUM_CLASSES: 1203 33 | SCORE_THRESH_TEST: 0.0001 34 | ROI_BOX_HEAD: 35 | NAME: "FastRCNNConvFCHead" 36 | NUM_FC: 2 37 | POOLER_RESOLUTION: 7 38 | ROI_MASK_HEAD: 39 | NAME: "MaskRCNNConvUpsampleHead" 40 | NUM_CONV: 4 41 | POOLER_RESOLUTION: 14 42 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 116.280, 123.675] # 43 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] # 44 | INPUT: 45 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 46 | DATASETS: 47 | TRAIN: ("lvis_v1_train",) 48 | TEST: ("lvis_v1_val",) 49 | TEST: 50 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 51 | EVAL_PERIOD: 50000 52 | SOLVER: 53 | IMS_PER_BATCH: 16 54 | BASE_LR: 0.02 55 | STEPS: (120000, 160000) # (140000,) # 56 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 57 | DATALOADER: 58 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 59 | REPEAT_THRESHOLD: 0.001 60 | INPUT: 61 | MIN_SIZE_TRAIN_SAMPLING: choice 62 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 63 | MAX_SIZE_TRAIN: 1333 64 | MIN_SIZE_TEST: 800 65 | MAX_SIZE_TEST: 1333 66 | FORMAT: "RGB" # "BGR" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 1203 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v1_train",) 14 | TEST: ("lvis_v1_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | SOLVER: 18 | STEPS: (120000, 160000) 19 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 20 | DATALOADER: 21 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 22 | REPEAT_THRESHOLD: 0.001 23 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 1203 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v1_train",) 14 | TEST: ("lvis_v1_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | EVAL_PERIOD: 50000 18 | SOLVER: 19 | STEPS: (120000, 160000) 20 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 21 | DATALOADER: 22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 23 | REPEAT_THRESHOLD: 0.001 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 1203 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | 
TRAIN: ("lvis_v1_train",) 14 | TEST: ("lvis_v1_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | EVAL_PERIOD: 50000 18 | SOLVER: 19 | STEPS: (120000, 160000) 20 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 21 | DATALOADER: 22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 23 | REPEAT_THRESHOLD: 0.001 24 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_2x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 1203 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v1_train",) 14 | TEST: ("lvis_v1_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | EVAL_PERIOD: 50000 18 | SOLVER: 19 | STEPS: (240000, 320000) #(120000, 160000) 20 | MAX_ITER: 360000 # 180000 * 16 / 100000 ~ 28.8 epochs 21 | DATALOADER: 22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 23 | REPEAT_THRESHOLD: 0.001 24 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 4 | PIXEL_STD: [57.375, 57.120, 58.395] 5 | MASK_ON: True 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | ROI_HEADS: 12 | NUM_CLASSES: 1203 13 | SCORE_THRESH_TEST: 0.0001 14 | INPUT: 15 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 16 | DATASETS: 17 | TRAIN: ("lvis_v1_train",) 18 | TEST: ("lvis_v1_val",) 19 | SOLVER: 20 | STEPS: (120000, 160000) 21 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 22 | TEST: 23 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 24 | DATALOADER: 25 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 26 | REPEAT_THRESHOLD: 0.001 27 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NAME: CascadeROIHeads 9 | ROI_BOX_HEAD: 10 | CLS_AGNOSTIC_BBOX_REG: True 11 | RPN: 12 | POST_NMS_TOPK_TRAIN: 2000 13 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NAME: CascadeROIHeads 9 | ROI_BOX_HEAD: 10 | CLS_AGNOSTIC_BBOX_REG: True 11 | RPN: 12 | POST_NMS_TOPK_TRAIN: 2000 13 | SOLVER: 14 | STEPS: (210000, 250000) 15 | MAX_ITER: 270000 16 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | MASK_ON: True 4 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k" 5 | RESNETS: 6 | STRIDE_IN_1X1: False # this is a C2 model 7 | NUM_GROUPS: 32 8 | WIDTH_PER_GROUP: 8 9 | DEPTH: 152 10 | DEFORM_ON_PER_STAGE: [False, True, True, True] 11 | ROI_HEADS: 12 | NAME: "CascadeROIHeads" 13 | ROI_BOX_HEAD: 14 | NAME: "FastRCNNConvFCHead" 15 | NUM_CONV: 4 16 | NUM_FC: 1 17 | NORM: "GN" 18 | CLS_AGNOSTIC_BBOX_REG: True 19 | ROI_MASK_HEAD: 20 | NUM_CONV: 8 21 | NORM: "GN" 22 | RPN: 23 | POST_NMS_TOPK_TRAIN: 2000 24 | SOLVER: 25 | IMS_PER_BATCH: 128 26 | STEPS: (35000, 45000) 27 | MAX_ITER: 50000 28 | BASE_LR: 0.16 29 | INPUT: 30 | MIN_SIZE_TRAIN: (640, 864) 31 | MIN_SIZE_TRAIN_SAMPLING: "range" 32 | MAX_SIZE_TRAIN: 1440 33 | CROP: 34 | ENABLED: True 35 | TEST: 36 | EVAL_PERIOD: 2500 37 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_BOX_HEAD: 8 | CLS_AGNOSTIC_BBOX_REG: True 9 | ROI_MASK_HEAD: 10 | CLS_AGNOSTIC_MASK: True 11 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5 8 | DEFORM_MODULATED: False 9 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5 8 | DEFORM_MODULATED: False 9 | SOLVER: 10 | STEPS: (210000, 250000) 11 | MAX_ITER: 270000 12 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-50-GN" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | NORM: "GN" 8 | STRIDE_IN_1X1: False 9 | FPN: 10 | NORM: "GN" 11 | ROI_BOX_HEAD: 12 | NAME: "FastRCNNConvFCHead" 13 | NUM_CONV: 4 14 | NUM_FC: 1 15 | NORM: "GN" 16 | ROI_MASK_HEAD: 17 | NORM: "GN" 18 | SOLVER: 19 | # 3x schedule 20 | STEPS: (210000, 250000) 21 | MAX_ITER: 270000 22 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | NORM: "SyncBN" 8 | STRIDE_IN_1X1: 
True 9 | FPN: 10 | NORM: "SyncBN" 11 | ROI_BOX_HEAD: 12 | NAME: "FastRCNNConvFCHead" 13 | NUM_CONV: 4 14 | NUM_FC: 1 15 | NORM: "SyncBN" 16 | ROI_MASK_HEAD: 17 | NORM: "SyncBN" 18 | SOLVER: 19 | # 3x schedule 20 | STEPS: (210000, 250000) 21 | MAX_ITER: 270000 22 | TEST: 23 | PRECISE_BN: 24 | ENABLED: True 25 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mmdet_mask_rcnn_R_50_FPN_1x.py: -------------------------------------------------------------------------------- 1 | # An example config to train a mmdetection model using detectron2. 2 | 3 | from ..common.data.coco import dataloader 4 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 5 | from ..common.optim import SGD as optimizer 6 | from ..common.train import train 7 | 8 | from detectron2.modeling.mmdet_wrapper import MMDetDetector 9 | from detectron2.config import LazyCall as L 10 | 11 | model = L(MMDetDetector)( 12 | detector=dict( 13 | type="MaskRCNN", 14 | pretrained="torchvision://resnet50", 15 | backbone=dict( 16 | type="ResNet", 17 | depth=50, 18 | num_stages=4, 19 | out_indices=(0, 1, 2, 3), 20 | frozen_stages=1, 21 | norm_cfg=dict(type="BN", requires_grad=True), 22 | norm_eval=True, 23 | style="pytorch", 24 | ), 25 | neck=dict(type="FPN", in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), 26 | rpn_head=dict( 27 | type="RPNHead", 28 | in_channels=256, 29 | feat_channels=256, 30 | anchor_generator=dict( 31 | type="AnchorGenerator", 32 | scales=[8], 33 | ratios=[0.5, 1.0, 2.0], 34 | strides=[4, 8, 16, 32, 64], 35 | ), 36 | bbox_coder=dict( 37 | type="DeltaXYWHBBoxCoder", 38 | target_means=[0.0, 0.0, 0.0, 0.0], 39 | target_stds=[1.0, 1.0, 1.0, 1.0], 40 | ), 41 | loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=1.0), 42 | loss_bbox=dict(type="L1Loss", loss_weight=1.0), 43 | ), 44 | roi_head=dict( 45 | type="StandardRoIHead", 46 | bbox_roi_extractor=dict( 47 | type="SingleRoIExtractor", 48 | roi_layer=dict(type="RoIAlign", output_size=7, sampling_ratio=0), 49 | out_channels=256, 50 | featmap_strides=[4, 8, 16, 32], 51 | ), 52 | bbox_head=dict( 53 | type="Shared2FCBBoxHead", 54 | in_channels=256, 55 | fc_out_channels=1024, 56 | roi_feat_size=7, 57 | num_classes=80, 58 | bbox_coder=dict( 59 | type="DeltaXYWHBBoxCoder", 60 | target_means=[0.0, 0.0, 0.0, 0.0], 61 | target_stds=[0.1, 0.1, 0.2, 0.2], 62 | ), 63 | reg_class_agnostic=False, 64 | loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=False, loss_weight=1.0), 65 | loss_bbox=dict(type="L1Loss", loss_weight=1.0), 66 | ), 67 | mask_roi_extractor=dict( 68 | type="SingleRoIExtractor", 69 | roi_layer=dict(type="RoIAlign", output_size=14, sampling_ratio=0), 70 | out_channels=256, 71 | featmap_strides=[4, 8, 16, 32], 72 | ), 73 | mask_head=dict( 74 | type="FCNMaskHead", 75 | num_convs=4, 76 | in_channels=256, 77 | conv_out_channels=256, 78 | num_classes=80, 79 | loss_mask=dict(type="CrossEntropyLoss", use_mask=True, loss_weight=1.0), 80 | ), 81 | ), 82 | # model training and testing settings 83 | train_cfg=dict( 84 | rpn=dict( 85 | assigner=dict( 86 | type="MaxIoUAssigner", 87 | pos_iou_thr=0.7, 88 | neg_iou_thr=0.3, 89 | min_pos_iou=0.3, 90 | match_low_quality=True, 91 | ignore_iof_thr=-1, 92 | ), 93 | sampler=dict( 94 | type="RandomSampler", 95 | num=256, 96 | pos_fraction=0.5, 97 | neg_pos_ub=-1, 98 | add_gt_as_proposals=False, 99 | ), 100 | allowed_border=-1, 101 | pos_weight=-1, 102 | debug=False, 103 | ), 104 | rpn_proposal=dict( 105 | nms_pre=2000, 106 | 
max_per_img=1000, 107 | nms=dict(type="nms", iou_threshold=0.7), 108 | min_bbox_size=0, 109 | ), 110 | rcnn=dict( 111 | assigner=dict( 112 | type="MaxIoUAssigner", 113 | pos_iou_thr=0.5, 114 | neg_iou_thr=0.5, 115 | min_pos_iou=0.5, 116 | match_low_quality=True, 117 | ignore_iof_thr=-1, 118 | ), 119 | sampler=dict( 120 | type="RandomSampler", 121 | num=512, 122 | pos_fraction=0.25, 123 | neg_pos_ub=-1, 124 | add_gt_as_proposals=True, 125 | ), 126 | mask_size=28, 127 | pos_weight=-1, 128 | debug=False, 129 | ), 130 | ), 131 | test_cfg=dict( 132 | rpn=dict( 133 | nms_pre=1000, 134 | max_per_img=1000, 135 | nms=dict(type="nms", iou_threshold=0.7), 136 | min_bbox_size=0, 137 | ), 138 | rcnn=dict( 139 | score_thr=0.05, 140 | nms=dict(type="nms", iou_threshold=0.5), 141 | max_per_img=100, 142 | mask_thr_binary=0.5, 143 | ), 144 | ), 145 | ), 146 | pixel_mean=[123.675, 116.280, 103.530], 147 | pixel_std=[58.395, 57.120, 57.375], 148 | ) 149 | 150 | dataloader.train.mapper.image_format = "RGB" # torchvision pretrained model 151 | train.init_checkpoint = None # pretrained model is loaded inside backbone 152 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml: -------------------------------------------------------------------------------- 1 | # A large PanopticFPN for demo purposes. 2 | # Use GN on backbone to support semantic seg. 3 | # Use Cascade + Deform Conv to improve localization. 4 | _BASE_: "../COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml" 5 | MODEL: 6 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-101-GN" 7 | RESNETS: 8 | DEPTH: 101 9 | NORM: "GN" 10 | DEFORM_ON_PER_STAGE: [False, True, True, True] 11 | STRIDE_IN_1X1: False 12 | FPN: 13 | NORM: "GN" 14 | ROI_HEADS: 15 | NAME: CascadeROIHeads 16 | ROI_BOX_HEAD: 17 | CLS_AGNOSTIC_BBOX_REG: True 18 | ROI_MASK_HEAD: 19 | NORM: "GN" 20 | RPN: 21 | POST_NMS_TOPK_TRAIN: 2000 22 | SOLVER: 23 | STEPS: (105000, 125000) 24 | MAX_ITER: 135000 25 | IMS_PER_BATCH: 32 26 | BASE_LR: 0.04 27 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml" 2 | MODEL: 3 | # Train from random initialization. 4 | WEIGHTS: "" 5 | # It makes sense to divide by STD when training from scratch 6 | # But it seems to make no difference on the results and C2's models didn't do this. 7 | # So we keep things consistent with C2. 8 | # PIXEL_STD: [57.375, 57.12, 58.395] 9 | MASK_ON: True 10 | BACKBONE: 11 | FREEZE_AT: 0 12 | # NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883 13 | # to learn what you need for training from scratch. 
14 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml" 2 | MODEL: 3 | PIXEL_STD: [57.375, 57.12, 58.395] 4 | WEIGHTS: "" 5 | MASK_ON: True 6 | RESNETS: 7 | STRIDE_IN_1X1: False 8 | BACKBONE: 9 | FREEZE_AT: 0 10 | SOLVER: 11 | # 9x schedule 12 | IMS_PER_BATCH: 64 # 4x the standard 13 | STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k 14 | MAX_ITER: 202500 # 90k * 9 / 4 15 | BASE_LR: 0.08 16 | TEST: 17 | EVAL_PERIOD: 2500 18 | # NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883 19 | # to learn what you need for training from scratch. 20 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "mask_rcnn_R_50_FPN_3x_syncbn.yaml" 2 | MODEL: 3 | PIXEL_STD: [57.375, 57.12, 58.395] 4 | WEIGHTS: "" 5 | MASK_ON: True 6 | RESNETS: 7 | STRIDE_IN_1X1: False 8 | BACKBONE: 9 | FREEZE_AT: 0 10 | SOLVER: 11 | # 9x schedule 12 | IMS_PER_BATCH: 64 # 4x the standard 13 | STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k 14 | MAX_ITER: 202500 # 90k * 9 / 4 15 | BASE_LR: 0.08 16 | TEST: 17 | EVAL_PERIOD: 2500 18 | # NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883 19 | # to learn what you need for training from scratch. 20 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/semantic_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | RESNETS: 6 | DEPTH: 50 7 | DATASETS: 8 | TRAIN: ("coco_2017_train_panoptic_stuffonly",) 9 | TEST: ("coco_2017_val_panoptic_stuffonly",) 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/torchvision_imagenet_R_50.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example config file to train a ImageNet classifier with detectron2. 3 | Model and dataloader both come from torchvision. 4 | This shows how to use detectron2 as a general engine for any new models and tasks. 
5 | To run, use the following command: 6 | 7 | python tools/lazyconfig_train_net.py --config-file configs/Misc/torchvision_imagenet_R_50.py \ 8 | --num-gpus 8 dataloader.train.dataset.root=/path/to/imagenet/ 9 | """ 10 | 11 | 12 | import torch 13 | from torch import nn 14 | from torch.nn import functional as F 15 | from omegaconf import OmegaConf 16 | import torchvision 17 | from torchvision.transforms import transforms as T 18 | from torchvision.models.resnet import ResNet, Bottleneck 19 | from fvcore.common.param_scheduler import MultiStepParamScheduler 20 | 21 | from detectron2.solver import WarmupParamScheduler 22 | from detectron2.solver.build import get_default_optimizer_params 23 | from detectron2.config import LazyCall as L 24 | from detectron2.model_zoo import get_config 25 | from detectron2.data.samplers import TrainingSampler, InferenceSampler 26 | from detectron2.evaluation import DatasetEvaluator 27 | from detectron2.utils import comm 28 | 29 | 30 | def build_data_loader(dataset, batch_size, num_workers, training=True): 31 | return torch.utils.data.DataLoader( 32 | dataset, 33 | sampler=(TrainingSampler if training else InferenceSampler)(len(dataset)), 34 | batch_size=batch_size, 35 | num_workers=num_workers, 36 | pin_memory=True, 37 | ) 38 | 39 | 40 | class ClassificationNet(nn.Module): 41 | def __init__(self, model: nn.Module): 42 | super().__init__() 43 | self.model = model 44 | 45 | @property 46 | def device(self): 47 | return list(self.model.parameters())[0].device 48 | 49 | def forward(self, inputs): 50 | image, label = inputs 51 | pred = self.model(image.to(self.device)) 52 | if self.training: 53 | label = label.to(self.device) 54 | return F.cross_entropy(pred, label) 55 | else: 56 | return pred 57 | 58 | 59 | class ClassificationAcc(DatasetEvaluator): 60 | def reset(self): 61 | self.corr = self.total = 0 62 | 63 | def process(self, inputs, outputs): 64 | image, label = inputs 65 | self.corr += (outputs.argmax(dim=1).cpu() == label.cpu()).sum().item() 66 | self.total += len(label) 67 | 68 | def evaluate(self): 69 | all_corr_total = comm.all_gather([self.corr, self.total]) 70 | corr = sum(x[0] for x in all_corr_total) 71 | total = sum(x[1] for x in all_corr_total) 72 | return {"accuracy": corr / total} 73 | 74 | 75 | dataloader = OmegaConf.create() 76 | dataloader.train = L(build_data_loader)( 77 | dataset=L(torchvision.datasets.ImageNet)( 78 | root="/path/to/imagenet", 79 | split="train", 80 | transform=L(T.Compose)( 81 | transforms=[ 82 | L(T.RandomResizedCrop)(size=224), 83 | L(T.RandomHorizontalFlip)(), 84 | T.ToTensor(), 85 | L(T.Normalize)(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), 86 | ] 87 | ), 88 | ), 89 | batch_size=256 // 8, 90 | num_workers=4, 91 | training=True, 92 | ) 93 | 94 | dataloader.test = L(build_data_loader)( 95 | dataset=L(torchvision.datasets.ImageNet)( 96 | root="${...train.dataset.root}", 97 | split="val", 98 | transform=L(T.Compose)( 99 | transforms=[ 100 | L(T.Resize)(size=256), 101 | L(T.CenterCrop)(size=224), 102 | T.ToTensor(), 103 | L(T.Normalize)(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), 104 | ] 105 | ), 106 | ), 107 | batch_size=256 // 8, 108 | num_workers=4, 109 | training=False, 110 | ) 111 | 112 | dataloader.evaluator = L(ClassificationAcc)() 113 | 114 | model = L(ClassificationNet)( 115 | model=(ResNet)(block=Bottleneck, layers=[3, 4, 6, 3], zero_init_residual=True) 116 | ) 117 | 118 | 119 | optimizer = L(torch.optim.SGD)( 120 | params=L(get_default_optimizer_params)(), 121 | lr=0.1, 122 | momentum=0.9, 123 
| weight_decay=1e-4, 124 | ) 125 | 126 | lr_multiplier = L(WarmupParamScheduler)( 127 | scheduler=L(MultiStepParamScheduler)( 128 | values=[1.0, 0.1, 0.01, 0.001], milestones=[30, 60, 90, 100] 129 | ), 130 | warmup_length=1 / 100, 131 | warmup_factor=0.1, 132 | ) 133 | 134 | 135 | train = get_config("common/train.py").train 136 | train.init_checkpoint = None 137 | train.max_iter = 100 * 1281167 // 256 138 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/README.md: -------------------------------------------------------------------------------- 1 | This directory provides definitions for a few common models, dataloaders, scheduler, 2 | and optimizers that are often used in training. 3 | The definition of these objects are provided in the form of lazy instantiation: 4 | their arguments can be edited by users before constructing the objects. 5 | 6 | They can be imported, or loaded by `model_zoo.get_config` API in users' own configs. 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/coco_schedule.py: -------------------------------------------------------------------------------- 1 | from fvcore.common.param_scheduler import MultiStepParamScheduler 2 | 3 | from detectron2.config import LazyCall as L 4 | from detectron2.solver import WarmupParamScheduler 5 | 6 | 7 | def default_X_scheduler(num_X): 8 | """ 9 | Returns the config for a default multi-step LR scheduler such as "1x", "3x", 10 | commonly referred to in papers, where every 1x has the total length of 1440k 11 | training images (~12 COCO epochs). LR is decayed twice at the end of training 12 | following the strategy defined in "Rethinking ImageNet Pretraining", Sec 4. 13 | 14 | Args: 15 | num_X: a positive real number 16 | 17 | Returns: 18 | DictConfig: configs that define the multiplier for LR during training 19 | """ 20 | # total number of iterations assuming 16 batch size, using 1440000/16=90000 21 | total_steps_16bs = num_X * 90000 22 | 23 | if num_X <= 2: 24 | scheduler = L(MultiStepParamScheduler)( 25 | values=[1.0, 0.1, 0.01], 26 | # note that scheduler is scale-invariant. 
This is equivalent to 27 | # milestones=[6, 8, 9] 28 | milestones=[60000, 80000, 90000], 29 | ) 30 | else: 31 | scheduler = L(MultiStepParamScheduler)( 32 | values=[1.0, 0.1, 0.01], 33 | milestones=[total_steps_16bs - 60000, total_steps_16bs - 20000, total_steps_16bs], 34 | ) 35 | return L(WarmupParamScheduler)( 36 | scheduler=scheduler, 37 | warmup_length=1000 / total_steps_16bs, 38 | warmup_method="linear", 39 | warmup_factor=0.001, 40 | ) 41 | 42 | 43 | lr_multiplier_1x = default_X_scheduler(1) 44 | lr_multiplier_2x = default_X_scheduler(2) 45 | lr_multiplier_3x = default_X_scheduler(3) 46 | lr_multiplier_6x = default_X_scheduler(6) 47 | lr_multiplier_9x = default_X_scheduler(9) 48 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/data/coco.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | 3 | import detectron2.data.transforms as T 4 | from detectron2.config import LazyCall as L 5 | from detectron2.data import ( 6 | DatasetMapper, 7 | build_detection_test_loader, 8 | build_detection_train_loader, 9 | get_detection_dataset_dicts, 10 | ) 11 | from detectron2.evaluation import COCOEvaluator 12 | 13 | dataloader = OmegaConf.create() 14 | 15 | dataloader.train = L(build_detection_train_loader)( 16 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"), 17 | mapper=L(DatasetMapper)( 18 | is_train=True, 19 | augmentations=[ 20 | L(T.ResizeShortestEdge)( 21 | short_edge_length=(640, 672, 704, 736, 768, 800), 22 | sample_style="choice", 23 | max_size=1333, 24 | ), 25 | L(T.RandomFlip)(horizontal=True), 26 | ], 27 | image_format="BGR", 28 | use_instance_mask=True, 29 | ), 30 | total_batch_size=16, 31 | num_workers=4, 32 | ) 33 | 34 | dataloader.test = L(build_detection_test_loader)( 35 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False), 36 | mapper=L(DatasetMapper)( 37 | is_train=False, 38 | augmentations=[ 39 | L(T.ResizeShortestEdge)(short_edge_length=800, max_size=1333), 40 | ], 41 | image_format="${...train.mapper.image_format}", 42 | ), 43 | num_workers=4, 44 | ) 45 | 46 | dataloader.evaluator = L(COCOEvaluator)( 47 | dataset_name="${..test.dataset.names}", 48 | ) 49 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/data/coco_keypoint.py: -------------------------------------------------------------------------------- 1 | from detectron2.data.detection_utils import create_keypoint_hflip_indices 2 | 3 | from .coco import dataloader 4 | 5 | dataloader.train.dataset.min_keypoints = 1 6 | dataloader.train.dataset.names = "keypoints_coco_2017_train" 7 | dataloader.test.dataset.names = "keypoints_coco_2017_val" 8 | 9 | dataloader.train.mapper.update( 10 | use_instance_mask=False, 11 | use_keypoint=True, 12 | keypoint_hflip_indices=create_keypoint_hflip_indices(dataloader.train.dataset.names), 13 | ) 14 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/data/coco_panoptic_separated.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.evaluation import ( 3 | COCOEvaluator, 4 | COCOPanopticEvaluator, 5 | DatasetEvaluators, 6 | SemSegEvaluator, 7 | ) 8 | 9 | from .coco import dataloader 10 | 11 | dataloader.train.dataset.names = "coco_2017_train_panoptic_separated" 12 | 
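# [Comment added for this document; not part of the original file.] The
# assignments around this point only override fields of the lazy dataloader
# imported from .coco above; nothing is instantiated until the config is
# built, which is the lazy-instantiation pattern described in common/README.md.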
dataloader.train.dataset.filter_empty = False 13 | dataloader.test.dataset.names = "coco_2017_val_panoptic_separated" 14 | 15 | 16 | dataloader.evaluator = [ 17 | L(COCOEvaluator)( 18 | dataset_name="${...test.dataset.names}", 19 | ), 20 | L(SemSegEvaluator)( 21 | dataset_name="${...test.dataset.names}", 22 | ), 23 | L(COCOPanopticEvaluator)( 24 | dataset_name="${...test.dataset.names}", 25 | ), 26 | ] 27 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/cascade_rcnn.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.box_regression import Box2BoxTransform 4 | from detectron2.modeling.matcher import Matcher 5 | from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads 6 | 7 | from .mask_rcnn_fpn import model 8 | 9 | # arguments that don't exist for Cascade R-CNN 10 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] 11 | 12 | model.roi_heads.update( 13 | _target_=CascadeROIHeads, 14 | box_heads=[ 15 | L(FastRCNNConvFCHead)( 16 | input_shape=ShapeSpec(channels=256, height=7, width=7), 17 | conv_dims=[], 18 | fc_dims=[1024, 1024], 19 | ) 20 | for k in range(3) 21 | ], 22 | box_predictors=[ 23 | L(FastRCNNOutputLayers)( 24 | input_shape=ShapeSpec(channels=1024), 25 | test_score_thresh=0.05, 26 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 27 | cls_agnostic_bbox_reg=True, 28 | num_classes="${...num_classes}", 29 | ) 30 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 31 | ], 32 | proposal_matchers=[ 33 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) 34 | for th in [0.5, 0.6, 0.7] 35 | ], 36 | ) 37 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/keypoint_rcnn_fpn.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.poolers import ROIPooler 4 | from detectron2.modeling.roi_heads import KRCNNConvDeconvUpsampleHead 5 | 6 | from .mask_rcnn_fpn import model 7 | 8 | [model.roi_heads.pop(x) for x in ["mask_in_features", "mask_pooler", "mask_head"]] 9 | 10 | model.roi_heads.update( 11 | num_classes=1, 12 | keypoint_in_features=["p2", "p3", "p4", "p5"], 13 | keypoint_pooler=L(ROIPooler)( 14 | output_size=14, 15 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), 16 | sampling_ratio=0, 17 | pooler_type="ROIAlignV2", 18 | ), 19 | keypoint_head=L(KRCNNConvDeconvUpsampleHead)( 20 | input_shape=ShapeSpec(channels=256, width=14, height=14), 21 | num_keypoints=17, 22 | conv_dims=[512] * 8, 23 | loss_normalizer="visible", 24 | ), 25 | ) 26 | 27 | # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2. 28 | # 1000 proposals per-image is found to hurt box AP. 29 | # Therefore we increase it to 1500 per-image. 
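# [Comment added for this document; not part of the original file.] The tuple
# below follows RPN's (train, test) convention for post_nms_topk: 1500
# proposals per image during training and 1000 during inference, compared
# with (1000, 1000) in the base FPN model.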
30 | model.proposal_generator.post_nms_topk = (1500, 1000) 31 | 32 | # Keypoint AP degrades (though box AP improves) when using plain L1 loss 33 | model.roi_heads.box_predictor.smooth_l1_beta = 0.5 34 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/mask_rcnn_c4.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.meta_arch import GeneralizedRCNN 4 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator 5 | from detectron2.modeling.backbone import BasicStem, BottleneckBlock, ResNet 6 | from detectron2.modeling.box_regression import Box2BoxTransform 7 | from detectron2.modeling.matcher import Matcher 8 | from detectron2.modeling.poolers import ROIPooler 9 | from detectron2.modeling.proposal_generator import RPN, StandardRPNHead 10 | from detectron2.modeling.roi_heads import ( 11 | FastRCNNOutputLayers, 12 | MaskRCNNConvUpsampleHead, 13 | Res5ROIHeads, 14 | ) 15 | 16 | model = L(GeneralizedRCNN)( 17 | backbone=L(ResNet)( 18 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), 19 | stages=L(ResNet.make_default_stages)( 20 | depth=50, 21 | stride_in_1x1=True, 22 | norm="FrozenBN", 23 | ), 24 | out_features=["res4"], 25 | ), 26 | proposal_generator=L(RPN)( 27 | in_features=["res4"], 28 | head=L(StandardRPNHead)(in_channels=1024, num_anchors=15), 29 | anchor_generator=L(DefaultAnchorGenerator)( 30 | sizes=[[32, 64, 128, 256, 512]], 31 | aspect_ratios=[0.5, 1.0, 2.0], 32 | strides=[16], 33 | offset=0.0, 34 | ), 35 | anchor_matcher=L(Matcher)( 36 | thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True 37 | ), 38 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]), 39 | batch_size_per_image=256, 40 | positive_fraction=0.5, 41 | pre_nms_topk=(12000, 6000), 42 | post_nms_topk=(2000, 1000), 43 | nms_thresh=0.7, 44 | ), 45 | roi_heads=L(Res5ROIHeads)( 46 | num_classes=80, 47 | batch_size_per_image=512, 48 | positive_fraction=0.25, 49 | proposal_matcher=L(Matcher)( 50 | thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False 51 | ), 52 | in_features=["res4"], 53 | pooler=L(ROIPooler)( 54 | output_size=14, 55 | scales=(1.0 / 16,), 56 | sampling_ratio=0, 57 | pooler_type="ROIAlignV2", 58 | ), 59 | res5=L(ResNet.make_stage)( 60 | block_class=BottleneckBlock, 61 | num_blocks=3, 62 | stride_per_block=[2, 1, 1], 63 | in_channels=1024, 64 | bottleneck_channels=512, 65 | out_channels=2048, 66 | norm="FrozenBN", 67 | stride_in_1x1=True, 68 | ), 69 | box_predictor=L(FastRCNNOutputLayers)( 70 | input_shape=L(ShapeSpec)(channels="${...res5.out_channels}", height=1, width=1), 71 | test_score_thresh=0.05, 72 | box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)), 73 | num_classes="${..num_classes}", 74 | ), 75 | mask_head=L(MaskRCNNConvUpsampleHead)( 76 | input_shape=L(ShapeSpec)( 77 | channels="${...res5.out_channels}", 78 | width="${...pooler.output_size}", 79 | height="${...pooler.output_size}", 80 | ), 81 | num_classes="${..num_classes}", 82 | conv_dims=[256], 83 | ), 84 | ), 85 | pixel_mean=[103.530, 116.280, 123.675], 86 | pixel_std=[1.0, 1.0, 1.0], 87 | input_format="BGR", 88 | ) 89 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/mask_rcnn_fpn.py: 
-------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.meta_arch import GeneralizedRCNN 4 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator 5 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 6 | from detectron2.modeling.backbone import BasicStem, FPN, ResNet 7 | from detectron2.modeling.box_regression import Box2BoxTransform 8 | from detectron2.modeling.matcher import Matcher 9 | from detectron2.modeling.poolers import ROIPooler 10 | from detectron2.modeling.proposal_generator import RPN, StandardRPNHead 11 | from detectron2.modeling.roi_heads import ( 12 | StandardROIHeads, 13 | FastRCNNOutputLayers, 14 | MaskRCNNConvUpsampleHead, 15 | FastRCNNConvFCHead, 16 | ) 17 | 18 | model = L(GeneralizedRCNN)( 19 | backbone=L(FPN)( 20 | bottom_up=L(ResNet)( 21 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), 22 | stages=L(ResNet.make_default_stages)( 23 | depth=50, 24 | stride_in_1x1=True, 25 | norm="FrozenBN", 26 | ), 27 | out_features=["res2", "res3", "res4", "res5"], 28 | ), 29 | in_features="${.bottom_up.out_features}", 30 | out_channels=256, 31 | top_block=L(LastLevelMaxPool)(), 32 | ), 33 | proposal_generator=L(RPN)( 34 | in_features=["p2", "p3", "p4", "p5", "p6"], 35 | head=L(StandardRPNHead)(in_channels=256, num_anchors=3), 36 | anchor_generator=L(DefaultAnchorGenerator)( 37 | sizes=[[32], [64], [128], [256], [512]], 38 | aspect_ratios=[0.5, 1.0, 2.0], 39 | strides=[4, 8, 16, 32, 64], 40 | offset=0.0, 41 | ), 42 | anchor_matcher=L(Matcher)( 43 | thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True 44 | ), 45 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]), 46 | batch_size_per_image=256, 47 | positive_fraction=0.5, 48 | pre_nms_topk=(2000, 1000), 49 | post_nms_topk=(1000, 1000), 50 | nms_thresh=0.7, 51 | ), 52 | roi_heads=L(StandardROIHeads)( 53 | num_classes=80, 54 | batch_size_per_image=512, 55 | positive_fraction=0.25, 56 | proposal_matcher=L(Matcher)( 57 | thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False 58 | ), 59 | box_in_features=["p2", "p3", "p4", "p5"], 60 | box_pooler=L(ROIPooler)( 61 | output_size=7, 62 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), 63 | sampling_ratio=0, 64 | pooler_type="ROIAlignV2", 65 | ), 66 | box_head=L(FastRCNNConvFCHead)( 67 | input_shape=ShapeSpec(channels=256, height=7, width=7), 68 | conv_dims=[], 69 | fc_dims=[1024, 1024], 70 | ), 71 | box_predictor=L(FastRCNNOutputLayers)( 72 | input_shape=ShapeSpec(channels=1024), 73 | test_score_thresh=0.05, 74 | box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)), 75 | num_classes="${..num_classes}", 76 | ), 77 | mask_in_features=["p2", "p3", "p4", "p5"], 78 | mask_pooler=L(ROIPooler)( 79 | output_size=14, 80 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), 81 | sampling_ratio=0, 82 | pooler_type="ROIAlignV2", 83 | ), 84 | mask_head=L(MaskRCNNConvUpsampleHead)( 85 | input_shape=ShapeSpec(channels=256, width=14, height=14), 86 | num_classes="${..num_classes}", 87 | conv_dims=[256, 256, 256, 256, 256], 88 | ), 89 | ), 90 | pixel_mean=[103.530, 116.280, 123.675], 91 | pixel_std=[1.0, 1.0, 1.0], 92 | input_format="BGR", 93 | ) 94 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/panoptic_fpn.py: -------------------------------------------------------------------------------- 
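# Derives PanopticFPN from the Mask R-CNN FPN lazy config: the meta-architecture is
# swapped to PanopticFPN and a SemSegFPNHead is attached to the p2-p5 FPN features.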
1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling import PanopticFPN 4 | from detectron2.modeling.meta_arch.semantic_seg import SemSegFPNHead 5 | 6 | from .mask_rcnn_fpn import model 7 | 8 | model._target_ = PanopticFPN 9 | model.sem_seg_head = L(SemSegFPNHead)( 10 | input_shape={ 11 | f: L(ShapeSpec)(stride=s, channels="${....backbone.out_channels}") 12 | for f, s in zip(["p2", "p3", "p4", "p5"], [4, 8, 16, 32]) 13 | }, 14 | ignore_value=255, 15 | num_classes=54, # COCO stuff + 1 16 | conv_dims=128, 17 | common_stride=4, 18 | loss_weight=0.5, 19 | norm="GN", 20 | ) 21 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/retinanet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from detectron2.config import LazyCall as L 4 | from detectron2.layers import ShapeSpec 5 | from detectron2.modeling.meta_arch import RetinaNet 6 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator 7 | from detectron2.modeling.backbone.fpn import LastLevelP6P7 8 | from detectron2.modeling.backbone import BasicStem, FPN, ResNet 9 | from detectron2.modeling.box_regression import Box2BoxTransform 10 | from detectron2.modeling.matcher import Matcher 11 | from detectron2.modeling.meta_arch.retinanet import RetinaNetHead 12 | 13 | model = L(RetinaNet)( 14 | backbone=L(FPN)( 15 | bottom_up=L(ResNet)( 16 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), 17 | stages=L(ResNet.make_default_stages)( 18 | depth=50, 19 | stride_in_1x1=True, 20 | norm="FrozenBN", 21 | ), 22 | out_features=["res3", "res4", "res5"], 23 | ), 24 | in_features=["res3", "res4", "res5"], 25 | out_channels=256, 26 | top_block=L(LastLevelP6P7)(in_channels=2048, out_channels="${..out_channels}"), 27 | ), 28 | head=L(RetinaNetHead)( 29 | input_shape=[ShapeSpec(channels=256)], 30 | num_classes="${..num_classes}", 31 | conv_dims=[256, 256, 256, 256], 32 | prior_prob=0.01, 33 | num_anchors=9, 34 | ), 35 | anchor_generator=L(DefaultAnchorGenerator)( 36 | sizes=[[x, x * 2 ** (1.0 / 3), x * 2 ** (2.0 / 3)] for x in [32, 64, 128, 256, 512]], 37 | aspect_ratios=[0.5, 1.0, 2.0], 38 | strides=[8, 16, 32, 64, 128], 39 | offset=0.0, 40 | ), 41 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]), 42 | anchor_matcher=L(Matcher)( 43 | thresholds=[0.4, 0.5], labels=[0, -1, 1], allow_low_quality_matches=True 44 | ), 45 | num_classes=80, 46 | head_in_features=["p3", "p4", "p5", "p6", "p7"], 47 | focal_loss_alpha=0.25, 48 | focal_loss_gamma=2.0, 49 | pixel_mean=[103.530, 116.280, 123.675], 50 | pixel_std=[1.0, 1.0, 1.0], 51 | input_format="BGR", 52 | ) 53 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/optim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from detectron2.config import LazyCall as L 4 | from detectron2.solver.build import get_default_optimizer_params 5 | 6 | SGD = L(torch.optim.SGD)( 7 | params=L(get_default_optimizer_params)( 8 | # params.model is meant to be set to the model object, before instantiating 9 | # the optimizer. 
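        # weight_decay_norm=0.0 disables weight decay for normalization-layer parameters.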
10 | weight_decay_norm=0.0 11 | ), 12 | lr=0.02, 13 | momentum=0.9, 14 | weight_decay=1e-4, 15 | ) 16 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/train.py: -------------------------------------------------------------------------------- 1 | # Common training-related configs that are designed for "tools/lazyconfig_train_net.py" 2 | # You can use your own instead, together with your own train_net.py 3 | train = dict( 4 | output_dir="./output", 5 | init_checkpoint="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 6 | bb_rpn_checkpoint="", 7 | max_iter=90000, 8 | amp=dict(enabled=False), # options for Automatic Mixed Precision 9 | ddp=dict( # options for DistributedDataParallel 10 | broadcast_buffers=False, 11 | find_unused_parameters=False, 12 | fp16_compression=False, 13 | ), 14 | checkpointer=dict(period=5000, max_to_keep=100), # options for PeriodicCheckpointer 15 | eval_period=5000, 16 | log_period=20, 17 | device="cuda" 18 | # ... 19 | ) 20 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "PretrainFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" 8 | MASK_ON: False 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | ROI_HEADS: 16 | NAME: "PretrainRes5ROIHeads" 17 | IN_FEATURES: ["res4"] 18 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 19 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 20 | CLIP: 21 | CLSS_TEMP: 0.01 22 | CROP_REGION_TYPE: "RPN" 23 | OFFLINE_RPN_NMS_THRESH: 0.5 24 | GATHER_GPUS: True 25 | CONCEPT_THRES: 0.1 26 | PRETRAIN_RPN_REGIONS: 300 27 | PRETRAIN_SAMPLE_REGIONS: 100 28 | PRETRAIN_IMG_TXT_LEVEL: True 29 | PRETRAIN_ONLY_EOT: True 30 | TEACHER_RESNETS_DEPTH: 50 31 | TEACHER_POOLER_RESOLUTION: 14 32 | DATASETS: 33 | TRAIN: ("imgtxtpairs",) 34 | FACTORY_TRAIN: ("CLIPImgTxtPairTSVDataset",) 35 | PATH_TRAIN: ("./datasets/coco/val2017",) # ("/tmp/datasets/CC3M",) 36 | TEST: () 37 | DATALOADER: 38 | ASPECT_RATIO_GROUPING: False 39 | NUM_WORKERS: 4 40 | TEST: 41 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 42 | EVAL_PERIOD: 2500000 43 | SOLVER: 44 | IMS_PER_BATCH: 96 # 32 gpus 45 | BASE_LR: 0.002 46 | WEIGHT_DECAY: 0.0001 47 | STEPS: (300000, 525000) 48 | MAX_ITER: 600000 49 | CLIP_GRADIENTS: 50 | ENABLED: True 51 | CLIP_TYPE: "norm" 52 | CLIP_VALUE: 5.0 53 | INPUT: 54 | MIN_SIZE_TRAIN_SAMPLING: choice 55 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 56 | MAX_SIZE_TRAIN: 1333 57 | MIN_SIZE_TEST: 800 58 | MAX_SIZE_TEST: 1333 59 | FORMAT: "RGB" 60 | AUG: # Data Augmentation from MSR-CLIP 61 | TRAIN: 62 | IMAGE_SIZE: [800,] 63 | MAX_SIZE: 1333 64 | TEST: 65 | IMAGE_SIZE: [800,] 66 | MAX_SIZE: 1333 67 | INTERPOLATION: 3 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./RegionCLIP_RN50.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "WeakPretrainFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone_from_pretrain" 6 | FREEZE_AT: 2 7 | CLIP: 8 | CROP_REGION_TYPE: "RPN" 9 | OFFLINE_RPN_NMS_THRESH: 0.3 # will affect the eval performance 10 | 
# GATHER_GPUS: True 11 | PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST 12 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection 13 | # for ZS inference 14 | NO_BOX_DELTA: True # no box refinement 15 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth" 16 | USE_TEXT_EMB_CLASSIFIER: True 17 | MULTIPLY_RPN_SCORE: True 18 | WEAK_LOSS: 19 | WEAK_LOSS_WEIGHT: 0.01 20 | IMAGE_LOSS_WEIGHT: 0.1 21 | BOX_SELECT_THRES: 0.97 # threshold in box selection 22 | # for ZS inference 23 | ROI_HEADS: 24 | NAME: "CLIPRes5ROIHeads" # pretrain roi head 25 | IN_FEATURES: ["res4"] 26 | NUM_CLASSES: 1 # box only roi head, used in pretraining to setup self.cls_score 27 | # for ZS inference 28 | NMS_THRESH_TEST: 0.5 29 | # for ZS inference 30 | ROI_BOX_HEAD: 31 | NAME: "" 32 | NUM_FC: 0 33 | # POOLER_RESOLUTION: 14 34 | CLS_AGNOSTIC_BBOX_REG: True 35 | DATASETS: 36 | # TRAIN: ("coco_zeroshot_train_del", "coco_caption_nouns_train_4764tags",) 37 | TRAIN: ("coco_caption_nouns_train_4764tags",) 38 | TEST: ("coco_generalized_del_val",) 39 | INPUT: 40 | CUSTOM_AUG: ResizeShortestEdge 41 | MIN_SIZE_TRAIN_SAMPLING: range 42 | MIN_SIZE_TRAIN: (400, 400) 43 | MAX_SIZE_TRAIN: 667 44 | DATALOADER: 45 | # SAMPLER_TRAIN: "MultiDatasetSampler" 46 | # DATASET_RATIO: [1, 4] 47 | # USE_DIFF_BS_SIZE: True # if use build_custom_augmentation 48 | # DATASET_BS: [2, 8] 49 | # USE_RFS: [False, False] 50 | # DATASET_MIN_SIZES: [[800, 800], [400, 400]] 51 | # DATASET_MAX_SIZES: [1333, 667] 52 | # DATASET_MIN_SIZES: [[800, 800], [400, 400]] 53 | # DATASET_MAX_SIZES: [1333, 667] 54 | FILTER_EMPTY_ANNOTATIONS: False 55 | DATASET_ANN: ['caption',] 56 | # MULTI_DATASET_GROUPING: True 57 | # DATASET_ANN: ['box', 'caption'] 58 | # NUM_WORKERS: 8 59 | TEST: 60 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300 61 | EVAL_PERIOD: 10000 62 | SOLVER: 63 | IMS_PER_BATCH: 96 # 32 gpus 64 | BASE_LR: 0.002 65 | WEIGHT_DECAY: 0.0001 66 | STEPS: (60000, 80000) 67 | MAX_ITER: 90000 68 | CHECKPOINT_PERIOD: 20000 69 | CLIP_GRADIENTS: 70 | ENABLED: True 71 | CLIP_TYPE: "norm" 72 | CLIP_VALUE: 5.0 73 | FIND_UNUSED_PARAM: True 74 | WITH_IMAGE_LABELS: True # load image tags 75 | OUTPUT_DIR: output/r50_onlinePL_pre -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL_box_weak.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./RegionCLIP_RN50.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "WeakPretrainFastRCNN" 4 | IGNORE_CLS_LOSS: True # disable weak loss 5 | BACKBONE: 6 | NAME: "build_clip_resnet_backbone_from_pretrain" 7 | FREEZE_AT: 2 8 | ROI_HEADS: 9 | NAME: "CLIPRes5ROIHeads" # pretrain roi head 10 | IN_FEATURES: ["res4"] 11 | NUM_CLASSES: 1 # box only roi head, used in pretraining to setup self.cls_score 12 | # for ZS inference 13 | NMS_THRESH_TEST: 0.5 14 | # for ZS inference 15 | ROI_BOX_HEAD: 16 | NAME: "" 17 | NUM_FC: 0 18 | # POOLER_RESOLUTION: 14 19 | CLS_AGNOSTIC_BBOX_REG: True 20 | CLIP: 21 | CROP_REGION_TYPE: "RPN" 22 | OFFLINE_RPN_NMS_THRESH: 0.7 # will affect the eval performance 23 | # GATHER_GPUS: True 24 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST 25 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection 26 | # for ZS inference 27 | NO_BOX_DELTA: False # pretrain roi head 28 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth" 29 | 
USE_TEXT_EMB_CLASSIFIER: True 30 | MULTIPLY_RPN_SCORE: True 31 | WEAK_LOSS: 32 | WEAK_LOSS_WEIGHT: 0.01 33 | IMAGE_LOSS_WEIGHT: 0.1 34 | BOX_SELECT_THRES: 0.97 # threshold in box selection 35 | DATASETS: 36 | TRAIN: ("lvis_v1_train_base_box_only", "coco_caption_nouns_train_4764tags",) 37 | TEST: ("coco_generalized_del_val",) 38 | INPUT: 39 | CUSTOM_AUG: ResizeShortestEdge 40 | MIN_SIZE_TRAIN_SAMPLING: range 41 | MIN_SIZE_TRAIN: (800, 800) 42 | DATALOADER: 43 | SAMPLER_TRAIN: "MultiDatasetSampler" 44 | DATASET_RATIO: [1, 8] 45 | USE_DIFF_BS_SIZE: True 46 | DATASET_BS: [2, 16] 47 | USE_RFS: [False, False] 48 | DATASET_MIN_SIZES: [[800, 800], [400, 400]] 49 | DATASET_MAX_SIZES: [1333, 667] 50 | FILTER_EMPTY_ANNOTATIONS: False 51 | MULTI_DATASET_GROUPING: True 52 | DATASET_ANN: ['box', 'caption'] 53 | NUM_WORKERS: 8 54 | TEST: 55 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300 56 | EVAL_PERIOD: 10000 57 | SOLVER: 58 | IMS_PER_BATCH: 96 # 32 gpus 59 | BASE_LR: 0.002 60 | WEIGHT_DECAY: 0.0001 61 | STEPS: (60000, 80000) 62 | MAX_ITER: 90000 63 | CHECKPOINT_PERIOD: 20000 64 | CLIP_GRADIENTS: 65 | ENABLED: True 66 | CLIP_TYPE: "norm" 67 | CLIP_VALUE: 5.0 68 | FIND_UNUSED_PARAM: True 69 | WITH_IMAGE_LABELS: True # load image tags 70 | OUTPUT_DIR: output/r50_pre_onlinePL_box_weak -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL_box_weak_cc3m.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./RegionCLIP_RN50.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "WeakPretrainFastRCNN" 4 | IGNORE_CLS_LOSS: True # disable weak loss 5 | BACKBONE: 6 | NAME: "build_clip_resnet_backbone_from_pretrain" 7 | FREEZE_AT: 2 8 | ROI_HEADS: 9 | NAME: "CLIPRes5ROIHeads" # pretrain roi head 10 | IN_FEATURES: ["res4"] 11 | NUM_CLASSES: 1 # box only roi head, used in pretraining to setup self.cls_score 12 | # for ZS inference 13 | NMS_THRESH_TEST: 0.5 14 | # for ZS inference 15 | ROI_BOX_HEAD: 16 | NAME: "" 17 | NUM_FC: 0 18 | # POOLER_RESOLUTION: 14 19 | CLS_AGNOSTIC_BBOX_REG: True 20 | CLIP: 21 | CROP_REGION_TYPE: "RPN" 22 | OFFLINE_RPN_NMS_THRESH: 0.7 # will affect the eval performance 23 | # GATHER_GPUS: True 24 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST 25 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection 26 | # for ZS inference 27 | NO_BOX_DELTA: False # pretrain roi head 28 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth" 29 | USE_TEXT_EMB_CLASSIFIER: True 30 | MULTIPLY_RPN_SCORE: True 31 | WEAK_LOSS: 32 | WEAK_LOSS_WEIGHT: 0.01 33 | IMAGE_LOSS_WEIGHT: 0.1 34 | BOX_SELECT_THRES: 0.97 # threshold in box selection 35 | DATASETS: 36 | TRAIN: ("lvis_v1_train_base_box_only", "cc3m_v1_nouns_train_4764tags",) 37 | TEST: ("coco_generalized_del_val",) 38 | INPUT: 39 | CUSTOM_AUG: ResizeShortestEdge 40 | MIN_SIZE_TRAIN_SAMPLING: range 41 | MIN_SIZE_TRAIN: (800, 800) 42 | DATALOADER: 43 | SAMPLER_TRAIN: "MultiDatasetSampler" 44 | DATASET_RATIO: [1, 8] 45 | USE_DIFF_BS_SIZE: True 46 | DATASET_BS: [2, 16] 47 | USE_RFS: [False, False] 48 | DATASET_MIN_SIZES: [[800, 800], [400, 400]] 49 | DATASET_MAX_SIZES: [1333, 667] 50 | FILTER_EMPTY_ANNOTATIONS: False 51 | MULTI_DATASET_GROUPING: True 52 | DATASET_ANN: ['box', 'caption'] 53 | NUM_WORKERS: 8 54 | TEST: 55 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300 56 | EVAL_PERIOD: 10000 57 | SOLVER: 58 | IMS_PER_BATCH: 96 # 32 gpus 59 | BASE_LR: 0.002 
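  # BASE_LR above is paired with IMS_PER_BATCH of 96 (32 GPUs); if the batch size
  # changes, scaling the learning rate linearly is a common heuristic to start from.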
60 | WEIGHT_DECAY: 0.0001 61 | STEPS: (240000, 320000) 62 | MAX_ITER: 360000 63 | CHECKPOINT_PERIOD: 40000 64 | CLIP_GRADIENTS: 65 | ENABLED: True 66 | CLIP_TYPE: "norm" 67 | CLIP_VALUE: 5.0 68 | FIND_UNUSED_PARAM: True 69 | WITH_IMAGE_LABELS: True # load image tags 70 | OUTPUT_DIR: output/r50_pre_onlinePL_box_emaWeak_cc3m -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL_box_weak_locNarr.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./RegionCLIP_RN50.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "WeakPretrainFastRCNN" 4 | IGNORE_CLS_LOSS: True # disable weak loss 5 | BACKBONE: 6 | NAME: "build_clip_resnet_backbone_from_pretrain" 7 | FREEZE_AT: 2 8 | ROI_HEADS: 9 | NAME: "CLIPRes5ROIHeads" # pretrain roi head 10 | IN_FEATURES: ["res4"] 11 | NUM_CLASSES: 1 # box only roi head, used in pretraining to setup self.cls_score 12 | # for ZS inference 13 | NMS_THRESH_TEST: 0.5 14 | # for ZS inference 15 | ROI_BOX_HEAD: 16 | NAME: "" 17 | NUM_FC: 0 18 | # POOLER_RESOLUTION: 14 19 | CLS_AGNOSTIC_BBOX_REG: True 20 | CLIP: 21 | CROP_REGION_TYPE: "RPN" 22 | OFFLINE_RPN_NMS_THRESH: 0.7 # will affect the eval performance 23 | # GATHER_GPUS: True 24 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST 25 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection 26 | # for ZS inference 27 | NO_BOX_DELTA: False # pretrain roi head 28 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth" 29 | USE_TEXT_EMB_CLASSIFIER: True 30 | MULTIPLY_RPN_SCORE: True 31 | WEAK_LOSS: 32 | WEAK_LOSS_WEIGHT: 0.01 33 | IMAGE_LOSS_WEIGHT: 0.1 34 | BOX_SELECT_THRES: 0.97 # threshold in box selection 35 | DATASETS: 36 | TRAIN: ("lvis_v1_train_base_box_only", "loc_narr_nouns_train_4764tags",) 37 | TEST: ("coco_generalized_del_val",) 38 | INPUT: 39 | CUSTOM_AUG: ResizeShortestEdge 40 | MIN_SIZE_TRAIN_SAMPLING: range 41 | MIN_SIZE_TRAIN: (800, 800) 42 | DATALOADER: 43 | SAMPLER_TRAIN: "MultiDatasetSampler" 44 | DATASET_RATIO: [1, 8] 45 | USE_DIFF_BS_SIZE: True 46 | DATASET_BS: [2, 16] 47 | USE_RFS: [False, False] 48 | DATASET_MIN_SIZES: [[800, 800], [400, 400]] 49 | DATASET_MAX_SIZES: [1333, 667] 50 | FILTER_EMPTY_ANNOTATIONS: False 51 | MULTI_DATASET_GROUPING: True 52 | DATASET_ANN: ['box', 'caption'] 53 | NUM_WORKERS: 8 54 | TEST: 55 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300 56 | EVAL_PERIOD: 10000 57 | SOLVER: 58 | IMS_PER_BATCH: 96 # 32 gpus 59 | BASE_LR: 0.002 60 | WEIGHT_DECAY: 0.0001 61 | STEPS: (240000, 320000) 62 | MAX_ITER: 360000 63 | CHECKPOINT_PERIOD: 40000 64 | CLIP_GRADIENTS: 65 | ENABLED: True 66 | CLIP_TYPE: "norm" 67 | CLIP_VALUE: 5.0 68 | FIND_UNUSED_PARAM: True 69 | WITH_IMAGE_LABELS: True # load image tags 70 | OUTPUT_DIR: output/r50_pre_onlinePL_box_emaWeak_cc3m -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50x4.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "PretrainFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" 8 | MASK_ON: False 9 | RESNETS: 10 | DEPTH: 200 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | ROI_HEADS: 16 | NAME: "PretrainRes5ROIHeads" 17 | IN_FEATURES: ["res4"] 18 | 
ROI_BOX_HEAD: 19 | POOLER_RESOLUTION: 18 20 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 21 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 22 | CLIP: 23 | CLSS_TEMP: 0.01 24 | CROP_REGION_TYPE: "RPN" 25 | OFFLINE_RPN_NMS_THRESH: 0.5 26 | GATHER_GPUS: True 27 | CONCEPT_THRES: 0.1 28 | PRETRAIN_RPN_REGIONS: 300 29 | PRETRAIN_SAMPLE_REGIONS: 100 30 | PRETRAIN_IMG_TXT_LEVEL: True 31 | PRETRAIN_ONLY_EOT: True 32 | TEACHER_RESNETS_DEPTH: 200 33 | TEACHER_POOLER_RESOLUTION: 18 34 | # INPUT: 35 | # MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 36 | DATASETS: 37 | TRAIN: ("imgtxtpairs",) 38 | FACTORY_TRAIN: ("CLIPImgTxtPairTSVDataset",) 39 | PATH_TRAIN: ("/home/v-yiwuzhong/projects/azureblobs/vlpdatasets/coco-caption/val2017",) # ("/tmp/datasets/CC3M",) 40 | TEST: () 41 | DATALOADER: 42 | ASPECT_RATIO_GROUPING: False 43 | NUM_WORKERS: 4 44 | TEST: 45 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 46 | EVAL_PERIOD: 2500000 47 | SOLVER: 48 | IMS_PER_BATCH: 96 # 32 gpus 49 | BASE_LR: 0.002 50 | WEIGHT_DECAY: 0.0001 51 | STEPS: (300000, 525000) 52 | MAX_ITER: 600000 53 | CLIP_GRADIENTS: 54 | ENABLED: True 55 | CLIP_TYPE: "norm" 56 | CLIP_VALUE: 5.0 57 | INPUT: 58 | MIN_SIZE_TRAIN_SAMPLING: choice 59 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 60 | MAX_SIZE_TRAIN: 1333 61 | MIN_SIZE_TEST: 800 62 | MAX_SIZE_TEST: 1333 63 | FORMAT: "RGB" 64 | AUG: # Data Augmentation from MSR-CLIP 65 | TRAIN: 66 | IMAGE_SIZE: [800,] 67 | MAX_SIZE: 1333 68 | TEST: 69 | IMAGE_SIZE: [800,] 70 | MAX_SIZE: 1333 71 | INTERPOLATION: 3 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50x4_onlinePL_boxWeak.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./RegionCLIP_RN50_onlinePL_box_weak.yaml" 2 | MODEL: 3 | RESNETS: 4 | DEPTH: 200 5 | ROI_BOX_HEAD: 6 | POOLER_RESOLUTION: 18 7 | CLIP: 8 | TEACHER_RESNETS_DEPTH: 200 9 | TEACHER_POOLER_RESOLUTION: 18 10 | TEXT_EMB_DIM: 640 11 | # TEXT_EMB_PATH: None # for classifer, not used in pretraining if MODEL.IGNORE_CLS_LOSS True 12 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb_rn50x4.pth" # use emb from r50x4 13 | OUTPUT_DIR: output/r50x4_pre_onlinePL_boxWeak 14 | 15 | -------------------------------------------------------------------------------- /sas_det/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . 
import ovd_register as _ovd_register # ensure the builtin datasets are registered 3 | 4 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 5 | -------------------------------------------------------------------------------- /sas_det/data/coco_zeroshot_categories.py: -------------------------------------------------------------------------------- 1 | # COCO categories for zero-shot setting 2 | # 65 categories in total, 48 base categories for training, 17 unseen categories are only used in testing 3 | # from http://ankan.umiacs.io/files/mscoco_seen_classes.json, http://ankan.umiacs.io/files/mscoco_unseen_classes.json 4 | 5 | # 17 class names in order, obtained from load_coco_json() function 6 | COCO_UNSEEN_CLS = ['airplane', 'bus', 'cat', 'dog', 'cow', 'elephant', 'umbrella', \ 7 | 'tie', 'snowboard', 'skateboard', 'cup', 'knife', 'cake', 'couch', 'keyboard', \ 8 | 'sink', 'scissors'] 9 | 10 | # 48 class names in order, obtained from load_coco_json() function 11 | COCO_SEEN_CLS = ['person', 'bicycle', 'car', 'motorcycle', 'train', 'truck', \ 12 | 'boat', 'bench', 'bird', 'horse', 'sheep', 'bear', 'zebra', 'giraffe', \ 13 | 'backpack', 'handbag', 'suitcase', 'frisbee', 'skis', 'kite', 'surfboard', \ 14 | 'bottle', 'fork', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', \ 15 | 'broccoli', 'carrot', 'pizza', 'donut', 'chair', 'bed', 'toilet', 'tv', \ 16 | 'laptop', 'mouse', 'remote', 'microwave', 'oven', 'toaster', \ 17 | 'refrigerator', 'book', 'clock', 'vase', 'toothbrush'] 18 | 19 | # 65 class names in order, obtained from load_coco_json() function 20 | COCO_OVD_ALL_CLS = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', \ 21 | 'bus', 'train', 'truck', 'boat', 'bench', 'bird', 'cat', 'dog', 'horse', \ 22 | 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', \ 23 | 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'kite', 'skateboard', \ 24 | 'surfboard', 'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', \ 25 | 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza', 'donut', 'cake', \ 26 | 'chair', 'couch', 'bed', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', \ 27 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', \ 28 | 'scissors', 'toothbrush'] 29 | 30 | # 80 class names 31 | COCO_80_ALL_CLS = {1: 'person', 32 | 2: 'bicycle', 33 | 3: 'car', 34 | 4: 'motorcycle', 35 | 5: 'airplane', 36 | 6: 'bus', 37 | 7: 'train', 38 | 8: 'truck', 39 | 9: 'boat', 40 | 10: 'traffic light', 41 | 11: 'fire hydrant', 42 | 12: 'stop sign', 43 | 13: 'parking meter', 44 | 14: 'bench', 45 | 15: 'bird', 46 | 16: 'cat', 47 | 17: 'dog', 48 | 18: 'horse', 49 | 19: 'sheep', 50 | 20: 'cow', 51 | 21: 'elephant', 52 | 22: 'bear', 53 | 23: 'zebra', 54 | 24: 'giraffe', 55 | 25: 'backpack', 56 | 26: 'umbrella', 57 | 27: 'handbag', 58 | 28: 'tie', 59 | 29: 'suitcase', 60 | 30: 'frisbee', 61 | 31: 'skis', 62 | 32: 'snowboard', 63 | 33: 'sports ball', 64 | 34: 'kite', 65 | 35: 'baseball bat', 66 | 36: 'baseball glove', 67 | 37: 'skateboard', 68 | 38: 'surfboard', 69 | 39: 'tennis racket', 70 | 40: 'bottle', 71 | 41: 'wine glass', 72 | 42: 'cup', 73 | 43: 'fork', 74 | 44: 'knife', 75 | 45: 'spoon', 76 | 46: 'bowl', 77 | 47: 'banana', 78 | 48: 'apple', 79 | 49: 'sandwich', 80 | 50: 'orange', 81 | 51: 'broccoli', 82 | 52: 'carrot', 83 | 53: 'hot dog', 84 | 54: 'pizza', 85 | 55: 'donut', 86 | 56: 'cake', 87 | 57: 'chair', 88 | 58: 'couch', 89 | 59: 'potted plant', 90 | 60: 'bed', 91 | 61: 'dining table', 
92 | 62: 'toilet', 93 | 63: 'tv', 94 | 64: 'laptop', 95 | 65: 'mouse', 96 | 66: 'remote', 97 | 67: 'keyboard', 98 | 68: 'cell phone', 99 | 69: 'microwave', 100 | 70: 'oven', 101 | 71: 'toaster', 102 | 72: 'sink', 103 | 73: 'refrigerator', 104 | 74: 'book', 105 | 75: 'clock', 106 | 76: 'vase', 107 | 77: 'scissors', 108 | 78: 'teddy bear', 109 | 79: 'hair drier', 110 | 80: 'toothbrush'} 111 | 112 | if __name__ == "__main__": 113 | # from https://github.com/alirezazareian/ovr-cnn/blob/master/ipynb/001.ipynb 114 | # Create zero-shot setting data split in COCO 115 | import json 116 | import ipdb 117 | 118 | with open('./datasets/coco/annotations/instances_train2017.json', 'r') as fin: 119 | coco_train_anno_all = json.load(fin) 120 | 121 | with open('./datasets/coco/annotations/instances_train2017.json', 'r') as fin: 122 | coco_train_anno_seen = json.load(fin) 123 | 124 | with open('./datasets/coco/annotations/instances_train2017.json', 'r') as fin: 125 | coco_train_anno_unseen = json.load(fin) 126 | 127 | with open('./datasets/coco/annotations/instances_val2017.json', 'r') as fin: 128 | coco_val_anno_all = json.load(fin) 129 | 130 | with open('./datasets/coco/annotations/instances_val2017.json', 'r') as fin: 131 | coco_val_anno_seen = json.load(fin) 132 | 133 | with open('./datasets/coco/annotations/instances_val2017.json', 'r') as fin: 134 | coco_val_anno_unseen = json.load(fin) 135 | 136 | labels_seen = COCO_SEEN_CLS 137 | labels_unseen = COCO_UNSEEN_CLS 138 | labels_all = [item['name'] for item in coco_val_anno_all['categories']] # 80 class names 139 | # len(labels_seen), len(labels_unseen) 140 | # set(labels_seen) - set(labels_all) 141 | # set(labels_unseen) - set(labels_all) 142 | 143 | class_id_to_split = {} # {1: 'seen', 2: 'seen', 3: 'seen', 4: 'seen', 5: 'unseen',...} 144 | class_name_to_split = {} # {'person': 'seen', 'bicycle': 'seen', 'car': 'seen', 'motorcycle': 'seen', 'airplane': 'unseen',...} 145 | for item in coco_val_anno_all['categories']: 146 | if item['name'] in labels_seen: 147 | class_id_to_split[item['id']] = 'seen' 148 | class_name_to_split[item['name']] = 'seen' 149 | elif item['name'] in labels_unseen: 150 | class_id_to_split[item['id']] = 'unseen' 151 | class_name_to_split[item['name']] = 'unseen' 152 | 153 | # class_name_to_emb = {} 154 | # with open('../datasets/coco/zero-shot/glove.6B.300d.txt', 'r') as fin: 155 | # for row in fin: 156 | # row_tk = row.split() 157 | # if row_tk[0] in class_name_to_split: 158 | # class_name_to_emb[row_tk[0]] = [float(num) for num in row_tk[1:]] 159 | # len(class_name_to_emb), len(class_name_to_split) 160 | 161 | def filter_annotation(anno_dict, split_name_list): 162 | """ 163 | COCO annotations have fields: dict_keys(['info', 'licenses', 'images', 'annotations', 'categories']) 164 | This function (1) filters the category metadata (list) in 'categories'; 165 | (2) filter instance annotation in 'annotations'; (3) filter image metadata (list) in 'images 166 | """ 167 | filtered_categories = [] 168 | for item in anno_dict['categories']: 169 | if class_id_to_split.get(item['id']) in split_name_list: 170 | #item['embedding'] = class_name_to_emb[item['name']] 171 | item['split'] = class_id_to_split.get(item['id']) 172 | filtered_categories.append(item) 173 | anno_dict['categories'] = filtered_categories 174 | 175 | filtered_images = [] 176 | filtered_annotations = [] 177 | useful_image_ids = set() 178 | for item in anno_dict['annotations']: 179 | if class_id_to_split.get(item['category_id']) in split_name_list: 180 | 
filtered_annotations.append(item) 181 | useful_image_ids.add(item['image_id']) 182 | for item in anno_dict['images']: 183 | if item['id'] in useful_image_ids: 184 | filtered_images.append(item) 185 | anno_dict['annotations'] = filtered_annotations 186 | anno_dict['images'] = filtered_images 187 | 188 | filter_annotation(coco_train_anno_seen, ['seen']) 189 | filter_annotation(coco_train_anno_unseen, ['unseen']) 190 | filter_annotation(coco_train_anno_all, ['seen', 'unseen']) 191 | filter_annotation(coco_val_anno_seen, ['seen']) 192 | filter_annotation(coco_val_anno_unseen, ['unseen']) 193 | filter_annotation(coco_val_anno_all, ['seen', 'unseen']) 194 | 195 | with open('./datasets/coco/annotations/ovd_ins_train2017_b.json', 'w') as fout: 196 | json.dump(coco_train_anno_seen, fout) 197 | with open('./datasets/coco/annotations/ovd_ins_train2017_t.json', 'w') as fout: 198 | json.dump(coco_train_anno_unseen, fout) 199 | with open('./datasets/coco/annotations/ovd_ins_train2017_all.json', 'w') as fout: 200 | json.dump(coco_train_anno_all, fout) 201 | with open('./datasets/coco/annotations/ovd_ins_val2017_b.json', 'w') as fout: 202 | json.dump(coco_val_anno_seen, fout) 203 | with open('./datasets/coco/annotations/ovd_ins_val2017_t.json', 'w') as fout: 204 | json.dump(coco_val_anno_unseen, fout) 205 | with open('./datasets/coco/annotations/ovd_ins_val2017_all.json', 'w') as fout: 206 | json.dump(coco_val_anno_all, fout) -------------------------------------------------------------------------------- /sas_det/data/ovd_register.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | 5 | """ 6 | This file registers pre-defined datasets at hard-coded paths, and their metadata. 7 | 8 | We hard-code metadata for common datasets. This will enable: 9 | 1. Consistency check when loading the datasets 10 | 2. Use models on these standard datasets directly and run demos, 11 | without having to download the dataset annotations 12 | 13 | We hard-code some paths to the dataset that's assumed to 14 | exist in "./datasets/". 15 | 16 | Users SHOULD NOT use this file to create new dataset / metadata for new dataset. 17 | To add new dataset, refer to the tutorial "docs/DATASETS.md". 
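Importing this module (as done in sas_det/data/__init__.py) runs the register_all_* calls
at the bottom of this file, so the datasets defined here become available in the DatasetCatalog.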
18 | """ 19 | 20 | import os 21 | 22 | from detectron2.data import DatasetCatalog, MetadataCatalog 23 | 24 | from detectron2.data.datasets.builtin_meta import ADE20K_SEM_SEG_CATEGORIES, _get_builtin_metadata 25 | # from .cityscapes import load_cityscapes_instances, load_cityscapes_semantic 26 | # from .cityscapes_panoptic import register_all_cityscapes_panoptic 27 | from detectron2.data.datasets.coco import load_sem_seg, register_coco_instances 28 | # from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated 29 | # from detectron2.data.datasets.lvis import get_lvis_instances_meta, register_lvis_instances 30 | # from .pascal_voc import register_pascal_voc 31 | 32 | from .lvis import get_lvis_instances_meta, register_lvis_instances_w_PLs, register_lvis_instances 33 | 34 | # ==== Predefined datasets and splits for COCO ========== 35 | 36 | _PREDEFINED_SPLITS_COCO = {} 37 | # _PREDEFINED_SPLITS_COCO["coco"] = { 38 | # "coco_2014_train": ("coco/train2014", "coco/annotations/instances_train2014.json"), 39 | # "coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"), 40 | # "coco_2014_minival": ("coco/val2014", "coco/annotations/instances_minival2014.json"), 41 | # "coco_2014_minival_100": ("coco/val2014", "coco/annotations/instances_minival2014_100.json"), 42 | # "coco_2014_valminusminival": ( 43 | # "coco/val2014", 44 | # "coco/annotations/instances_valminusminival2014.json", 45 | # ), 46 | # "coco_2017_train": ("coco/train2017", "coco/annotations/instances_train2017.json"), 47 | # "coco_2017_val": ("coco/val2017", "coco/annotations/instances_val2017.json"), 48 | # "coco_2017_test": ("coco/test2017", "coco/annotations/image_info_test2017.json"), 49 | # "coco_2017_test-dev": ("coco/test2017", "coco/annotations/image_info_test-dev2017.json"), 50 | # "coco_2017_val_100": ("coco/val2017", "coco/annotations/instances_val2017_100.json"), 51 | # } 52 | _PREDEFINED_SPLITS_COCO["coco_ovd"] = { 53 | "coco_2017_ovd_all_train": ("coco/train2017", "coco/annotations/ovd_ins_train2017_all.json"), 54 | "coco_2017_ovd_b_train": ("coco/train2017", "coco/annotations/ovd_ins_train2017_b.json"), 55 | "coco_2017_ovd_b_train_65cats": ("coco/train2017", "coco/annotations/ovd_ins_train2017_b_65cats.json"), 56 | "coco_2017_ovd_b_train_65cats_all_images": ("coco/train2017", "coco/annotations/ovd_ins_train2017_b_65cats_all_images.json"), 57 | "coco_2017_ovd_t_train": ("coco/train2017", "coco/annotations/ovd_ins_train2017_t.json"), 58 | # 59 | "coco_2017_ovd_all_test": ("coco/val2017", "coco/annotations/ovd_ins_val2017_all.json"), 60 | "coco_2017_ovd_b_test": ("coco/val2017", "coco/annotations/ovd_ins_val2017_b.json"), 61 | "coco_2017_ovd_t_test": ("coco/val2017", "coco/annotations/ovd_ins_val2017_t.json"), 62 | # 63 | "coco_2017_ovd_retain_val": ("coco/val2017", "coco/annotations/ovd_ins_val2017_retain_15.json"), 64 | } 65 | 66 | 67 | def register_all_coco(root): 68 | for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO.items(): 69 | if dataset_name == 'coco_ovd': # for zero-shot split 70 | for key, (image_root, json_file) in splits_per_dataset.items(): 71 | # Assume pre-defined datasets live in `./datasets`. 
72 | register_coco_instances( 73 | key, 74 | {}, # empty metadata, it will be overwritten in load_coco_json() function 75 | os.path.join(root, json_file) if "://" not in json_file else json_file, 76 | os.path.join(root, image_root), 77 | ) 78 | else: # default splits 79 | for key, (image_root, json_file) in splits_per_dataset.items(): 80 | # Assume pre-defined datasets live in `./datasets`. 81 | register_coco_instances( 82 | key, 83 | _get_builtin_metadata(dataset_name), 84 | os.path.join(root, json_file) if "://" not in json_file else json_file, 85 | os.path.join(root, image_root), 86 | ) 87 | 88 | 89 | # ==== Predefined datasets and splits for LVIS ========== 90 | 91 | _PREDEFINED_SPLITS_LVIS = { 92 | # # openset setting 93 | # "lvis_v1": { 94 | # "lvis_v1_train": ("coco/", "lvis/lvis_v1_train.json"), 95 | # "lvis_v1_train_p0": ("coco/", "lvis/lvis_v1_train_p0.json"), 96 | # "lvis_v1_train_p1": ("coco/", "lvis/lvis_v1_train_p1.json"), 97 | # "lvis_v1_train_p2": ("coco/", "lvis/lvis_v1_train_p2.json"), 98 | # "lvis_v1_train_p3": ("coco/", "lvis/lvis_v1_train_p3.json"), 99 | # # 100 | # "lvis_v1_val": ("coco/", "lvis/lvis_v1_val.json"), 101 | # "lvis_v1_test_dev": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"), 102 | # "lvis_v1_test_challenge": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"), 103 | # }, 104 | # custom image setting 105 | "lvis_v1_custom_img": { 106 | "lvis_v1_train_custom_img": ("coco/", "lvis/lvis_v1_train.json"), 107 | "lvis_v1_val_custom_img": ("coco/", "lvis/lvis_v1_val.json"), 108 | "lvis_v1_test_dev_custom_img": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"), 109 | "lvis_v1_test_challenge_custom_img": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"), 110 | }, 111 | # regular fully supervised setting 112 | "lvis_v1_fullysup": { 113 | "lvis_v1_train_fullysup": ("coco/", "lvis/lvis_v1_train.json"), 114 | "lvis_v1_val_fullysup": ("coco/", "lvis/lvis_v1_val.json"), 115 | "lvis_v1_test_dev_fullysup": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"), 116 | "lvis_v1_test_challenge_fullysup": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"), 117 | # 118 | "lvis_v1_train_base_1203cats": ("coco/", "lvis/lvis_v1_train_baseOnly.json"), 119 | "lvis_v1_val_1@10": ("coco/", "lvis/lvis_v1_val_1@10.json"), 120 | }, 121 | # PLs for ensemble by zsy 122 | "lvis_v1_PLs": { 123 | "lvis_v1_train_base_PLs_r50x4": ("coco/", "lvis/regionclip_PLs/inst_train_defRegCLIPr50x4_PLs_93.json"), 124 | "lvis_v1_train_SASDet_r50x4_PLs": ("coco/", "lvis/regionclip_PLs/lvis_v1_train_SASDet_r50x4_PLs_t62.json"), 125 | "lvis_v1_o365_SASDet_r50x4_PLs": ("Objects365/train", "Objects365/regionclip_PLs/zsy_objv1_train_SASDet_r50x4_PLs_t83.json"), 126 | } 127 | } 128 | 129 | 130 | def register_all_lvis(root): 131 | for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items(): 132 | if dataset_name == "lvis_v1_PLs": 133 | for key, (image_root, json_file) in splits_per_dataset.items(): 134 | register_lvis_instances_w_PLs( 135 | key, 136 | get_lvis_instances_meta(dataset_name), # TODO: meta for PLs, category order is rearranged 137 | os.path.join(root, json_file) if "://" not in json_file else json_file, 138 | os.path.join(root, image_root), 139 | ) 140 | else: 141 | for key, (image_root, json_file) in splits_per_dataset.items(): 142 | if dataset_name == "lvis_v1": 143 | args = {'filter_open_cls': True, 'run_custom_img': False} 144 | elif dataset_name == 'lvis_v1_custom_img': 145 | args = {'filter_open_cls': False, 'run_custom_img': True} 146 | elif dataset_name == 
'lvis_v1_fullysup': 147 | args = {'filter_open_cls': False, 'run_custom_img': False} 148 | register_lvis_instances( 149 | key, 150 | get_lvis_instances_meta(dataset_name), 151 | os.path.join(root, json_file) if "://" not in json_file else json_file, 152 | os.path.join(root, image_root), 153 | args, 154 | ) 155 | 156 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 157 | register_all_coco(_root) 158 | register_all_lvis(_root) 159 | 160 | # # True for open source; 161 | # # Internally at fb, we register them elsewhere 162 | # if __name__.endswith(".builtin"): 163 | # # Assume pre-defined datasets live in `./datasets`. 164 | # _root = os.getenv("DETECTRON2_DATASETS", "datasets") 165 | # register_all_coco(_root) 166 | # register_all_lvis(_root) -------------------------------------------------------------------------------- /sas_det/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .cityscapes_evaluation import CityscapesInstanceEvaluator, CityscapesSemSegEvaluator 3 | from .coco_evaluation import COCOEvaluator 4 | from .rotated_coco_evaluation import RotatedCOCOEvaluator 5 | from .evaluator import DatasetEvaluator, DatasetEvaluators, inference_context, inference_on_dataset 6 | from .lvis_evaluation import LVISEvaluator 7 | from .panoptic_evaluation import COCOPanopticEvaluator 8 | from .pascal_voc_evaluation import PascalVOCDetectionEvaluator 9 | from .sem_seg_evaluation import SemSegEvaluator 10 | from .testing import print_csv_format, verify_results 11 | 12 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 13 | -------------------------------------------------------------------------------- /sas_det/evaluation/cityscapes_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import glob 3 | import logging 4 | import numpy as np 5 | import os 6 | import tempfile 7 | from collections import OrderedDict 8 | import torch 9 | from PIL import Image 10 | 11 | from detectron2.data import MetadataCatalog 12 | from detectron2.utils import comm 13 | from detectron2.utils.file_io import PathManager 14 | 15 | from .evaluator import DatasetEvaluator 16 | 17 | 18 | class CityscapesEvaluator(DatasetEvaluator): 19 | """ 20 | Base class for evaluation using cityscapes API. 21 | """ 22 | 23 | def __init__(self, dataset_name): 24 | """ 25 | Args: 26 | dataset_name (str): the name of the dataset. 27 | It must have the following metadata associated with it: 28 | "thing_classes", "gt_dir". 29 | """ 30 | self._metadata = MetadataCatalog.get(dataset_name) 31 | self._cpu_device = torch.device("cpu") 32 | self._logger = logging.getLogger(__name__) 33 | 34 | def reset(self): 35 | self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_") 36 | self._temp_dir = self._working_dir.name 37 | # All workers will write to the same results directory 38 | # TODO this does not work in distributed training 39 | self._temp_dir = comm.all_gather(self._temp_dir)[0] 40 | if self._temp_dir != self._working_dir.name: 41 | self._working_dir.cleanup() 42 | self._logger.info( 43 | "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir) 44 | ) 45 | 46 | 47 | class CityscapesInstanceEvaluator(CityscapesEvaluator): 48 | """ 49 | Evaluate instance segmentation results on cityscapes dataset using cityscapes API. 
50 | 51 | Note: 52 | * It does not work in multi-machine distributed training. 53 | * It contains a synchronization, therefore has to be used on all ranks. 54 | * Only the main process runs evaluation. 55 | """ 56 | 57 | def process(self, inputs, outputs): 58 | from cityscapesscripts.helpers.labels import name2label 59 | 60 | for input, output in zip(inputs, outputs): 61 | file_name = input["file_name"] 62 | basename = os.path.splitext(os.path.basename(file_name))[0] 63 | pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt") 64 | 65 | if "instances" in output: 66 | output = output["instances"].to(self._cpu_device) 67 | num_instances = len(output) 68 | with open(pred_txt, "w") as fout: 69 | for i in range(num_instances): 70 | pred_class = output.pred_classes[i] 71 | classes = self._metadata.thing_classes[pred_class] 72 | class_id = name2label[classes].id 73 | score = output.scores[i] 74 | mask = output.pred_masks[i].numpy().astype("uint8") 75 | png_filename = os.path.join( 76 | self._temp_dir, basename + "_{}_{}.png".format(i, classes) 77 | ) 78 | 79 | Image.fromarray(mask * 255).save(png_filename) 80 | fout.write( 81 | "{} {} {}\n".format(os.path.basename(png_filename), class_id, score) 82 | ) 83 | else: 84 | # Cityscapes requires a prediction file for every ground truth image. 85 | with open(pred_txt, "w") as fout: 86 | pass 87 | 88 | def evaluate(self): 89 | """ 90 | Returns: 91 | dict: has a key "segm", whose value is a dict of "AP" and "AP50". 92 | """ 93 | comm.synchronize() 94 | if comm.get_rank() > 0: 95 | return 96 | import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval 97 | 98 | self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) 99 | 100 | # set some global states in cityscapes evaluation API, before evaluating 101 | cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) 102 | cityscapes_eval.args.predictionWalk = None 103 | cityscapes_eval.args.JSONOutput = False 104 | cityscapes_eval.args.colorized = False 105 | cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json") 106 | 107 | # These lines are adopted from 108 | # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa 109 | gt_dir = PathManager.get_local_path(self._metadata.gt_dir) 110 | groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png")) 111 | assert len( 112 | groundTruthImgList 113 | ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( 114 | cityscapes_eval.args.groundTruthSearch 115 | ) 116 | predictionImgList = [] 117 | for gt in groundTruthImgList: 118 | predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args)) 119 | results = cityscapes_eval.evaluateImgLists( 120 | predictionImgList, groundTruthImgList, cityscapes_eval.args 121 | )["averages"] 122 | 123 | ret = OrderedDict() 124 | ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100} 125 | self._working_dir.cleanup() 126 | return ret 127 | 128 | 129 | class CityscapesSemSegEvaluator(CityscapesEvaluator): 130 | """ 131 | Evaluate semantic segmentation results on cityscapes dataset using cityscapes API. 132 | 133 | Note: 134 | * It does not work in multi-machine distributed training. 135 | * It contains a synchronization, therefore has to be used on all ranks. 136 | * Only the main process runs evaluation. 
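    * It requires the cityscapesscripts package, and CITYSCAPES_DATASET must be set
      before :meth:`evaluate` imports the evaluation script (see the comment there).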
137 | """ 138 | 139 | def process(self, inputs, outputs): 140 | from cityscapesscripts.helpers.labels import trainId2label 141 | 142 | for input, output in zip(inputs, outputs): 143 | file_name = input["file_name"] 144 | basename = os.path.splitext(os.path.basename(file_name))[0] 145 | pred_filename = os.path.join(self._temp_dir, basename + "_pred.png") 146 | 147 | output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy() 148 | pred = 255 * np.ones(output.shape, dtype=np.uint8) 149 | for train_id, label in trainId2label.items(): 150 | if label.ignoreInEval: 151 | continue 152 | pred[output == train_id] = label.id 153 | Image.fromarray(pred).save(pred_filename) 154 | 155 | def evaluate(self): 156 | comm.synchronize() 157 | if comm.get_rank() > 0: 158 | return 159 | # Load the Cityscapes eval script *after* setting the required env var, 160 | # since the script reads CITYSCAPES_DATASET into global variables at load time. 161 | import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval 162 | 163 | self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) 164 | 165 | # set some global states in cityscapes evaluation API, before evaluating 166 | cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) 167 | cityscapes_eval.args.predictionWalk = None 168 | cityscapes_eval.args.JSONOutput = False 169 | cityscapes_eval.args.colorized = False 170 | 171 | # These lines are adopted from 172 | # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa 173 | gt_dir = PathManager.get_local_path(self._metadata.gt_dir) 174 | groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png")) 175 | assert len( 176 | groundTruthImgList 177 | ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( 178 | cityscapes_eval.args.groundTruthSearch 179 | ) 180 | predictionImgList = [] 181 | for gt in groundTruthImgList: 182 | predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt)) 183 | results = cityscapes_eval.evaluateImgLists( 184 | predictionImgList, groundTruthImgList, cityscapes_eval.args 185 | ) 186 | ret = OrderedDict() 187 | ret["sem_seg"] = { 188 | "IoU": 100.0 * results["averageScoreClasses"], 189 | "iIoU": 100.0 * results["averageScoreInstClasses"], 190 | "IoU_sup": 100.0 * results["averageScoreCategories"], 191 | "iIoU_sup": 100.0 * results["averageScoreInstCategories"], 192 | } 193 | self._working_dir.cleanup() 194 | return ret 195 | -------------------------------------------------------------------------------- /sas_det/evaluation/evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import datetime 3 | import logging 4 | import time 5 | from collections import OrderedDict, abc 6 | from contextlib import ExitStack, contextmanager 7 | from typing import List, Union 8 | import torch 9 | from torch import nn 10 | 11 | from detectron2.utils.comm import get_world_size, is_main_process 12 | from detectron2.utils.logger import log_every_n_seconds 13 | 14 | 15 | class DatasetEvaluator: 16 | """ 17 | Base class for a dataset evaluator. 18 | 19 | The function :func:`inference_on_dataset` runs the model over 20 | all samples in the dataset, and have a DatasetEvaluator to process the inputs/outputs. 
21 | 22 | This class will accumulate information of the inputs/outputs (by :meth:`process`), 23 | and produce evaluation results in the end (by :meth:`evaluate`). 24 | """ 25 | 26 | def reset(self): 27 | """ 28 | Preparation for a new round of evaluation. 29 | Should be called before starting a round of evaluation. 30 | """ 31 | pass 32 | 33 | def process(self, inputs, outputs): 34 | """ 35 | Process the pair of inputs and outputs. 36 | If they contain batches, the pairs can be consumed one-by-one using `zip`: 37 | 38 | .. code-block:: python 39 | 40 | for input_, output in zip(inputs, outputs): 41 | # do evaluation on single input/output pair 42 | ... 43 | 44 | Args: 45 | inputs (list): the inputs that's used to call the model. 46 | outputs (list): the return value of `model(inputs)` 47 | """ 48 | pass 49 | 50 | def evaluate(self): 51 | """ 52 | Evaluate/summarize the performance, after processing all input/output pairs. 53 | 54 | Returns: 55 | dict: 56 | A new evaluator class can return a dict of arbitrary format 57 | as long as the user can process the results. 58 | In our train_net.py, we expect the following format: 59 | 60 | * key: the name of the task (e.g., bbox) 61 | * value: a dict of {metric name: score}, e.g.: {"AP50": 80} 62 | """ 63 | pass 64 | 65 | 66 | class DatasetEvaluators(DatasetEvaluator): 67 | """ 68 | Wrapper class to combine multiple :class:`DatasetEvaluator` instances. 69 | 70 | This class dispatches every evaluation call to 71 | all of its :class:`DatasetEvaluator`. 72 | """ 73 | 74 | def __init__(self, evaluators): 75 | """ 76 | Args: 77 | evaluators (list): the evaluators to combine. 78 | """ 79 | super().__init__() 80 | self._evaluators = evaluators 81 | 82 | def reset(self): 83 | for evaluator in self._evaluators: 84 | evaluator.reset() 85 | 86 | def process(self, inputs, outputs): 87 | for evaluator in self._evaluators: 88 | evaluator.process(inputs, outputs) 89 | 90 | def evaluate(self): 91 | results = OrderedDict() 92 | for evaluator in self._evaluators: 93 | result = evaluator.evaluate() 94 | if is_main_process() and result is not None: 95 | for k, v in result.items(): 96 | assert ( 97 | k not in results 98 | ), "Different evaluators produce results with the same key {}".format(k) 99 | results[k] = v 100 | return results 101 | 102 | 103 | def inference_on_dataset( 104 | model, data_loader, evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None] 105 | ): 106 | """ 107 | Run model on the data_loader and evaluate the metrics with evaluator. 108 | Also benchmark the inference speed of `model.__call__` accurately. 109 | The model will be used in eval mode. 110 | 111 | Args: 112 | model (callable): a callable which takes an object from 113 | `data_loader` and returns some outputs. 114 | 115 | If it's an nn.Module, it will be temporarily set to `eval` mode. 116 | If you wish to evaluate a model in `training` mode instead, you can 117 | wrap the given model and override its behavior of `.eval()` and `.train()`. 118 | data_loader: an iterable object with a length. 119 | The elements it generates will be the inputs to the model. 120 | evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark, 121 | but don't want to do any evaluation. 
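            A minimal usage sketch (the dataset name, loader and model are placeholders):

            .. code-block:: python

                evaluator = COCOEvaluator("coco_2017_ovd_all_test")
                results = inference_on_dataset(model, val_loader, evaluator)
                # e.g. results["bbox"]["AP50"] gives box AP at IoU 0.5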
122 | 123 | Returns: 124 | The return value of `evaluator.evaluate()` 125 | """ 126 | num_devices = get_world_size() 127 | logger = logging.getLogger(__name__) 128 | logger.info("Start inference on {} batches".format(len(data_loader))) 129 | 130 | total = len(data_loader) # inference data loader must have a fixed length 131 | if evaluator is None: 132 | # create a no-op evaluator 133 | evaluator = DatasetEvaluators([]) 134 | if isinstance(evaluator, abc.MutableSequence): 135 | evaluator = DatasetEvaluators(evaluator) 136 | evaluator.reset() 137 | 138 | num_warmup = min(5, total - 1) 139 | start_time = time.perf_counter() 140 | total_data_time = 0 141 | total_compute_time = 0 142 | total_eval_time = 0 143 | with ExitStack() as stack: 144 | if isinstance(model, nn.Module): 145 | stack.enter_context(inference_context(model)) 146 | stack.enter_context(torch.no_grad()) 147 | 148 | start_data_time = time.perf_counter() 149 | for idx, inputs in enumerate(data_loader): 150 | total_data_time += time.perf_counter() - start_data_time 151 | if idx == num_warmup: 152 | start_time = time.perf_counter() 153 | total_data_time = 0 154 | total_compute_time = 0 155 | total_eval_time = 0 156 | 157 | start_compute_time = time.perf_counter() 158 | outputs = model(inputs) 159 | if torch.cuda.is_available(): 160 | torch.cuda.synchronize() 161 | total_compute_time += time.perf_counter() - start_compute_time 162 | 163 | start_eval_time = time.perf_counter() 164 | evaluator.process(inputs, outputs) 165 | total_eval_time += time.perf_counter() - start_eval_time 166 | 167 | iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) 168 | data_seconds_per_iter = total_data_time / iters_after_start 169 | compute_seconds_per_iter = total_compute_time / iters_after_start 170 | eval_seconds_per_iter = total_eval_time / iters_after_start 171 | total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start 172 | if idx >= num_warmup * 2 or compute_seconds_per_iter > 5: 173 | eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1))) 174 | log_every_n_seconds( 175 | logging.INFO, 176 | ( 177 | f"Inference done {idx + 1}/{total}. " 178 | f"Dataloading: {data_seconds_per_iter:.4f} s / iter. " 179 | f"Inference: {compute_seconds_per_iter:.4f} s / iter. " 180 | f"Eval: {eval_seconds_per_iter:.4f} s / iter. " 181 | f"Total: {total_seconds_per_iter:.4f} s / iter. " 182 | f"ETA={eta}" 183 | ), 184 | n=5, 185 | ) 186 | start_data_time = time.perf_counter() 187 | 188 | # Measure the time only for this worker (before the synchronization barrier) 189 | total_time = time.perf_counter() - start_time 190 | total_time_str = str(datetime.timedelta(seconds=total_time)) 191 | # NOTE this format is parsed by grep 192 | logger.info( 193 | "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format( 194 | total_time_str, total_time / (total - num_warmup), num_devices 195 | ) 196 | ) 197 | total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time))) 198 | logger.info( 199 | "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format( 200 | total_compute_time_str, total_compute_time / (total - num_warmup), num_devices 201 | ) 202 | ) 203 | 204 | results = evaluator.evaluate() 205 | # An evaluator may return None when not in main process. 
206 | # Replace it by an empty dict instead to make it easier for downstream code to handle 207 | if results is None: 208 | results = {} 209 | return results 210 | 211 | 212 | @contextmanager 213 | def inference_context(model): 214 | """ 215 | A context where the model is temporarily changed to eval mode, 216 | and restored to previous mode afterwards. 217 | 218 | Args: 219 | model: a torch Module 220 | """ 221 | training_mode = model.training 222 | model.eval() 223 | yield 224 | model.train(training_mode) 225 | -------------------------------------------------------------------------------- /sas_det/evaluation/fast_eval_api.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | import numpy as np 5 | import time 6 | from pycocotools.cocoeval import COCOeval 7 | 8 | from detectron2 import _C 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class COCOeval_opt(COCOeval): 14 | """ 15 | This is a slightly modified version of the original COCO API, where the functions evaluateImg() 16 | and accumulate() are implemented in C++ to speedup evaluation 17 | """ 18 | 19 | def evaluate(self): 20 | """ 21 | Run per image evaluation on given images and store results in self.evalImgs_cpp, a 22 | datastructure that isn't readable from Python but is used by a c++ implementation of 23 | accumulate(). Unlike the original COCO PythonAPI, we don't populate the datastructure 24 | self.evalImgs because this datastructure is a computational bottleneck. 25 | :return: None 26 | """ 27 | tic = time.time() 28 | 29 | p = self.params 30 | # add backward compatibility if useSegm is specified in params 31 | if p.useSegm is not None: 32 | p.iouType = "segm" if p.useSegm == 1 else "bbox" 33 | logger.info("Evaluate annotation type *{}*".format(p.iouType)) 34 | p.imgIds = list(np.unique(p.imgIds)) 35 | if p.useCats: 36 | p.catIds = list(np.unique(p.catIds)) 37 | p.maxDets = sorted(p.maxDets) 38 | self.params = p 39 | 40 | self._prepare() # bottleneck 41 | 42 | # loop through images, area range, max detection number 43 | catIds = p.catIds if p.useCats else [-1] 44 | 45 | if p.iouType == "segm" or p.iouType == "bbox": 46 | computeIoU = self.computeIoU 47 | elif p.iouType == "keypoints": 48 | computeIoU = self.computeOks 49 | self.ious = { 50 | (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds 51 | } # bottleneck 52 | 53 | maxDet = p.maxDets[-1] 54 | 55 | # <<<< Beginning of code differences with original COCO API 56 | def convert_instances_to_cpp(instances, is_det=False): 57 | # Convert annotations for a list of instances in an image to a format that's fast 58 | # to access in C++ 59 | instances_cpp = [] 60 | for instance in instances: 61 | instance_cpp = _C.InstanceAnnotation( 62 | int(instance["id"]), 63 | instance["score"] if is_det else instance.get("score", 0.0), 64 | instance["area"], 65 | bool(instance.get("iscrowd", 0)), 66 | bool(instance.get("ignore", 0)), 67 | ) 68 | instances_cpp.append(instance_cpp) 69 | return instances_cpp 70 | 71 | # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++ 72 | ground_truth_instances = [ 73 | [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds] 74 | for imgId in p.imgIds 75 | ] 76 | detected_instances = [ 77 | [convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) for catId in p.catIds] 78 | for imgId in p.imgIds 79 | ] 80 | ious = [[self.ious[imgId, 
catId] for catId in catIds] for imgId in p.imgIds] 81 | 82 | if not p.useCats: 83 | # For each image, flatten per-category lists into a single list 84 | ground_truth_instances = [[[o for c in i for o in c]] for i in ground_truth_instances] 85 | detected_instances = [[[o for c in i for o in c]] for i in detected_instances] 86 | 87 | # Call C++ implementation of self.evaluateImgs() 88 | self._evalImgs_cpp = _C.COCOevalEvaluateImages( 89 | p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances 90 | ) 91 | self._evalImgs = None 92 | 93 | self._paramsEval = copy.deepcopy(self.params) 94 | toc = time.time() 95 | logger.info("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic)) 96 | # >>>> End of code differences with original COCO API 97 | 98 | def accumulate(self): 99 | """ 100 | Accumulate per image evaluation results and store the result in self.eval. Does not 101 | support changing parameter settings from those used by self.evaluate() 102 | """ 103 | logger.info("Accumulating evaluation results...") 104 | tic = time.time() 105 | assert hasattr( 106 | self, "_evalImgs_cpp" 107 | ), "evaluate() must be called before accumulate() is called." 108 | 109 | self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp) 110 | 111 | # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections 112 | self.eval["recall"] = np.array(self.eval["recall"]).reshape( 113 | self.eval["counts"][:1] + self.eval["counts"][2:] 114 | ) 115 | 116 | # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X 117 | # num_area_ranges X num_max_detections 118 | self.eval["precision"] = np.array(self.eval["precision"]).reshape(self.eval["counts"]) 119 | self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"]) 120 | toc = time.time() 121 | logger.info("COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic)) 122 | -------------------------------------------------------------------------------- /sas_det/evaluation/panoptic_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import io 4 | import itertools 5 | import json 6 | import logging 7 | import numpy as np 8 | import os 9 | import tempfile 10 | from collections import OrderedDict 11 | from typing import Optional 12 | from PIL import Image 13 | from tabulate import tabulate 14 | 15 | from detectron2.data import MetadataCatalog 16 | from detectron2.utils import comm 17 | from detectron2.utils.file_io import PathManager 18 | 19 | from .evaluator import DatasetEvaluator 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class COCOPanopticEvaluator(DatasetEvaluator): 25 | """ 26 | Evaluate Panoptic Quality metrics on COCO using PanopticAPI. 27 | It saves panoptic segmentation predictions in `output_dir` 28 | 29 | It contains a synchronize call and has to be called from all workers. 30 | """ 31 | 32 | def __init__(self, dataset_name: str, output_dir: Optional[str] = None): 33 | """ 34 | Args: 35 | dataset_name: name of the dataset 36 | output_dir: output directory to save results for evaluation. 
37 | """ 38 | self._metadata = MetadataCatalog.get(dataset_name) 39 | self._thing_contiguous_id_to_dataset_id = { 40 | v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() 41 | } 42 | self._stuff_contiguous_id_to_dataset_id = { 43 | v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items() 44 | } 45 | 46 | self._output_dir = output_dir 47 | if self._output_dir is not None: 48 | PathManager.mkdirs(self._output_dir) 49 | 50 | def reset(self): 51 | self._predictions = [] 52 | 53 | def _convert_category_id(self, segment_info): 54 | isthing = segment_info.pop("isthing", None) 55 | if isthing is None: 56 | # the model produces panoptic category id directly. No more conversion needed 57 | return segment_info 58 | if isthing is True: 59 | segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[ 60 | segment_info["category_id"] 61 | ] 62 | else: 63 | segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[ 64 | segment_info["category_id"] 65 | ] 66 | return segment_info 67 | 68 | def process(self, inputs, outputs): 69 | from panopticapi.utils import id2rgb 70 | 71 | for input, output in zip(inputs, outputs): 72 | panoptic_img, segments_info = output["panoptic_seg"] 73 | panoptic_img = panoptic_img.cpu().numpy() 74 | if segments_info is None: 75 | # If "segments_info" is None, we assume "panoptic_img" is a 76 | # H*W int32 image storing the panoptic_id in the format of 77 | # category_id * label_divisor + instance_id. We reserve -1 for 78 | # VOID label, and add 1 to panoptic_img since the official 79 | # evaluation script uses 0 for VOID label. 80 | label_divisor = self._metadata.label_divisor 81 | segments_info = [] 82 | for panoptic_label in np.unique(panoptic_img): 83 | if panoptic_label == -1: 84 | # VOID region. 85 | continue 86 | pred_class = panoptic_label // label_divisor 87 | isthing = ( 88 | pred_class in self._metadata.thing_dataset_id_to_contiguous_id.values() 89 | ) 90 | segments_info.append( 91 | { 92 | "id": int(panoptic_label) + 1, 93 | "category_id": int(pred_class), 94 | "isthing": bool(isthing), 95 | } 96 | ) 97 | # Official evaluation script uses 0 for VOID label. 
98 | panoptic_img += 1 99 | 100 | file_name = os.path.basename(input["file_name"]) 101 | file_name_png = os.path.splitext(file_name)[0] + ".png" 102 | with io.BytesIO() as out: 103 | Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG") 104 | segments_info = [self._convert_category_id(x) for x in segments_info] 105 | self._predictions.append( 106 | { 107 | "image_id": input["image_id"], 108 | "file_name": file_name_png, 109 | "png_string": out.getvalue(), 110 | "segments_info": segments_info, 111 | } 112 | ) 113 | 114 | def evaluate(self): 115 | comm.synchronize() 116 | 117 | self._predictions = comm.gather(self._predictions) 118 | self._predictions = list(itertools.chain(*self._predictions)) 119 | if not comm.is_main_process(): 120 | return 121 | 122 | # PanopticApi requires local files 123 | gt_json = PathManager.get_local_path(self._metadata.panoptic_json) 124 | gt_folder = PathManager.get_local_path(self._metadata.panoptic_root) 125 | 126 | with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir: 127 | logger.info("Writing all panoptic predictions to {} ...".format(pred_dir)) 128 | for p in self._predictions: 129 | with open(os.path.join(pred_dir, p["file_name"]), "wb") as f: 130 | f.write(p.pop("png_string")) 131 | 132 | with open(gt_json, "r") as f: 133 | json_data = json.load(f) 134 | json_data["annotations"] = self._predictions 135 | 136 | output_dir = self._output_dir or pred_dir 137 | predictions_json = os.path.join(output_dir, "predictions.json") 138 | with PathManager.open(predictions_json, "w") as f: 139 | f.write(json.dumps(json_data)) 140 | 141 | from panopticapi.evaluation import pq_compute 142 | 143 | with contextlib.redirect_stdout(io.StringIO()): 144 | pq_res = pq_compute( 145 | gt_json, 146 | PathManager.get_local_path(predictions_json), 147 | gt_folder=gt_folder, 148 | pred_folder=pred_dir, 149 | ) 150 | 151 | res = {} 152 | res["PQ"] = 100 * pq_res["All"]["pq"] 153 | res["SQ"] = 100 * pq_res["All"]["sq"] 154 | res["RQ"] = 100 * pq_res["All"]["rq"] 155 | res["PQ_th"] = 100 * pq_res["Things"]["pq"] 156 | res["SQ_th"] = 100 * pq_res["Things"]["sq"] 157 | res["RQ_th"] = 100 * pq_res["Things"]["rq"] 158 | res["PQ_st"] = 100 * pq_res["Stuff"]["pq"] 159 | res["SQ_st"] = 100 * pq_res["Stuff"]["sq"] 160 | res["RQ_st"] = 100 * pq_res["Stuff"]["rq"] 161 | 162 | results = OrderedDict({"panoptic_seg": res}) 163 | _print_panoptic_results(pq_res) 164 | 165 | return results 166 | 167 | 168 | def _print_panoptic_results(pq_res): 169 | headers = ["", "PQ", "SQ", "RQ", "#categories"] 170 | data = [] 171 | for name in ["All", "Things", "Stuff"]: 172 | row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]] 173 | data.append(row) 174 | table = tabulate( 175 | data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center" 176 | ) 177 | logger.info("Panoptic Evaluation Results:\n" + table) 178 | 179 | 180 | if __name__ == "__main__": 181 | from detectron2.utils.logger import setup_logger 182 | 183 | logger = setup_logger() 184 | import argparse 185 | 186 | parser = argparse.ArgumentParser() 187 | parser.add_argument("--gt-json") 188 | parser.add_argument("--gt-dir") 189 | parser.add_argument("--pred-json") 190 | parser.add_argument("--pred-dir") 191 | args = parser.parse_args() 192 | 193 | from panopticapi.evaluation import pq_compute 194 | 195 | with contextlib.redirect_stdout(io.StringIO()): 196 | pq_res = pq_compute( 197 | args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir 198 
| ) 199 | _print_panoptic_results(pq_res) 200 | -------------------------------------------------------------------------------- /sas_det/evaluation/rotated_coco_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import itertools 3 | import json 4 | import numpy as np 5 | import os 6 | import torch 7 | from pycocotools.cocoeval import COCOeval, maskUtils 8 | 9 | from detectron2.structures import BoxMode, RotatedBoxes, pairwise_iou_rotated 10 | from detectron2.utils.file_io import PathManager 11 | 12 | from .coco_evaluation import COCOEvaluator 13 | 14 | 15 | class RotatedCOCOeval(COCOeval): 16 | @staticmethod 17 | def is_rotated(box_list): 18 | if type(box_list) == np.ndarray: 19 | return box_list.shape[1] == 5 20 | elif type(box_list) == list: 21 | if box_list == []: # cannot decide the box_dim 22 | return False 23 | return np.all( 24 | np.array( 25 | [ 26 | (len(obj) == 5) and ((type(obj) == list) or (type(obj) == np.ndarray)) 27 | for obj in box_list 28 | ] 29 | ) 30 | ) 31 | return False 32 | 33 | @staticmethod 34 | def boxlist_to_tensor(boxlist, output_box_dim): 35 | if type(boxlist) == np.ndarray: 36 | box_tensor = torch.from_numpy(boxlist) 37 | elif type(boxlist) == list: 38 | if boxlist == []: 39 | return torch.zeros((0, output_box_dim), dtype=torch.float32) 40 | else: 41 | box_tensor = torch.FloatTensor(boxlist) 42 | else: 43 | raise Exception("Unrecognized boxlist type") 44 | 45 | input_box_dim = box_tensor.shape[1] 46 | if input_box_dim != output_box_dim: 47 | if input_box_dim == 4 and output_box_dim == 5: 48 | box_tensor = BoxMode.convert(box_tensor, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS) 49 | else: 50 | raise Exception( 51 | "Unable to convert from {}-dim box to {}-dim box".format( 52 | input_box_dim, output_box_dim 53 | ) 54 | ) 55 | return box_tensor 56 | 57 | def compute_iou_dt_gt(self, dt, gt, is_crowd): 58 | if self.is_rotated(dt) or self.is_rotated(gt): 59 | # TODO: take is_crowd into consideration 60 | assert all(c == 0 for c in is_crowd) 61 | dt = RotatedBoxes(self.boxlist_to_tensor(dt, output_box_dim=5)) 62 | gt = RotatedBoxes(self.boxlist_to_tensor(gt, output_box_dim=5)) 63 | return pairwise_iou_rotated(dt, gt) 64 | else: 65 | # This is the same as the classical COCO evaluation 66 | return maskUtils.iou(dt, gt, is_crowd) 67 | 68 | def computeIoU(self, imgId, catId): 69 | p = self.params 70 | if p.useCats: 71 | gt = self._gts[imgId, catId] 72 | dt = self._dts[imgId, catId] 73 | else: 74 | gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] 75 | dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] 76 | if len(gt) == 0 and len(dt) == 0: 77 | return [] 78 | inds = np.argsort([-d["score"] for d in dt], kind="mergesort") 79 | dt = [dt[i] for i in inds] 80 | if len(dt) > p.maxDets[-1]: 81 | dt = dt[0 : p.maxDets[-1]] 82 | 83 | assert p.iouType == "bbox", "unsupported iouType for iou computation" 84 | 85 | g = [g["bbox"] for g in gt] 86 | d = [d["bbox"] for d in dt] 87 | 88 | # compute iou between each dt and gt region 89 | iscrowd = [int(o["iscrowd"]) for o in gt] 90 | 91 | # Note: this function is copied from cocoeval.py in cocoapi 92 | # and the major difference is here. 93 | ious = self.compute_iou_dt_gt(d, g, iscrowd) 94 | return ious 95 | 96 | 97 | class RotatedCOCOEvaluator(COCOEvaluator): 98 | """ 99 | Evaluate object proposal/instance detection outputs using COCO-like metrics and APIs, 100 | with rotated boxes support. 
101 | Note: this uses IOU only and does not consider angle differences. 102 | """ 103 | 104 | def process(self, inputs, outputs): 105 | """ 106 | Args: 107 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). 108 | It is a list of dict. Each dict corresponds to an image and 109 | contains keys like "height", "width", "file_name", "image_id". 110 | outputs: the outputs of a COCO model. It is a list of dicts with key 111 | "instances" that contains :class:`Instances`. 112 | """ 113 | for input, output in zip(inputs, outputs): 114 | prediction = {"image_id": input["image_id"]} 115 | 116 | if "instances" in output: 117 | instances = output["instances"].to(self._cpu_device) 118 | 119 | prediction["instances"] = self.instances_to_json(instances, input["image_id"]) 120 | if "proposals" in output: 121 | prediction["proposals"] = output["proposals"].to(self._cpu_device) 122 | self._predictions.append(prediction) 123 | 124 | def instances_to_json(self, instances, img_id): 125 | num_instance = len(instances) 126 | if num_instance == 0: 127 | return [] 128 | 129 | boxes = instances.pred_boxes.tensor.numpy() 130 | if boxes.shape[1] == 4: 131 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) 132 | boxes = boxes.tolist() 133 | scores = instances.scores.tolist() 134 | classes = instances.pred_classes.tolist() 135 | 136 | results = [] 137 | for k in range(num_instance): 138 | result = { 139 | "image_id": img_id, 140 | "category_id": classes[k], 141 | "bbox": boxes[k], 142 | "score": scores[k], 143 | } 144 | 145 | results.append(result) 146 | return results 147 | 148 | def _eval_predictions(self, predictions, img_ids=None): # img_ids: unused 149 | """ 150 | Evaluate predictions on the given tasks. 151 | Fill self._results with the metrics of the tasks. 152 | """ 153 | self._logger.info("Preparing results for COCO format ...") 154 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 155 | 156 | # unmap the category ids for COCO 157 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 158 | reverse_id_mapping = { 159 | v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() 160 | } 161 | for result in coco_results: 162 | result["category_id"] = reverse_id_mapping[result["category_id"]] 163 | 164 | if self._output_dir: 165 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 166 | self._logger.info("Saving results to {}".format(file_path)) 167 | with PathManager.open(file_path, "w") as f: 168 | f.write(json.dumps(coco_results)) 169 | f.flush() 170 | 171 | if not self._do_evaluation: 172 | self._logger.info("Annotations are not available for evaluation.") 173 | return 174 | 175 | self._logger.info("Evaluating predictions ...") 176 | 177 | assert self._tasks is None or set(self._tasks) == { 178 | "bbox" 179 | }, "[RotatedCOCOEvaluator] Only bbox evaluation is supported" 180 | coco_eval = ( 181 | self._evaluate_predictions_on_coco(self._coco_api, coco_results) 182 | if len(coco_results) > 0 183 | else None # cocoapi does not handle empty results very well 184 | ) 185 | 186 | task = "bbox" 187 | res = self._derive_coco_results( 188 | coco_eval, task, class_names=self._metadata.get("thing_classes") 189 | ) 190 | self._results[task] = res 191 | 192 | def _evaluate_predictions_on_coco(self, coco_gt, coco_results): 193 | """ 194 | Evaluate the coco results using COCOEval API. 
195 | """ 196 | assert len(coco_results) > 0 197 | 198 | coco_dt = coco_gt.loadRes(coco_results) 199 | 200 | # Only bbox is supported for now 201 | coco_eval = RotatedCOCOeval(coco_gt, coco_dt, iouType="bbox") 202 | 203 | coco_eval.evaluate() 204 | coco_eval.accumulate() 205 | coco_eval.summarize() 206 | 207 | return coco_eval 208 | -------------------------------------------------------------------------------- /sas_det/evaluation/sem_seg_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import itertools 3 | import json 4 | import logging 5 | import numpy as np 6 | import os 7 | from collections import OrderedDict 8 | import PIL.Image as Image 9 | import pycocotools.mask as mask_util 10 | import torch 11 | 12 | from detectron2.data import DatasetCatalog, MetadataCatalog 13 | from detectron2.utils.comm import all_gather, is_main_process, synchronize 14 | from detectron2.utils.file_io import PathManager 15 | 16 | from .evaluator import DatasetEvaluator 17 | 18 | 19 | class SemSegEvaluator(DatasetEvaluator): 20 | """ 21 | Evaluate semantic segmentation metrics. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | dataset_name, 27 | distributed=True, 28 | output_dir=None, 29 | *, 30 | num_classes=None, 31 | ignore_label=None, 32 | ): 33 | """ 34 | Args: 35 | dataset_name (str): name of the dataset to be evaluated. 36 | distributed (bool): if True, will collect results from all ranks for evaluation. 37 | Otherwise, will evaluate the results in the current process. 38 | output_dir (str): an output directory to dump results. 39 | num_classes, ignore_label: deprecated argument 40 | """ 41 | self._logger = logging.getLogger(__name__) 42 | if num_classes is not None: 43 | self._logger.warn( 44 | "SemSegEvaluator(num_classes) is deprecated! It should be obtained from metadata." 45 | ) 46 | if ignore_label is not None: 47 | self._logger.warn( 48 | "SemSegEvaluator(ignore_label) is deprecated! It should be obtained from metadata." 49 | ) 50 | self._dataset_name = dataset_name 51 | self._distributed = distributed 52 | self._output_dir = output_dir 53 | 54 | self._cpu_device = torch.device("cpu") 55 | 56 | self.input_file_to_gt_file = { 57 | dataset_record["file_name"]: dataset_record["sem_seg_file_name"] 58 | for dataset_record in DatasetCatalog.get(dataset_name) 59 | } 60 | 61 | meta = MetadataCatalog.get(dataset_name) 62 | # Dict that maps contiguous training ids to COCO category ids 63 | try: 64 | c2d = meta.stuff_dataset_id_to_contiguous_id 65 | self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()} 66 | except AttributeError: 67 | self._contiguous_id_to_dataset_id = None 68 | self._class_names = meta.stuff_classes 69 | self._num_classes = len(meta.stuff_classes) 70 | if num_classes is not None: 71 | assert self._num_classes == num_classes, f"{self._num_classes} != {num_classes}" 72 | self._ignore_label = ignore_label if ignore_label is not None else meta.ignore_label 73 | 74 | def reset(self): 75 | self._conf_matrix = np.zeros((self._num_classes + 1, self._num_classes + 1), dtype=np.int64) 76 | self._predictions = [] 77 | 78 | def process(self, inputs, outputs): 79 | """ 80 | Args: 81 | inputs: the inputs to a model. 82 | It is a list of dicts. Each dict corresponds to an image and 83 | contains keys like "height", "width", "file_name". 84 | outputs: the outputs of a model. 
It is either list of semantic segmentation predictions 85 | (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic 86 | segmentation prediction in the same format. 87 | """ 88 | for input, output in zip(inputs, outputs): 89 | output = output["sem_seg"].argmax(dim=0).to(self._cpu_device) 90 | pred = np.array(output, dtype=np.int) 91 | with PathManager.open(self.input_file_to_gt_file[input["file_name"]], "rb") as f: 92 | gt = np.array(Image.open(f), dtype=np.int) 93 | 94 | gt[gt == self._ignore_label] = self._num_classes 95 | 96 | self._conf_matrix += np.bincount( 97 | (self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1), 98 | minlength=self._conf_matrix.size, 99 | ).reshape(self._conf_matrix.shape) 100 | 101 | self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"])) 102 | 103 | def evaluate(self): 104 | """ 105 | Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval): 106 | 107 | * Mean intersection-over-union averaged across classes (mIoU) 108 | * Frequency Weighted IoU (fwIoU) 109 | * Mean pixel accuracy averaged across classes (mACC) 110 | * Pixel Accuracy (pACC) 111 | """ 112 | if self._distributed: 113 | synchronize() 114 | conf_matrix_list = all_gather(self._conf_matrix) 115 | self._predictions = all_gather(self._predictions) 116 | self._predictions = list(itertools.chain(*self._predictions)) 117 | if not is_main_process(): 118 | return 119 | 120 | self._conf_matrix = np.zeros_like(self._conf_matrix) 121 | for conf_matrix in conf_matrix_list: 122 | self._conf_matrix += conf_matrix 123 | 124 | if self._output_dir: 125 | PathManager.mkdirs(self._output_dir) 126 | file_path = os.path.join(self._output_dir, "sem_seg_predictions.json") 127 | with PathManager.open(file_path, "w") as f: 128 | f.write(json.dumps(self._predictions)) 129 | 130 | acc = np.full(self._num_classes, np.nan, dtype=np.float) 131 | iou = np.full(self._num_classes, np.nan, dtype=np.float) 132 | tp = self._conf_matrix.diagonal()[:-1].astype(np.float) 133 | pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(np.float) 134 | class_weights = pos_gt / np.sum(pos_gt) 135 | pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(np.float) 136 | acc_valid = pos_gt > 0 137 | acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid] 138 | iou_valid = (pos_gt + pos_pred) > 0 139 | union = pos_gt + pos_pred - tp 140 | iou[acc_valid] = tp[acc_valid] / union[acc_valid] 141 | macc = np.sum(acc[acc_valid]) / np.sum(acc_valid) 142 | miou = np.sum(iou[acc_valid]) / np.sum(iou_valid) 143 | fiou = np.sum(iou[acc_valid] * class_weights[acc_valid]) 144 | pacc = np.sum(tp) / np.sum(pos_gt) 145 | 146 | res = {} 147 | res["mIoU"] = 100 * miou 148 | res["fwIoU"] = 100 * fiou 149 | for i, name in enumerate(self._class_names): 150 | res["IoU-{}".format(name)] = 100 * iou[i] 151 | res["mACC"] = 100 * macc 152 | res["pACC"] = 100 * pacc 153 | for i, name in enumerate(self._class_names): 154 | res["ACC-{}".format(name)] = 100 * acc[i] 155 | 156 | if self._output_dir: 157 | file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth") 158 | with PathManager.open(file_path, "wb") as f: 159 | torch.save(res, f) 160 | results = OrderedDict({"sem_seg": res}) 161 | self._logger.info(results) 162 | return results 163 | 164 | def encode_json_sem_seg(self, sem_seg, input_file_name): 165 | """ 166 | Convert semantic segmentation to COCO stuff format with segments encoded as RLEs. 
167 | See http://cocodataset.org/#format-results 168 | """ 169 | json_list = [] 170 | for label in np.unique(sem_seg): 171 | if self._contiguous_id_to_dataset_id is not None: 172 | assert ( 173 | label in self._contiguous_id_to_dataset_id 174 | ), "Label {} is not in the metadata info for {}".format(label, self._dataset_name) 175 | dataset_id = self._contiguous_id_to_dataset_id[label] 176 | else: 177 | dataset_id = int(label) 178 | mask = (sem_seg == label).astype(np.uint8) 179 | mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0] 180 | mask_rle["counts"] = mask_rle["counts"].decode("utf-8") 181 | json_list.append( 182 | {"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle} 183 | ) 184 | return json_list 185 | -------------------------------------------------------------------------------- /sas_det/evaluation/testing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | import numpy as np 4 | import pprint 5 | import sys 6 | from collections.abc import Mapping 7 | 8 | 9 | def print_csv_format(results): 10 | """ 11 | Print main metrics in a format similar to Detectron, 12 | so that they are easy to copypaste into a spreadsheet. 13 | 14 | Args: 15 | results (OrderedDict[dict]): task_name -> {metric -> score} 16 | unordered dict can also be printed, but in arbitrary order 17 | """ 18 | assert isinstance(results, Mapping) or not len(results), results 19 | logger = logging.getLogger(__name__) 20 | for task, res in results.items(): 21 | if isinstance(res, Mapping): 22 | # Don't print "AP-category" metrics since they are usually not tracked. 23 | important_res = [(k, v) for k, v in res.items() if "-" not in k] 24 | logger.info("copypaste: Task: {}".format(task)) 25 | logger.info("copypaste: " + ",".join([k[0] for k in important_res])) 26 | logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res])) 27 | else: 28 | logger.info(f"copypaste: {task}={res}") 29 | 30 | 31 | def verify_results(cfg, results): 32 | """ 33 | Args: 34 | results (OrderedDict[dict]): task_name -> {metric -> score} 35 | 36 | Returns: 37 | bool: whether the verification succeeds or not 38 | """ 39 | expected_results = cfg.TEST.EXPECTED_RESULTS 40 | if not len(expected_results): 41 | return True 42 | 43 | ok = True 44 | for task, metric, expected, tolerance in expected_results: 45 | actual = results[task].get(metric, None) 46 | if actual is None: 47 | ok = False 48 | continue 49 | if not np.isfinite(actual): 50 | ok = False 51 | continue 52 | diff = abs(actual - expected) 53 | if diff > tolerance: 54 | ok = False 55 | 56 | logger = logging.getLogger(__name__) 57 | if not ok: 58 | logger.error("Result verification failed!") 59 | logger.error("Expected Results: " + str(expected_results)) 60 | logger.error("Actual Results: " + pprint.pformat(results)) 61 | 62 | sys.exit(1) 63 | else: 64 | logger.info("Results verification passed.") 65 | return ok 66 | 67 | 68 | def flatten_results_dict(results): 69 | """ 70 | Expand a hierarchical dict of scalars into a flat dict of scalars. 71 | If results[k1][k2][k3] = v, the returned dict will have the entry 72 | {"k1/k2/k3": v}. 
73 | 74 | Args: 75 | results (dict): 76 | """ 77 | r = {} 78 | for k, v in results.items(): 79 | if isinstance(v, Mapping): 80 | v = flatten_results_dict(v) 81 | for kk, vv in v.items(): 82 | r[k + "/" + kk] = vv 83 | else: 84 | r[k] = v 85 | return r 86 | -------------------------------------------------------------------------------- /sas_det/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) NEC Laboratories America, Inc. 2 | 3 | from .backbone import ( 4 | build_clip_language_encoder, 5 | get_clip_tokenzier, 6 | get_clip_image_transform, 7 | ) 8 | 9 | from .meta_arch import clip_rcnn as _ 10 | 11 | from .roi_heads import ( 12 | CLIPRes5ROIHeads, 13 | FastRCNNOutputLayers, 14 | ) 15 | -------------------------------------------------------------------------------- /sas_det/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) NEC Laboratories America, Inc. 2 | 3 | from .clip_backbone import ( 4 | build_clip_language_encoder, 5 | get_clip_tokenzier, 6 | get_clip_image_transform, 7 | ) -------------------------------------------------------------------------------- /sas_det/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) NEC Laboratories America, Inc. 2 | 3 | from .clip_roi_heads import ( 4 | CLIPRes5ROIHeads, 5 | # PretrainRes5ROIHeads, 6 | # CLIPStandardROIHeads, 7 | ) 8 | from .clip_roi_heads import FastRCNNOutputLayers 9 | 10 | __all__ = list(globals().keys()) 11 | -------------------------------------------------------------------------------- /test_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) NEC Laboratories America, Inc. 4 | """ 5 | A main training script. 6 | 7 | This script reads a given config file and runs the training or evaluation. 8 | It is an entry point that is made to train standard models in detectron2. 9 | 10 | In order to let one script support training of many models, 11 | this script contains logic that is specific to these built-in models and therefore 12 | may not be suitable for your own project. 13 | For example, your research project perhaps only needs a single "evaluator". 14 | 15 | Therefore, we recommend you use detectron2 as a library and take 16 | this file as an example of how to use the library. 17 | You may want to write your own script with your datasets and other customizations. 
18 | """ 19 | 20 | import logging 21 | import os 22 | from collections import OrderedDict 23 | import torch 24 | 25 | import detectron2.utils.comm as comm 26 | # from detectron2.checkpoint import DetectionCheckpointer 27 | from detectron2.config import get_cfg 28 | from detectron2.data import MetadataCatalog, build_detection_train_loader 29 | from detectron2.engine import default_argument_parser, default_setup, hooks, launch 30 | from detectron2.engine import DefaultTrainer # this may be modified by regionclip 31 | from detectron2.modeling import GeneralizedRCNNWithTTA 32 | 33 | from sas_det.evaluation import ( 34 | CityscapesInstanceEvaluator, 35 | CityscapesSemSegEvaluator, 36 | COCOEvaluator, 37 | COCOPanopticEvaluator, 38 | DatasetEvaluators, 39 | LVISEvaluator, 40 | PascalVOCDetectionEvaluator, 41 | SemSegEvaluator, 42 | verify_results, 43 | ) 44 | from sas_det.checkpoint import DetectionCheckpointer 45 | from sas_det import add_sas_det_config 46 | 47 | #os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 48 | import torch.multiprocessing 49 | torch.multiprocessing.set_sharing_strategy('file_system') 50 | 51 | class Trainer(DefaultTrainer): 52 | """ 53 | We use the "DefaultTrainer" which contains pre-defined default logic for 54 | standard training workflow. They may not work for you, especially if you 55 | are working on a new research project. In that case you can write your 56 | own training loop. You can use "tools/plain_train_net.py" as an example. 57 | """ 58 | 59 | @classmethod 60 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 61 | """ 62 | Create evaluator(s) for a given dataset. 63 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 64 | For your own dataset, you can simply create an evaluator manually in your 65 | script and do not have to worry about the hacky if-else logic here. 66 | """ 67 | if output_folder is None: 68 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 69 | evaluator_list = [] 70 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 71 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 72 | evaluator_list.append( 73 | SemSegEvaluator( 74 | dataset_name, 75 | distributed=True, 76 | output_dir=output_folder, 77 | ) 78 | ) 79 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 80 | evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder)) 81 | if evaluator_type == "coco_panoptic_seg": 82 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 83 | if evaluator_type == "cityscapes_instance": 84 | assert ( 85 | torch.cuda.device_count() >= comm.get_rank() 86 | ), "CityscapesEvaluator currently do not work with multiple machines." 87 | return CityscapesInstanceEvaluator(dataset_name) 88 | if evaluator_type == "cityscapes_sem_seg": 89 | assert ( 90 | torch.cuda.device_count() >= comm.get_rank() 91 | ), "CityscapesEvaluator currently do not work with multiple machines." 
92 | return CityscapesSemSegEvaluator(dataset_name) 93 | elif evaluator_type == "pascal_voc": 94 | return PascalVOCDetectionEvaluator(dataset_name) 95 | elif evaluator_type == "lvis": 96 | return LVISEvaluator(dataset_name, output_dir=output_folder) 97 | if len(evaluator_list) == 0: 98 | raise NotImplementedError( 99 | "no Evaluator for the dataset {} with the type {}".format( 100 | dataset_name, evaluator_type 101 | ) 102 | ) 103 | elif len(evaluator_list) == 1: 104 | return evaluator_list[0] 105 | return DatasetEvaluators(evaluator_list) 106 | 107 | @classmethod 108 | def test_with_TTA(cls, cfg, model): 109 | logger = logging.getLogger("detectron2.trainer") 110 | # In the end of training, run an evaluation with TTA 111 | # Only support some R-CNN models. 112 | logger.info("Running inference with test-time augmentation ...") 113 | model = GeneralizedRCNNWithTTA(cfg, model) 114 | evaluators = [ 115 | cls.build_evaluator( 116 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") 117 | ) 118 | for name in cfg.DATASETS.TEST 119 | ] 120 | res = cls.test(cfg, model, evaluators) 121 | res = OrderedDict({k + "_TTA": v for k, v in res.items()}) 122 | return res 123 | 124 | 125 | def periodic_update_teacher(trainer): 126 | update_steps = trainer.cfg.MODEL.OVD.PERIODIC_STEPS 127 | cur_iters = trainer.iter 128 | 129 | if cur_iters in update_steps: 130 | model = trainer.model 131 | if isinstance(model, torch.nn.parallel.DistributedDataParallel): 132 | # wrapped by DistributedDataParallel 133 | model.module.periodic_update_pairs() 134 | else: 135 | model.periodic_update_pairs() 136 | 137 | 138 | def setup(args): 139 | """ 140 | Create configs and perform basic setups. 141 | """ 142 | cfg = get_cfg() 143 | add_sas_det_config(cfg) # sas_det configs 144 | 145 | cfg.merge_from_file(args.config_file) 146 | cfg.merge_from_list(args.opts) 147 | cfg.freeze() 148 | default_setup(cfg, args) 149 | return cfg 150 | 151 | 152 | def main(args): 153 | cfg = setup(args) 154 | 155 | assert args.eval_only, "This release supports evaluation only." 
156 | if args.eval_only: 157 | model = Trainer.build_model(cfg) 158 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 159 | cfg.MODEL.WEIGHTS, resume=args.resume 160 | ) 161 | if cfg.MODEL.META_ARCHITECTURE in ['CLIPRCNN', 'CLIPFastRCNN', 'PretrainFastRCNN', 'WeakPretrainFastRCNN'] \ 162 | and cfg.MODEL.CLIP.BB_RPN_WEIGHTS is not None\ 163 | and cfg.MODEL.CLIP.CROP_REGION_TYPE == 'RPN': # load 2nd pretrained model 164 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR, bb_rpn_weights=True).resume_or_load( 165 | cfg.MODEL.CLIP.BB_RPN_WEIGHTS, resume=False 166 | ) 167 | res = Trainer.test(cfg, model) 168 | if cfg.TEST.AUG.ENABLED: 169 | res.update(Trainer.test_with_TTA(cfg, model)) 170 | if comm.is_main_process(): 171 | verify_results(cfg, res) 172 | return res 173 | 174 | 175 | if __name__ == "__main__": 176 | args = default_argument_parser().parse_args() 177 | print("Command Line Args:", args) 178 | launch( 179 | main, 180 | args.num_gpus, 181 | num_machines=args.num_machines, 182 | machine_rank=args.machine_rank, 183 | dist_url=args.dist_url, 184 | args=(args,), 185 | ) 186 | -------------------------------------------------------------------------------- /tools/offline_eval_onLVIS.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from lvis import LVIS 5 | from lvis import LVISEval, LVISResults 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(description='evaluate PLs quality offline') 9 | parser.add_argument('gt_json', type=str, help='gt coco json file') 10 | parser.add_argument('pl_json', type=str, help='PL coco json file') 11 | args = parser.parse_args() 12 | # print(args) 13 | 14 | ############################################# 15 | gt_LVISJson_file = args.gt_json 16 | pred_LvisJson_file = args.pl_json 17 | 18 | covert_to_result = True # True if .json in coco data format (not coco result format) 19 | 20 | ############################################# 21 | 22 | # load image list in gt_json 23 | lvis_gt = LVIS(gt_LVISJson_file) 24 | gt_img_ids = set(lvis_gt.get_img_ids()) 25 | 26 | if covert_to_result: 27 | PLData = json.load(open(pred_LvisJson_file, 'r')) 28 | PL_list = list() 29 | imageId_list = list() 30 | for anno in PLData['annotations']: 31 | cur_image_id = anno['image_id'] 32 | ## eval only on PLs 33 | if ("thing_isNovel" in anno.keys()) and anno['thing_isNovel'] and (cur_image_id in gt_img_ids): 34 | data = {'image_id': cur_image_id, 35 | 'category_id': anno['category_id'], 36 | 'bbox': anno['bbox'], 37 | 'score': anno['confidence']} 38 | PL_list.append(data) 39 | imageId_list.append(cur_image_id) 40 | # ## eval on all data (GT + PLs) 41 | # if cur_image_id in gt_img_ids: 42 | # data = {'image_id': cur_image_id, 43 | # 'category_id': anno['category_id'], 44 | # 'bbox': anno['bbox'], 45 | # 'score': anno['confidence']} 46 | # PL_list.append(data) 47 | # imageId_list.append(cur_image_id) 48 | 49 | print( 'Total PL boxes num: %d, avg num: %.2f\n' % (len(PL_list), len(PL_list)/len(set(imageId_list))) ) 50 | else: 51 | PL_list = json.load(open(pred_LvisJson_file, 'r')) 52 | 53 | # do evaluation 54 | lvis_results = LVISResults(lvis_gt, PL_list, max_dets=300) 55 | lvis_eval = LVISEval(lvis_gt, lvis_results, iou_type="bbox") 56 | lvis_eval.run() 57 | lvis_eval.print_results() 58 | -------------------------------------------------------------------------------- /tools/offline_eval_onO365.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from pycocotools.coco import COCO 5 | from pycocotools.cocoeval import COCOeval 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser(description='evaluate PLs quality offline') 10 | parser.add_argument('gt_json', type=str, help='gt coco json file') 11 | parser.add_argument('pl_json', type=str, help='PL coco json file') 12 | parser.add_argument('-r', '--raw', action='store_true') 13 | 14 | args = parser.parse_args() 15 | # print(args) 16 | 17 | ############################################# 18 | gt_COCOJson_file = args.gt_json 19 | pred_COCOJson_file = args.pl_json 20 | ############################################# 21 | 22 | # load image list in gt_json 23 | GtData = json.load(open(gt_COCOJson_file, 'r')) 24 | gt_img_ids = [x['id'] for x in GtData['images']] 25 | gt_img_ids = set(gt_img_ids) 26 | 27 | PLData = json.load(open(pred_COCOJson_file, 'r')) 28 | 29 | if args.raw: 30 | PL_list = PLData 31 | imageId_list = gt_img_ids 32 | else: 33 | PL_list = list() 34 | imageId_list = list() 35 | for anno in PLData['annotations']: 36 | cur_image_id = anno['image_id'] 37 | 38 | score = anno.get('confidence', None) 39 | if score is not None: 40 | # keep only annotations that carry a confidence, i.e. the PLs 41 | data = {'image_id': cur_image_id, 42 | 'category_id': anno['category_id'], 43 | 'bbox': anno['bbox'], 44 | 'score': score} 45 | PL_list.append(data) 46 | imageId_list.append(cur_image_id) 47 | 48 | # if args.raw: 49 | # # take all annos from PLs 50 | # data = {'image_id': cur_image_id, 51 | # 'category_id': anno['category_id'], 52 | # 'bbox': anno['bbox'], 53 | # 'score': anno['confidence']} 54 | # PL_list.append(data) 55 | # imageId_list.append(cur_image_id) 56 | # else: 57 | # if ("thing_isNovel" in anno.keys()) and anno['thing_isNovel'] and (cur_image_id in gt_img_ids): 58 | # data = {'image_id': cur_image_id, 59 | # 'category_id': anno['category_id'], 60 | # 'bbox': anno['bbox'], 61 | # 'score': anno['confidence']} 62 | # PL_list.append(data) 63 | # imageId_list.append(cur_image_id) 64 | 65 | print( 'Total PL boxes num: %d, avg num: %.2f\n' % (len(PL_list), len(PL_list)/len(set(imageId_list))) ) 66 | 67 | curSaveJson = './.temp.json' 68 | with open(curSaveJson, 'w') as outfile: 69 | json.dump(PL_list, outfile) 70 | 71 | cocoGt = COCO(gt_COCOJson_file) 72 | cocoDt = cocoGt.loadRes(curSaveJson) 73 | 74 | cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox') 75 | cocoEval.evaluate() 76 | cocoEval.accumulate() 77 | cocoEval.summarize() 78 | --------------------------------------------------------------------------------
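
Usage sketch for the two offline evaluation tools above (the dataset and prediction paths below are placeholders, not files shipped with this repo): both scripts take a ground-truth json and a pseudo-label json as positional arguments, and offline_eval_onO365.py additionally accepts --raw to indicate that the prediction json is already a flat COCO result list rather than a dataset-format json with an 'annotations' field.

    python tools/offline_eval_onLVIS.py datasets/lvis_v1_val.json output/pseudo_labels_lvis.json
    python tools/offline_eval_onO365.py datasets/o365_val.json output/pseudo_labels_o365_results.json --raw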