├── .gitignore
├── LICENSE
├── README.md
├── datasets
│   └── .gitkeep
├── pretrained_ckpt
│   └── .gitkeep
├── sas_det
│   ├── __init__.py
│   ├── checkpoint
│   │   ├── __init__.py
│   │   ├── c2_model_loading.py
│   │   ├── catalog.py
│   │   ├── clip_model_loading.py
│   │   └── detection_checkpoint.py
│   ├── config.py
│   ├── configs
│   │   ├── ovd_coco_R50_C4_ensemble.yaml
│   │   ├── ovd_coco_R50_C4_ensemble_PLs.yaml
│   │   ├── ovd_lvis_R50_C4_SAS_Det_3x.yaml
│   │   ├── ovd_lvis_R50_C4_ensemble_PLs.yaml
│   │   └── regionclip
│   │       ├── Base-RCNN-C4.yaml
│   │       ├── Base-RCNN-DilatedC5.yaml
│   │       ├── Base-RCNN-FPN.yaml
│   │       ├── Base-RetinaNet.yaml
│   │       ├── COCO-Detection
│   │       │   ├── fast_rcnn_R_50_FPN_1x.yaml
│   │       │   ├── faster_rcnn_R_101_C4_3x.yaml
│   │       │   ├── faster_rcnn_R_101_DC5_3x.yaml
│   │       │   ├── faster_rcnn_R_101_FPN_3x.yaml
│   │       │   ├── faster_rcnn_R_50_C4_1x.yaml
│   │       │   ├── faster_rcnn_R_50_C4_3x.yaml
│   │       │   ├── faster_rcnn_R_50_DC5_1x.yaml
│   │       │   ├── faster_rcnn_R_50_DC5_3x.yaml
│   │       │   ├── faster_rcnn_R_50_FPN_1x.yaml
│   │       │   ├── faster_rcnn_R_50_FPN_3x.yaml
│   │       │   ├── faster_rcnn_X_101_32x8d_FPN_3x.yaml
│   │       │   ├── retinanet_R_101_FPN_3x.yaml
│   │       │   ├── retinanet_R_50_FPN_1x.py
│   │       │   ├── retinanet_R_50_FPN_1x.yaml
│   │       │   ├── retinanet_R_50_FPN_3x.yaml
│   │       │   ├── rpn_R_50_C4_1x.yaml
│   │       │   └── rpn_R_50_FPN_1x.yaml
│   │       ├── COCO-InstanceSegmentation
│   │       │   ├── CLIP_fast_rcnn_R_50_C4_ovd.yaml
│   │       │   ├── CLIP_fast_rcnn_R_50_C4_ovd_coco80.yaml
│   │       │   ├── CLIP_fast_rcnn_R_50_C4_ovd_testb.yaml
│   │       │   ├── CLIP_fast_rcnn_R_50_C4_ovd_testt.yaml
│   │       │   ├── CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml
│   │       │   ├── CLIP_fast_rcnn_R_50_C4_ovd_zsinf_clipWeights.yaml
│   │       │   ├── customized
│   │       │   │   ├── CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml
│   │       │   │   ├── CLIP_fast_rcnn_R_50_C4_ovd_PLs_3x.yaml
│   │       │   │   ├── CLIP_fast_rcnn_R_50_C4_ovd_vldet.yaml
│   │       │   │   ├── ovd_coco_2x_PLs_per4k_clsBoxConf.yaml
│   │       │   │   ├── ovd_coco_fCLIP_PLs_clsBoxConf.yaml
│   │       │   │   ├── ovd_coco_fCLIP_offline_PLs.yaml
│   │       │   │   └── ovd_coco_frozen_CLIP_RPN.yaml
│   │       │   ├── mask_rcnn_CLIP_R_50_C4_1x.yaml
│   │       │   ├── mask_rcnn_CLIP_R_50_C4_1x_ovd_FSD.yaml
│   │       │   ├── mask_rcnn_R_101_C4_3x.yaml
│   │       │   ├── mask_rcnn_R_101_DC5_3x.yaml
│   │       │   ├── mask_rcnn_R_101_FPN_3x.yaml
│   │       │   ├── mask_rcnn_R_50_C4_1x.py
│   │       │   ├── mask_rcnn_R_50_C4_1x.yaml
│   │       │   ├── mask_rcnn_R_50_C4_1x_ovd_FSD.yaml
│   │       │   ├── mask_rcnn_R_50_C4_1x_ovd_coco65.yaml
│   │       │   ├── mask_rcnn_R_50_C4_3x.yaml
│   │       │   ├── mask_rcnn_R_50_DC5_1x.yaml
│   │       │   ├── mask_rcnn_R_50_DC5_3x.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_1x.py
│   │       │   ├── mask_rcnn_R_50_FPN_1x.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_1x_giou.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_1x_ovd_FSD.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_1x_ovd_coco65.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_3x.yaml
│   │       │   ├── mask_rcnn_X_101_32x8d_FPN_3x.yaml
│   │       │   ├── mask_rcnn_regnetx_4gf_dds_fpn_1x.py
│   │       │   └── mask_rcnn_regnety_4gf_dds_fpn_1x.py
│   │       ├── LVISv0.5-InstanceSegmentation
│   │       │   ├── mask_rcnn_R_101_FPN_1x.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_1x.yaml
│   │       │   └── mask_rcnn_X_101_32x8d_FPN_1x.yaml
│   │       ├── LVISv1-InstanceSegmentation
│   │       │   ├── CLIP_fast_rcnn_R_50_C4.yaml
│   │       │   ├── CLIP_fast_rcnn_R_50_C4_custom_img.yaml
│   │       │   ├── CLIP_fast_rcnn_R_50_C4_zsinf.yaml
│   │       │   ├── CLIP_fast_rcnn_R_50_C4_zsinf_clipWeights.yaml
│   │       │   ├── customized
│   │       │   │   ├── ovd_lvis_box_PLs_periodic_boxConf.yaml
│   │       │   │   ├── ovd_lvis_fCLIP_PLs_clsBoxConf.yaml
│   │       │   │   └── ovd_lvis_frozen_CLIP_RPN.yaml
│   │       │   ├── mask_rcnn_CLIP_R_50_C4_1x.yaml
│   │       │   ├── mask_rcnn_CLIP_R_50_FPN_1x.yaml
│   │       │   ├── mask_rcnn_R_101_FPN_1x.yaml
│   │       │   ├── mask_rcnn_R_50_C4_1x.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_1x.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_2x.yaml
│   │       │   └── mask_rcnn_X_101_32x8d_FPN_1x.yaml
│   │       ├── Misc
│   │       │   ├── cascade_mask_rcnn_R_50_FPN_1x.yaml
│   │       │   ├── cascade_mask_rcnn_R_50_FPN_3x.yaml
│   │       │   ├── cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_3x_gn.yaml
│   │       │   ├── mask_rcnn_R_50_FPN_3x_syncbn.yaml
│   │       │   ├── mmdet_mask_rcnn_R_50_FPN_1x.py
│   │       │   ├── panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml
│   │       │   ├── scratch_mask_rcnn_R_50_FPN_3x_gn.yaml
│   │       │   ├── scratch_mask_rcnn_R_50_FPN_9x_gn.yaml
│   │       │   ├── scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml
│   │       │   ├── semantic_R_50_FPN_1x.yaml
│   │       │   └── torchvision_imagenet_R_50.py
│   │       ├── common
│   │       │   ├── README.md
│   │       │   ├── coco_schedule.py
│   │       │   ├── data
│   │       │   │   ├── coco.py
│   │       │   │   ├── coco_keypoint.py
│   │       │   │   └── coco_panoptic_separated.py
│   │       │   ├── models
│   │       │   │   ├── cascade_rcnn.py
│   │       │   │   ├── keypoint_rcnn_fpn.py
│   │       │   │   ├── mask_rcnn_c4.py
│   │       │   │   ├── mask_rcnn_fpn.py
│   │       │   │   ├── panoptic_fpn.py
│   │       │   │   └── retinanet.py
│   │       │   ├── optim.py
│   │       │   └── train.py
│   │       └── pretrain
│   │           ├── RegionCLIP_RN50.yaml
│   │           ├── RegionCLIP_RN50_onlinePL.yaml
│   │           ├── RegionCLIP_RN50_onlinePL_box_weak.yaml
│   │           ├── RegionCLIP_RN50_onlinePL_box_weak_cc3m.yaml
│   │           ├── RegionCLIP_RN50_onlinePL_box_weak_locNarr.yaml
│   │           ├── RegionCLIP_RN50x4.yaml
│   │           └── RegionCLIP_RN50x4_onlinePL_boxWeak.yaml
│   ├── data
│   │   ├── __init__.py
│   │   ├── coco_zeroshot_categories.py
│   │   ├── lvis.py
│   │   ├── lvis_v0_5_categories.py
│   │   ├── lvis_v1_categories.py
│   │   └── ovd_register.py
│   ├── evaluation
│   │   ├── __init__.py
│   │   ├── cityscapes_evaluation.py
│   │   ├── coco_evaluation.py
│   │   ├── evaluator.py
│   │   ├── fast_eval_api.py
│   │   ├── lvis_evaluation.py
│   │   ├── panoptic_evaluation.py
│   │   ├── pascal_voc_evaluation.py
│   │   ├── rotated_coco_evaluation.py
│   │   ├── sem_seg_evaluation.py
│   │   └── testing.py
│   └── modeling
│       ├── __init__.py
│       ├── backbone
│       │   ├── __init__.py
│       │   ├── batch_norm.py
│       │   └── clip_backbone.py
│       ├── ensemble_fast_rcnn.py
│       ├── ensemble_roi_heads.py
│       ├── meta_arch
│       │   └── clip_rcnn.py
│       └── roi_heads
│           ├── __init__.py
│           ├── clip_fast_rcnn.py
│           ├── clip_roi_heads.py
│           └── soft_nms.py
├── test_net.py
└── tools
    ├── gen_cat_text_emb.py
    ├── offline_eval_onLVIS.py
    └── offline_eval_onO365.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # macOS stuff
2 | *.DS_Store
3 |
4 | # Python caches
5 | **/__pycache__
6 |
7 | # Ignore all output directories and experiment scripts in the individual projects
8 | /output/
9 | /datasets
10 | /pretrained_ckpt/
11 | /pretrained_ckpt
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (C) 2023 NEC Laboratories America, Inc. ("NECLA")
2 |
3 |
4 |
5 | This software and any and all related files/code/information is provided by
6 | NECLA for non-commercial evaluation or research purposes subject to terms in a License agreement the Recipient has agreed to by Recipient’s signature.
7 |
8 |
9 |
10 | The license restriction includes, among other limitations, the Recipient to only evaluate this software and redistribute information related to this software only in the form of technical publications/papers, with no rights to assign a license to third parties or redistribute the software to others.
11 |
12 |
13 |
14 |
15 | IN NO EVENT SHALL NEC BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
16 | SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
17 | USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF NEC HAS BEEN
18 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19 |
20 |
21 |
22 | NEC SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND NEC HAS NO OBLIGATION TO PROVIDE MAINTENANCE,
23 | SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
24 |
25 |
26 |
27 | THE LICENSE FROM NEC FOR THE SOFTWARE REQUIRES THAT LICENSEE
28 | COMPLY WITH ANY AND ALL UNDERLYING COPYRIGHTS AND LICENSE RIGHTS
29 | IN THE SOFTWARE BY THIRD PARTIES.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Taming Self-Training for Open-Vocabulary Object Detection
2 |
3 | Official implementation of online self-training and a split-and-fusion (SAF) head for Open-Vocabulary Object Detection (OVD), SAS-Det for short.
4 | This project was formerly named *Improving Pseudo Labels for Open-Vocabulary Object Detection*.
5 |
6 | [arXiv](https://arxiv.org/abs/2308.06412)
7 |
8 |
9 | ## Installation
10 | - Our project is developed on Detectron2. Please follow the official installation [instructions](https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md), or use the instructions below.
11 | ```bash
12 | # create new environment
13 | conda create -n sas_det python=3.8
14 | conda activate sas_det
15 |
16 | # install pytorch
17 | conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch
18 |
19 | # install Detectron2 from a local clone
20 | git clone https://github.com/facebookresearch/detectron2.git
21 | python -m pip install -e detectron2
22 | ```
23 |
24 | - Install CLIP
25 | ```bash
26 | # install CLIP
27 | pip install scipy
28 | pip install ftfy regex tqdm
29 | pip install git+https://github.com/openai/CLIP.git
30 | ```
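
An optional sanity check that the environment imports cleanly (a minimal sketch; it only uses the packages installed above, and the printed versions are examples rather than requirements):

```bash
# confirm PyTorch, Detectron2 and CLIP are importable in the sas_det environment
python -c "import torch, detectron2, clip; print(torch.__version__, detectron2.__version__, clip.available_models())"
```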
31 |
32 |
33 | ## Datasets
34 |
35 | - Please follow RegionCLIP's [dataset instructions](https://github.com/microsoft/RegionCLIP/blob/main/datasets/README.md) to prepare COCO and LVIS datasets.
36 |
37 | - Download the [metadata](https://drive.google.com/drive/u/1/folders/1R72q0Wg26-PQGqbaK3P3pT2vmGm9uzKU) for the datasets and put it in the folder `datasets` (i.e., `$DETECTRON2_DATASETS` used in the previous step); it is used in our evaluation and training. A sketch of the expected layout is shown below.
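
  The layout below is a minimal sketch, assuming the standard Detectron2 dataset structure produced by RegionCLIP's instructions; the metadata file names are the ones referenced by the evaluation commands further down. Adjust the path to your setup.

  ```bash
  # point Detectron2 at the dataset root used in the previous step
  export DETECTRON2_DATASETS=./datasets
  # expected contents: the COCO / LVIS folders from RegionCLIP's instructions plus the downloaded metadata, e.g.
  #   coco/  lvis/  coco_ovd_continue_cat_ids.json  lvis_ovd_continue_cat_ids.json
  ```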
38 |
39 |
40 | ## Download pretrained weights
41 | - Download [RegionCLIP's pretrained weights](https://drive.google.com/drive/folders/1hzrJBvcCrahoRcqJRqzkIGFO_HUSJIii). Check [here](https://github.com/microsoft/RegionCLIP/blob/main/docs/MODEL_ZOO.md#model-downloading) for more details.
42 | Create a new folder `pretrained_ckpt` and put those weights in it. In this repository, `regionclip`, `concept_emb` and `rpn` will be used.
43 |
44 | - Download [our pretrained weights](https://drive.google.com/drive/u/1/folders/1TAr7nZSvpB6nCZCC6nXBw6xgmMmlL0X9) and put them in the corresponding folders under `pretrained_ckpt`.
45 | Our pretrained weights include the following (a sketch of the resulting folder layout is given after this list):
46 | - `r50_3x_pre_RegCLIP_cocoRPN_2`: RPN weights pretrained only with COCO Base categories. This is used for experiments on COCO to avoid potential data leakage.
47 | - `concept_emb`: Complementary to RegionCLIP's `concept_emb`.
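
  After both downloads, `pretrained_ckpt` should roughly contain the following. This is a sketch assembled only from the paths used in the evaluation commands below; the RN50x4 text embeddings use the corresponding `*_rn50x4.pth` variants, and the contents of `regionclip/` follow RegionCLIP's model zoo.

  ```
  pretrained_ckpt/
  ├── concept_emb/
  │   ├── coco_48_base_cls_emb.pth
  │   ├── coco_65_cls_emb.pth
  │   ├── my_coco_48_base_17_cls_emb.pth
  │   ├── lvis_866_base_cls_emb.pth
  │   ├── my_lvis_866_base_337_cls_emb.pth
  │   └── lvis_1203_cls_emb.pth
  ├── regionclip/
  ├── rpn/
  │   ├── rpn_coco_48.pth
  │   └── rpn_lvis_866_lsj.pth
  └── sas_det/
      ├── sas_det_coco_no_saf_head_baseline.pth
      ├── sas_det_coco.pth
      ├── sas_det_lvis_r50.pth
      └── sas_det_lvis_r50x4.pth
  ```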
48 |
49 | ## Evaluation with released weights
50 |
51 | ### Results on COCO-OVD
52 |
53 |
54 |
55 | | Configs       | Novel AP | Base AP | Overall AP |
56 | |---------------|----------|---------|------------|
57 | | w/o SAF head  | 31.4     | 55.7    | 49.4       |
58 | | with SAF head | 37.4     | 58.5    | 53.0       |
71 |
72 |
73 |
74 |
75 |
76 | Evaluation without the SAF Head (baseline in the paper):
77 |
78 |
79 | ```bash
80 | python3 ./test_net.py \
81 | --num-gpus 8 \
82 | --eval-only \
83 | --config-file ./sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml \
84 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_coco_no_saf_head_baseline.pth \
85 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \
86 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \
87 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \
88 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \
89 | MODEL.ROI_HEADS.SOFT_NMS_ENABLED True \
90 | OUTPUT_DIR output/eval
91 | ```
92 |
93 |
94 |
95 |
96 | Evaluation with the SAF Head:
97 |
98 |
99 | ```bash
100 | python3 ./test_net.py \
101 | --num-gpus 8 \
102 | --eval-only \
103 | --config-file ./sas_det/configs/ovd_coco_R50_C4_ensemble_PLs.yaml \
104 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_coco.pth \
105 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \
106 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \
107 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb.pth \
108 | MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/my_coco_48_base_17_cls_emb.pth \
109 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \
110 | MODEL.ROI_HEADS.SOFT_NMS_ENABLED True \
111 | MODEL.ENSEMBLE.TEST_CATEGORY_INFO "./datasets/coco_ovd_continue_cat_ids.json" \
112 | MODEL.ENSEMBLE.ALPHA 0.3 MODEL.ENSEMBLE.BETA 0.7 \
113 | OUTPUT_DIR output/eval
114 | ```
115 |
116 |
117 |
118 | ### Results on LVIS-OVD
119 |
120 |
121 |
122 | | Configs               | APr  | APc  | APf  | AP   |
123 | |-----------------------|------|------|------|------|
124 | | RN50-C4 as backbone   | 20.1 | 27.1 | 32.9 | 28.1 |
125 | | RN50x4-C4 as backbone | 29.0 | 32.3 | 36.8 | 33.5 |
141 |
142 |
143 |
144 |
145 |
146 | Evaluation with RN50-C4 as the backbone:
147 |
148 |
149 | ```bash
150 | python3 ./test_net.py \
151 | --num-gpus 8 \
152 | --eval-only \
153 | --config-file ./sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml \
154 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_lvis_r50.pth \
155 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
156 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \
157 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_866_base_cls_emb.pth \
158 | MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/my_lvis_866_base_337_cls_emb.pth \
159 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \
160 | MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \
161 | MODEL.ENSEMBLE.TEST_CATEGORY_INFO "./datasets/lvis_ovd_continue_cat_ids.json" \
162 | MODEL.ENSEMBLE.ALPHA 0.33 MODEL.ENSEMBLE.BETA 0.67 \
163 | OUTPUT_DIR output/eval
164 | ```
165 |
166 |
167 |
168 |
169 | Evaluation with RN50x4-C4 as the backbone:
170 |
171 |
172 | ```bash
173 | python3 ./test_net.py \
174 | --num-gpus 8 \
175 | --eval-only \
176 | --config-file ./sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml \
177 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_lvis_r50x4.pth \
178 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
179 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \
180 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_866_base_cls_emb_rn50x4.pth \
181 | MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/my_lvis_866_base_337_cls_emb_rn50x4.pth \
182 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \
183 | MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \
184 | MODEL.CLIP.TEXT_EMB_DIM 640 \
185 | MODEL.RESNETS.DEPTH 200 \
186 | MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
187 | MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION 18 \
188 | MODEL.ENSEMBLE.TEST_CATEGORY_INFO "./datasets/lvis_ovd_continue_cat_ids.json" \
189 | MODEL.ENSEMBLE.ALPHA 0.33 MODEL.ENSEMBLE.BETA 0.67 \
190 | OUTPUT_DIR output/eval
191 | ```
192 |
193 |
194 |
195 |
196 | ## Acknowledgement
197 |
198 | This repository was built on top of [Detectron2](https://github.com/facebookresearch/detectron2), [RegionCLIP](https://github.com/microsoft/RegionCLIP), and [VLDet](https://github.com/clin1223/VLDet). We thank the community for their efforts.
199 |
--------------------------------------------------------------------------------
/datasets/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaofeng94/SAS-Det/fd6ea7dc2ba07d3a836b5e65bdd0fd57bd60cb9c/datasets/.gitkeep
--------------------------------------------------------------------------------
/pretrained_ckpt/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaofeng94/SAS-Det/fd6ea7dc2ba07d3a836b5e65bdd0fd57bd60cb9c/pretrained_ckpt/.gitkeep
--------------------------------------------------------------------------------
/sas_det/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) NEC Laboratories America, Inc.
2 | from .modeling import ensemble_roi_heads as _
3 | from .config import add_sas_det_config
4 | from .data import *
5 |
6 |
--------------------------------------------------------------------------------
/sas_det/checkpoint/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # File:
4 |
5 |
6 | from . import catalog as _UNUSED # register the handler
7 | from .detection_checkpoint import DetectionCheckpointer
8 | from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer
9 |
10 | __all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"]
11 |
--------------------------------------------------------------------------------
/sas_det/checkpoint/catalog.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging
3 |
4 | from detectron2.utils.file_io import PathHandler, PathManager
5 |
6 |
7 | class ModelCatalog(object):
8 | """
9 | Store mappings from names to third-party models.
10 | """
11 |
12 | S3_C2_DETECTRON_PREFIX = "https://dl.fbaipublicfiles.com/detectron"
13 |
14 | # MSRA models have STRIDE_IN_1X1=True. False otherwise.
15 | # NOTE: all BN models here have fused BN into an affine layer.
16 | # As a result, you should only load them to a model with "FrozenBN".
17 | # Loading them to a model with regular BN or SyncBN is wrong.
18 | # Even when loaded to FrozenBN, it is still different from affine by an epsilon,
19 | # which should be negligible for training.
20 | # NOTE: all models here use PIXEL_STD=[1,1,1]
21 | # NOTE: Most of the BN models here are no longer used. We use the
22 | # re-converted pre-trained models under detectron2 model zoo instead.
23 | C2_IMAGENET_MODELS = {
24 | "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
25 | "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
26 | "FAIR/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
27 | "FAIR/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
28 | "FAIR/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
29 | "FAIR/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
30 | "FAIR/X-152-32x8d-IN5k": "ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl",
31 | }
32 |
33 | C2_DETECTRON_PATH_FORMAT = (
34 | "{prefix}/{url}/output/train/{dataset}/{type}/model_final.pkl" # noqa B950
35 | )
36 |
37 | C2_DATASET_COCO = "coco_2014_train%3Acoco_2014_valminusminival"
38 | C2_DATASET_COCO_KEYPOINTS = "keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival"
39 |
40 | # format: {model_name} -> part of the url
41 | C2_DETECTRON_MODELS = {
42 | "35857197/e2e_faster_rcnn_R-50-C4_1x": "35857197/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml.01_33_49.iAX0mXvW", # noqa B950
43 | "35857345/e2e_faster_rcnn_R-50-FPN_1x": "35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I", # noqa B950
44 | "35857890/e2e_faster_rcnn_R-101-FPN_1x": "35857890/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml.01_38_50.sNxI7sX7", # noqa B950
45 | "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "36761737/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml.06_31_39.5MIHi1fZ", # noqa B950
46 | "35858791/e2e_mask_rcnn_R-50-C4_1x": "35858791/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml.01_45_57.ZgkA7hPB", # noqa B950
47 | "35858933/e2e_mask_rcnn_R-50-FPN_1x": "35858933/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml.01_48_14.DzEQe4wC", # noqa B950
48 | "35861795/e2e_mask_rcnn_R-101-FPN_1x": "35861795/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT", # noqa B950
49 | "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "36761843/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml.06_35_59.RZotkLKI", # noqa B950
50 | "48616381/e2e_mask_rcnn_R-50-FPN_2x_gn": "GN/48616381/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn_0416.13_23_38.bTlTI97Q", # noqa B950
51 | "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "37697547/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml.08_42_54.kdzV35ao", # noqa B950
52 | "35998355/rpn_R-50-C4_1x": "35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L", # noqa B950
53 | "35998814/rpn_R-50-FPN_1x": "35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179", # noqa B950
54 | "36225147/fast_R-50-FPN_1x": "36225147/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml.08_39_09.L3obSdQ2", # noqa B950
55 | }
56 |
57 | @staticmethod
58 | def get(name):
59 | if name.startswith("Caffe2Detectron/COCO"):
60 | return ModelCatalog._get_c2_detectron_baseline(name)
61 | if name.startswith("ImageNetPretrained/"):
62 | return ModelCatalog._get_c2_imagenet_pretrained(name)
63 | raise RuntimeError("model not present in the catalog: {}".format(name))
64 |
65 | @staticmethod
66 | def _get_c2_imagenet_pretrained(name):
67 | prefix = ModelCatalog.S3_C2_DETECTRON_PREFIX
68 | name = name[len("ImageNetPretrained/") :]
69 | name = ModelCatalog.C2_IMAGENET_MODELS[name]
70 | url = "/".join([prefix, name])
71 | return url
72 |
73 | @staticmethod
74 | def _get_c2_detectron_baseline(name):
75 | name = name[len("Caffe2Detectron/COCO/") :]
76 | url = ModelCatalog.C2_DETECTRON_MODELS[name]
77 | if "keypoint_rcnn" in name:
78 | dataset = ModelCatalog.C2_DATASET_COCO_KEYPOINTS
79 | else:
80 | dataset = ModelCatalog.C2_DATASET_COCO
81 |
82 | if "35998355/rpn_R-50-C4_1x" in name:
83 | # this one model is somehow different from others ..
84 | type = "rpn"
85 | else:
86 | type = "generalized_rcnn"
87 |
88 | # Detectron C2 models are stored in the structure defined in `C2_DETECTRON_PATH_FORMAT`.
89 | url = ModelCatalog.C2_DETECTRON_PATH_FORMAT.format(
90 | prefix=ModelCatalog.S3_C2_DETECTRON_PREFIX, url=url, type=type, dataset=dataset
91 | )
92 | return url
93 |
94 |
95 | class ModelCatalogHandler(PathHandler):
96 | """
97 | Resolve URL like catalog://.
98 | """
99 |
100 | PREFIX = "catalog://"
101 |
102 | def _get_supported_prefixes(self):
103 | return [self.PREFIX]
104 |
105 | def _get_local_path(self, path, **kwargs):
106 | logger = logging.getLogger(__name__)
107 | catalog_path = ModelCatalog.get(path[len(self.PREFIX) :])
108 | logger.info("Catalog entry {} points to {}".format(path, catalog_path))
109 | return PathManager.get_local_path(catalog_path, **kwargs)
110 |
111 | def _open(self, path, mode="r", **kwargs):
112 | return PathManager.open(self._get_local_path(path), mode, **kwargs)
113 |
114 |
115 | PathManager.register_handler(ModelCatalogHandler())
116 |
--------------------------------------------------------------------------------
/sas_det/checkpoint/detection_checkpoint.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging
3 | import os
4 | import pickle
5 | import torch
6 | from fvcore.common.checkpoint import Checkpointer
7 | from torch.nn.parallel import DistributedDataParallel
8 |
9 | import detectron2.utils.comm as comm
10 | from detectron2.utils.env import TORCH_VERSION
11 | from detectron2.utils.file_io import PathManager
12 |
13 | from .c2_model_loading import align_and_update_state_dicts
14 | from .clip_model_loading import align_and_update_state_dicts_for_CLIP
15 |
16 | class DetectionCheckpointer(Checkpointer):
17 | """
18 | Same as :class:`Checkpointer`, but is able to:
19 | 1. handle models in detectron & detectron2 model zoo, and apply conversions for legacy models.
20 | 2. correctly load checkpoints that are only available on the master worker
21 | """
22 |
23 | def __init__(self, model, save_dir="", *, save_to_disk=None, bb_rpn_weights=False, **checkpointables):
24 | is_main_process = comm.is_main_process()
25 | super().__init__(
26 | model,
27 | save_dir,
28 | save_to_disk=is_main_process if save_to_disk is None else save_to_disk,
29 | **checkpointables,
30 | )
31 | self.path_manager = PathManager
32 | self.bb_rpn_weights = bb_rpn_weights
33 |
34 | def load(self, path, *args, **kwargs):
35 | need_sync = False
36 |
37 | if path and isinstance(self.model, DistributedDataParallel):
38 | logger = logging.getLogger(__name__)
39 | path = self.path_manager.get_local_path(path)
40 | has_file = os.path.isfile(path)
41 | all_has_file = comm.all_gather(has_file)
42 | if not all_has_file[0]:
43 | raise OSError(f"File {path} not found on main worker.")
44 | if not all(all_has_file):
45 | logger.warning(
46 | f"Not all workers can read checkpoint {path}. "
47 | "Training may fail to fully resume."
48 | )
49 | # TODO: broadcast the checkpoint file contents from main
50 | # worker, and load from it instead.
51 | need_sync = True
52 | if not has_file:
53 | path = None # don't load if not readable
54 | ret = super().load(path, *args, **kwargs)
55 |
56 | if need_sync:
57 | logger.info("Broadcasting model states from main worker ...")
58 | if TORCH_VERSION >= (1, 7):
59 | self.model._sync_params_and_buffers()
60 | return ret
61 |
62 | def _load_file(self, filename):
63 | if filename.endswith(".pkl"):
64 | with PathManager.open(filename, "rb") as f:
65 | data = pickle.load(f, encoding="latin1")
66 | if "model" in data and "__author__" in data:
67 | # file is in Detectron2 model zoo format
68 | self.logger.info("Reading a file from '{}'".format(data["__author__"]))
69 | return data
70 | else:
71 | # assume file is from Caffe2 / Detectron1 model zoo
72 | if "blobs" in data:
73 | # Detection models have "blobs", but ImageNet models don't
74 | data = data["blobs"]
75 | data = {k: v for k, v in data.items() if not k.endswith("_momentum")}
76 | return {"model": data, "__author__": "Caffe2", "matching_heuristics": True}
77 | elif filename.endswith(".pyth"):
78 | # assume file is from pycls; no one else seems to use the ".pyth" extension
79 | with PathManager.open(filename, "rb") as f:
80 | data = torch.load(f)
81 | assert (
82 | "model_state" in data
83 | ), f"Cannot load .pyth file {filename}; pycls checkpoints must contain 'model_state'."
84 | model_state = {
85 | k: v
86 | for k, v in data["model_state"].items()
87 | if not k.endswith("num_batches_tracked")
88 | }
89 | return {"model": model_state, "__author__": "pycls", "matching_heuristics": True}
90 | elif "OAI_CLIP" in filename:
91 | # assume file is from OpenAI CLIP pre-trained model
92 | loaded = super()._load_file(filename) # load native pth checkpoint
93 | if "model" not in loaded:
94 | loaded = {"model": loaded}
95 | return {"model": loaded["model"], "__author__": "OAI_CLIP", "matching_heuristics": True}
96 |
97 | loaded = super()._load_file(filename) # load native pth checkpoint
98 | if "model" not in loaded:
99 | loaded = {"model": loaded}
100 | return loaded
101 |
102 | def _load_model(self, checkpoint):
103 | if checkpoint.get("matching_heuristics", False) or self.bb_rpn_weights:
104 | self._convert_ndarray_to_tensor(checkpoint["model"])
105 | # convert weights by name-matching heuristics
106 | if checkpoint.get("__author__", "NA") == "OAI_CLIP" or self.bb_rpn_weights: # for OAI_CLIP or 2nd ckpt (offline modules)
107 | checkpoint["model"] = align_and_update_state_dicts_for_CLIP(
108 | self.model.state_dict(),
109 | checkpoint["model"],
110 | bb_rpn_weights=self.bb_rpn_weights,
111 | )
112 | else: # default loading
113 | checkpoint["model"] = align_and_update_state_dicts(
114 | self.model.state_dict(),
115 | checkpoint["model"],
116 | c2_conversion=checkpoint.get("__author__", None) == "Caffe2",
117 | )
118 | # for non-caffe2 models, use standard ways to load it
119 | incompatible = super()._load_model(checkpoint)
120 | del checkpoint # try saving memory
121 |
122 | model_buffers = dict(self.model.named_buffers(recurse=False))
123 | for k in ["pixel_mean", "pixel_std"]:
124 | # Ignore missing key message about pixel_mean/std.
125 | # Though they may be missing in old checkpoints, they will be correctly
126 | # initialized from config anyway.
127 | if k in model_buffers:
128 | try:
129 | incompatible.missing_keys.remove(k)
130 | except ValueError:
131 | pass
132 | return incompatible
--------------------------------------------------------------------------------
/sas_det/configs/ovd_coco_R50_C4_ensemble.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./regionclip/Base-RCNN-C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "CLIPFastRCNN"
4 | BACKBONE:
5 | NAME: "build_clip_resnet_backbone"
6 | FREEZE_AT: 2
7 | WEIGHTS: ""
8 | MASK_ON: False
9 | RESNETS:
10 | DEPTH: 50
11 | OUT_FEATURES: ["res4"]
12 | NORM: FrozenBN
13 | STEM_OUT_CHANNELS: 64
14 | RES2_OUT_CHANNELS: 256
15 | # RPN: # not used
16 | # HEAD_NAME: StandardRPNHead
17 | # IN_FEATURES: ["res4"]
18 | ROI_HEADS:
19 | NAME: "EnsembleCLIPRes5ROIHeads" #
20 | IN_FEATURES: ["res4"]
21 | NUM_CLASSES: 48 # base categories only
22 | SCORE_THRESH_TEST: 0.001
23 | ROI_BOX_HEAD:
24 | NAME: "FastRCNNConvFCHead" # for text head
25 | NUM_FC: 2
26 | POOLER_RESOLUTION: 14
27 | CLS_AGNOSTIC_BBOX_REG: True
28 | ROI_MASK_HEAD:
29 | NAME: "MaskRCNNConvUpsampleHead"
30 | NUM_CONV: 0
31 | POOLER_RESOLUTION: 14
32 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
33 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
34 | CLIP:
35 | CROP_REGION_TYPE: "RPN"
36 | USE_TEXT_EMB_CLASSIFIER: True
37 | CLSS_TEMP: 0.01
38 | NO_BOX_DELTA: False
39 | BG_CLS_LOSS_WEIGHT: 0.2
40 | FOCAL_SCALED_LOSS: 0.5
41 | INPUT:
42 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
43 | DATASETS:
44 | TRAIN: ("coco_2017_ovd_b_train",)
45 | TEST: ("coco_2017_ovd_all_test",)
46 | TEST:
47 | EVAL_PERIOD: 5000
48 | SOLVER:
49 | IMS_PER_BATCH: 16
50 | BASE_LR: 0.002
51 | STEPS: (60000, 80000)
52 | MAX_ITER: 90000
53 | WARMUP_ITERS: 5000
54 | CHECKPOINT_PERIOD: 10000
55 | INPUT:
56 | MIN_SIZE_TRAIN_SAMPLING: choice
57 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
58 | MAX_SIZE_TRAIN: 1333
59 | MIN_SIZE_TEST: 800
60 | MAX_SIZE_TEST: 1333
61 | FORMAT: "RGB"
--------------------------------------------------------------------------------
/sas_det/configs/ovd_coco_R50_C4_ensemble_PLs.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./ovd_coco_R50_C4_ensemble.yaml"
2 | MODEL:
3 | ROI_BOX_HEAD:
4 | NAME: "CLIP_BOX_HEAD" # close-branch head
5 | OVD:
6 | WITH_PSEUDO_LABELS: True
7 | #
8 | USE_ADAPTIVE_THRES: True
9 | PL_THRESHOLD: 0.85
10 | PL_NMS_THRES: 0.5
11 | RPN_FUSION_METHOD: "avg_norm_scores"
12 | CATEGORY_INFO: None
13 | # periodic update
14 | USE_PERIODIC_UPDATE: True
15 | # box reg, cls loss
16 | BOX_CONFIDENCE_THRES: 1.0
17 | USE_CONFIDENCE_WEIGHT: True
--------------------------------------------------------------------------------
/sas_det/configs/ovd_lvis_R50_C4_SAS_Det_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./ovd_lvis_R50_C4_ensemble_PLs.yaml"
2 | DATASETS:
3 | TRAIN: ('lvis_v1_train_SASDet_r50x4_PLs', 'lvis_v1_o365_SASDet_r50x4_PLs',)
4 | SOLVER:
5 | CHECKPOINT_PERIOD: 20000
6 | STEPS: (210000, 250000)
7 | MAX_ITER: 270000
8 | TEST:
9 | EVAL_PERIOD: 20000
--------------------------------------------------------------------------------
/sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4.yaml"
2 | MODEL:
3 | ROI_HEADS:
4 | NAME: "EnsembleCLIPRes5ROIHeads" #
5 | ROI_BOX_HEAD:
6 | NAME: "CLIP_BOX_HEAD" # close-branch head
7 | # NUM_FC: 2
8 | POOLER_RESOLUTION: 14
9 | CLS_AGNOSTIC_BBOX_REG: True
10 | OVD:
11 | WITH_PSEUDO_LABELS: True
12 | #
13 | USE_ADAPTIVE_THRES: True
14 | PL_NMS_THRES: 0.5
15 | PL_THRESHOLD: 0.925
16 | MIN_AVG_PLS: 2.0
17 | MAX_AVG_PLS: 4.0
18 | ADAPTIVE_THRES_DELTA: 0.005
19 | RPN_FUSION_METHOD: "avg_logits"
20 | CATEGORY_INFO: None # if None, assume novel cat ids >= len(base_categories)
21 | # periodic update
22 | USE_PERIODIC_UPDATE: True
23 | PERIODIC_STEPS: (120000, 160000)
24 | # box reg, cls loss
25 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes
26 | USE_CONFIDENCE_WEIGHT: False # False for LVIS
27 | ENSEMBLE:
28 | ALPHA: 0.33
29 | BETA: 0.67
30 | # TEST_CATEGORY_INFO: "datasets/lvis_ovd_continue_cat_ids.json"
31 | SOLVER:
32 | CHECKPOINT_PERIOD: 20000
33 | TEST:
34 | EVAL_PERIOD: 20000
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Base-RCNN-C4.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | META_ARCHITECTURE: "GeneralizedRCNN"
3 | RPN:
4 | PRE_NMS_TOPK_TEST: 6000
5 | POST_NMS_TOPK_TEST: 1000
6 | ROI_HEADS:
7 | NAME: "Res5ROIHeads"
8 | DATASETS:
9 | TRAIN: ("coco_2017_train",)
10 | TEST: ("coco_2017_val",)
11 | SOLVER:
12 | IMS_PER_BATCH: 16
13 | BASE_LR: 0.02
14 | STEPS: (60000, 80000)
15 | MAX_ITER: 90000
16 | INPUT:
17 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
18 | VERSION: 2
19 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Base-RCNN-DilatedC5.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | META_ARCHITECTURE: "GeneralizedRCNN"
3 | RESNETS:
4 | OUT_FEATURES: ["res5"]
5 | RES5_DILATION: 2
6 | RPN:
7 | IN_FEATURES: ["res5"]
8 | PRE_NMS_TOPK_TEST: 6000
9 | POST_NMS_TOPK_TEST: 1000
10 | ROI_HEADS:
11 | NAME: "StandardROIHeads"
12 | IN_FEATURES: ["res5"]
13 | ROI_BOX_HEAD:
14 | NAME: "FastRCNNConvFCHead"
15 | NUM_FC: 2
16 | POOLER_RESOLUTION: 7
17 | ROI_MASK_HEAD:
18 | NAME: "MaskRCNNConvUpsampleHead"
19 | NUM_CONV: 4
20 | POOLER_RESOLUTION: 14
21 | DATASETS:
22 | TRAIN: ("coco_2017_train",)
23 | TEST: ("coco_2017_val",)
24 | SOLVER:
25 | IMS_PER_BATCH: 16
26 | BASE_LR: 0.02
27 | STEPS: (60000, 80000)
28 | MAX_ITER: 90000
29 | INPUT:
30 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
31 | VERSION: 2
32 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Base-RCNN-FPN.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | META_ARCHITECTURE: "GeneralizedRCNN"
3 | BACKBONE:
4 | NAME: "build_resnet_fpn_backbone"
5 | RESNETS:
6 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
7 | FPN:
8 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
9 | ANCHOR_GENERATOR:
10 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
11 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
12 | RPN:
13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
14 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
15 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level
16 | # Detectron1 uses 2000 proposals per-batch,
17 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
18 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
19 | POST_NMS_TOPK_TRAIN: 1000
20 | POST_NMS_TOPK_TEST: 1000
21 | ROI_HEADS:
22 | NAME: "StandardROIHeads"
23 | IN_FEATURES: ["p2", "p3", "p4", "p5"]
24 | ROI_BOX_HEAD:
25 | NAME: "FastRCNNConvFCHead"
26 | NUM_FC: 2
27 | POOLER_RESOLUTION: 7
28 | ROI_MASK_HEAD:
29 | NAME: "MaskRCNNConvUpsampleHead"
30 | NUM_CONV: 4
31 | POOLER_RESOLUTION: 14
32 | DATASETS:
33 | TRAIN: ("coco_2017_train",)
34 | TEST: ("coco_2017_val",)
35 | SOLVER:
36 | IMS_PER_BATCH: 16
37 | BASE_LR: 0.02
38 | STEPS: (60000, 80000)
39 | MAX_ITER: 90000
40 | INPUT:
41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
42 | VERSION: 2
43 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Base-RetinaNet.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | META_ARCHITECTURE: "RetinaNet"
3 | BACKBONE:
4 | NAME: "build_retinanet_resnet_fpn_backbone"
5 | RESNETS:
6 | OUT_FEATURES: ["res3", "res4", "res5"]
7 | ANCHOR_GENERATOR:
8 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"]
9 | FPN:
10 | IN_FEATURES: ["res3", "res4", "res5"]
11 | RETINANET:
12 | IOU_THRESHOLDS: [0.4, 0.5]
13 | IOU_LABELS: [0, -1, 1]
14 | SMOOTH_L1_LOSS_BETA: 0.0
15 | DATASETS:
16 | TRAIN: ("coco_2017_train",)
17 | TEST: ("coco_2017_val",)
18 | SOLVER:
19 | IMS_PER_BATCH: 16
20 | BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate
21 | STEPS: (60000, 80000)
22 | MAX_ITER: 90000
23 | INPUT:
24 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
25 | VERSION: 2
26 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: False
5 | LOAD_PROPOSALS: True
6 | RESNETS:
7 | DEPTH: 50
8 | PROPOSAL_GENERATOR:
9 | NAME: "PrecomputedProposals"
10 | DATASETS:
11 | TRAIN: ("coco_2017_train",)
12 | PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", )
13 | TEST: ("coco_2017_val",)
14 | PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
15 | DATALOADER:
16 | # proposals are part of the dataset_dicts, and take a lot of RAM
17 | NUM_WORKERS: 2
18 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
4 | MASK_ON: False
5 | RESNETS:
6 | DEPTH: 101
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-DilatedC5.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
4 | MASK_ON: False
5 | RESNETS:
6 | DEPTH: 101
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
4 | MASK_ON: False
5 | RESNETS:
6 | DEPTH: 101
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: False
5 | RESNETS:
6 | DEPTH: 50
7 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: False
5 | RESNETS:
6 | DEPTH: 50
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-DilatedC5.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: False
5 | RESNETS:
6 | DEPTH: 50
7 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-DilatedC5.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: False
5 | RESNETS:
6 | DEPTH: 50
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: False
5 | RESNETS:
6 | DEPTH: 50
7 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: False
5 | RESNETS:
6 | DEPTH: 50
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | MASK_ON: False
4 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
5 | PIXEL_STD: [57.375, 57.120, 58.395]
6 | RESNETS:
7 | STRIDE_IN_1X1: False # this is a C2 model
8 | NUM_GROUPS: 32
9 | WIDTH_PER_GROUP: 8
10 | DEPTH: 101
11 | SOLVER:
12 | STEPS: (210000, 250000)
13 | MAX_ITER: 270000
14 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/retinanet_R_101_FPN_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RetinaNet.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
4 | RESNETS:
5 | DEPTH: 101
6 | SOLVER:
7 | STEPS: (210000, 250000)
8 | MAX_ITER: 270000
9 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/retinanet_R_50_FPN_1x.py:
--------------------------------------------------------------------------------
1 | from ..common.optim import SGD as optimizer
2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
3 | from ..common.data.coco import dataloader
4 | from ..common.models.retinanet import model
5 | from ..common.train import train
6 |
7 | dataloader.train.mapper.use_instance_mask = False
8 | model.backbone.bottom_up.freeze_at = 2
9 | optimizer.lr = 0.01
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/retinanet_R_50_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RetinaNet.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | RESNETS:
5 | DEPTH: 50
6 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/retinanet_R_50_FPN_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RetinaNet.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | RESNETS:
5 | DEPTH: 50
6 | SOLVER:
7 | STEPS: (210000, 250000)
8 | MAX_ITER: 270000
9 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/rpn_R_50_C4_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "ProposalNetwork"
4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
5 | MASK_ON: False
6 | RESNETS:
7 | DEPTH: 50
8 | RPN:
9 | PRE_NMS_TOPK_TEST: 12000
10 | POST_NMS_TOPK_TEST: 2000
11 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-Detection/rpn_R_50_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "ProposalNetwork"
4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
5 | MASK_ON: False
6 | RESNETS:
7 | DEPTH: 50
8 | RPN:
9 | POST_NMS_TOPK_TEST: 2000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "CLIPFastRCNN"
4 | BACKBONE:
5 | NAME: "build_clip_resnet_backbone"
6 | FREEZE_AT: 2
7 | WEIGHTS: ""
8 | MASK_ON: False
9 | RESNETS:
10 | DEPTH: 50
11 | OUT_FEATURES: ["res4"]
12 | NORM: FrozenBN
13 | STEM_OUT_CHANNELS: 64
14 | RES2_OUT_CHANNELS: 256
15 | RPN:
16 | HEAD_NAME: StandardRPNHead
17 | IN_FEATURES: ["res4"]
18 | ROI_HEADS:
19 | NAME: "CLIPRes5ROIHeads"
20 | IN_FEATURES: ["res4"]
21 | NUM_CLASSES: 48 # base categories
22 | SCORE_THRESH_TEST: 0.001
23 | ROI_BOX_HEAD:
24 | NAME: ""
25 | NUM_FC: 0
26 | POOLER_RESOLUTION: 14
27 | CLS_AGNOSTIC_BBOX_REG: True
28 | ROI_MASK_HEAD:
29 | NAME: "MaskRCNNConvUpsampleHead"
30 | NUM_CONV: 0
31 | POOLER_RESOLUTION: 14
32 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
33 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
34 | CLIP:
35 | CROP_REGION_TYPE: "RPN"
36 | USE_TEXT_EMB_CLASSIFIER: True
37 | CLSS_TEMP: 0.01
38 | NO_BOX_DELTA: False
39 | BG_CLS_LOSS_WEIGHT: 0.2
40 | FOCAL_SCALED_LOSS: 0.5
41 | INPUT:
42 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
43 | DATASETS:
44 | TRAIN: ("coco_2017_ovd_b_train",)
45 | TEST: ("coco_2017_ovd_all_test",)
46 | TEST:
47 | EVAL_PERIOD: 25000
48 | SOLVER:
49 | IMS_PER_BATCH: 16
50 | BASE_LR: 0.002
51 | STEPS: (60000, 80000)
52 | MAX_ITER: 90000
53 | WARMUP_ITERS: 5000
54 | CHECKPOINT_PERIOD: 10000
55 | INPUT:
56 | MIN_SIZE_TRAIN_SAMPLING: choice
57 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
58 | MAX_SIZE_TRAIN: 1333
59 | MIN_SIZE_TEST: 800
60 | MAX_SIZE_TEST: 1333
61 | FORMAT: "RGB"
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_coco80.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml"
2 | MODEL:
3 | ROI_HEADS:
4 | NUM_CLASSES: 80
5 | DATASETS:
6 | TRAIN: ("coco_2017_train",)
7 | TEST: ("coco_2017_val",)
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_testb.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml"
2 | DATASETS:
3 | TEST: ("coco_2017_ovd_b_test",)
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_testt.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml"
2 | DATASETS:
3 | TEST: ("coco_2017_ovd_t_test",)
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml"
2 | MODEL:
3 | ROI_HEADS:
4 | NUM_CLASSES: 65
5 | NMS_THRESH_TEST: 0.5
6 | CLIP:
7 | NO_BOX_DELTA: True # no box refinement
8 | OFFLINE_RPN_NMS_THRESH: 0.7
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf_clipWeights.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml"
2 | MODEL:
3 | BACKBONE:
4 | NAME: "build_clip_resnet_backbone_from_pretrain"
5 | ROI_HEADS:
6 | NUM_CLASSES: 65
7 | NMS_THRESH_TEST: 0.5
8 | CLIP:
9 | NO_BOX_DELTA: True # no box refinement
10 | OFFLINE_RPN_NMS_THRESH: 0.9
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml"
2 | MODEL:
3 | ROI_HEADS:
4 | NUM_CLASSES: 65 # base + novel categories
5 | OVD:
6 | WITH_PSEUDO_LABELS: True
7 | USE_ADAPTIVE_THRES: True
8 | PL_THRESHOLD: 0.8 # init pl threshold
9 | PL_NMS_THRES: 0.5
10 | RPN_FUSION_METHOD: "avg_norm_scores"
11 | USE_PERIODIC_UPDATE: True
12 | BOX_CONFIDENCE_THRES: 1.0 # only use pseudo boxes with confidence > BOX_CONFIDENCE_THRES. 1.0 means no pseudo boxes
13 | USE_CONFIDENCE_WEIGHT: True
14 | DATASETS:
15 | TRAIN: ("coco_2017_ovd_b_train_65cats",)
16 | TEST: ("coco_2017_ovd_all_test",)
17 | DATALOADER:
18 | FILTER_EMPTY_ANNOTATIONS: False # empty images may contain novel categories
19 | SOLVER:
20 | CHECKPOINT_PERIOD: 10000
21 | TEST:
22 | EVAL_PERIOD: 5000
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_PLs_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml"
2 | SOLVER:
3 | STEPS: (210000, 250000)
4 | MAX_ITER: 270000
5 | CHECKPOINT_PERIOD: 20000
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_vldet.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml"
2 | MODEL:
3 | CLIP:
4 | CROP_REGION_TYPE: "RPN"
5 | # OFFLINE_RPN_NMS_THRESH: 0.3 # will change offline_cfg.MODEL.RPN.NMS_THRESH, will affect the eval performance
6 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST
7 | PRETRAIN_SAMPLE_REGIONS: 32 # num_regions_per_img, topk in box selection
8 | # for inference
9 | NO_BOX_DELTA: False # check
10 | USE_TEXT_EMB_CLASSIFIER: True
11 | MULTIPLY_RPN_SCORE: False # check
12 | WEAK_LOSS:
13 | WEAK_LOSS_WEIGHT: 0.01
14 | BOX_SELECT_THRES: 0.97 # threshold in box selection
15 | NEG_CONCEPT_NUM: 10
16 | DATASETS:
17 | TRAIN: ("coco_2017_ovd_b_train", "coco_caption_nouns_train_4764tags",) # coco_2017_ovd_b_train with 48 cats
18 | TEST: ("coco_generalized_del_val",)
19 | INPUT:
20 | CUSTOM_AUG: ResizeShortestEdge
21 | MIN_SIZE_TRAIN_SAMPLING: range
22 | MIN_SIZE_TRAIN: (800, 800)
23 | DATALOADER:
24 | SAMPLER_TRAIN: "MultiDatasetSampler"
25 | DATASET_RATIO: [1, 4]
26 | USE_DIFF_BS_SIZE: True
27 | DATASET_BS: [2, 8]
28 | USE_RFS: [False, False]
29 | DATASET_MIN_SIZES: [[800, 800], [400, 400]]
30 | DATASET_MAX_SIZES: [1333, 667]
31 | FILTER_EMPTY_ANNOTATIONS: False
32 | MULTI_DATASET_GROUPING: True
33 | DATASET_ANN: ['box', 'caption']
34 | NUM_WORKERS: 8
35 | TEST:
36 | EVAL_PERIOD: 10000
37 | FIND_UNUSED_PARAM: True
38 | WITH_IMAGE_LABELS: True
39 | OUTPUT_DIR: output/test
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_2x_PLs_per4k_clsBoxConf.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml"
2 | MODEL:
3 | OVD:
4 | WITH_PSEUDO_LABELS: True
5 | USE_ADAPTIVE_THRES: True
6 | PL_THRESHOLD: 0.9
7 | MIN_AVG_PLS: 1.0
8 | MAX_AVG_PLS: 3.0
9 | PL_NMS_THRES: 0.5
10 | RPN_FUSION_METHOD: "avg_norm_scores"
11 | CATEGORY_INFO: "datasets/coco_ovd_continue_cat_ids.json"
12 | # periodic update
13 | USE_PERIODIC_UPDATE: True
14 | PERIODIC_STEPS: (40000, 80000, 120000, 160000)
15 | # box reg
16 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes
17 | SOLVER:
18 | STEPS: (120000, 160000)
19 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
20 | CHECKPOINT_PERIOD: 20000
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_fCLIP_PLs_clsBoxConf.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "MyCLIPFastRCNN"
4 | ROI_HEADS:
5 | NUM_CLASSES: 65 # base + novel categories
6 | OVD:
7 | WITH_PSEUDO_LABELS: True
8 | USE_ADAPTIVE_THRES: True
9 | PL_THRESHOLD: 0.9
10 | MIN_AVG_PLS: 1.0
11 | MAX_AVG_PLS: 3.0
12 | PL_NMS_THRES: 0.5
13 | RPN_FUSION_METHOD: "avg_norm_scores"
14 | CATEGORY_INFO: "datasets/coco_ovd_continue_cat_ids.json"
15 | # periodic update
16 | USE_PERIODIC_UPDATE: True
17 | PERIODIC_STEPS: (40000, 60000, 80000)
18 | # box reg
19 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes
20 | DATASETS:
21 | TRAIN: ("coco_2017_ovd_b_train_65cats",)
22 | TEST: ("coco_2017_ovd_all_test",)
23 | SOLVER:
24 | CHECKPOINT_PERIOD: 10000
25 | TEST:
26 | EVAL_PERIOD: 5000
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_fCLIP_offline_PLs.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "MyCLIPFastRCNN"
4 | ROI_HEADS:
5 | NUM_CLASSES: 65 # base + novel categories
6 | OVD:
7 | WITH_PSEUDO_LABELS: False # no online PLs
8 | # box reg
9 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes
10 | DATASETS:
11 | TRAIN: ("",)
12 | TEST: ("coco_2017_ovd_all_test",)
13 | SOLVER:
14 | CHECKPOINT_PERIOD: 10000
15 | TEST:
16 | EVAL_PERIOD: 5000
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_frozen_CLIP_RPN.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "MyCLIPFastRCNN"
4 | # IGNORE_CLS_LOSS: True
5 | CLIP:
6 | FREEZE_BACKBONE: True
7 | SOLVER:
8 | IMS_PER_BATCH: 16
9 | BASE_LR: 0.002
10 | STEPS: (60000, 80000)
11 | MAX_ITER: 90000
12 | WARMUP_ITERS: 5000
13 | CHECKPOINT_PERIOD: 10000
14 | TEST:
15 | EVAL_PERIOD: 5000
16 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_CLIP_R_50_C4_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "GeneralizedRCNN"
4 | BACKBONE:
5 | NAME: "build_clip_resnet_backbone" #"build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone"
6 | FREEZE_AT: 2
7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
8 | MASK_ON: True
9 | RESNETS:
10 | DEPTH: 50
11 | OUT_FEATURES: ["res4"]
12 | NORM: FrozenBN
13 | STEM_OUT_CHANNELS: 64
14 | RES2_OUT_CHANNELS: 256
15 | RPN:
16 | HEAD_NAME: StandardRPNHead
17 | IN_FEATURES: ["res4"]
18 | ROI_HEADS:
19 | NAME: "CLIPRes5ROIHeads" # "Res5ROIHeads" # "StandardROIHeads"
20 | IN_FEATURES: ["res4"]
21 | ROI_BOX_HEAD:
22 | NAME: ""
23 | NUM_FC: 0
24 | POOLER_RESOLUTION: 14
25 | ROI_MASK_HEAD:
26 | NAME: "MaskRCNNConvUpsampleHead"
27 | NUM_CONV: 0
28 | POOLER_RESOLUTION: 14
29 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 116.280, 123.675] #
30 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] #
31 | INPUT:
32 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
33 | TEST:
34 | EVAL_PERIOD: 50000
35 | SOLVER:
36 | IMS_PER_BATCH: 16
37 | BASE_LR: 0.02
38 | STEPS: (60000, 80000)
39 | MAX_ITER: 90000
40 | INPUT:
41 | MIN_SIZE_TRAIN_SAMPLING: choice
42 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
43 | MAX_SIZE_TRAIN: 1333
44 | MIN_SIZE_TEST: 800
45 | MAX_SIZE_TEST: 1333
46 | FORMAT: "RGB" # "BGR"
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_CLIP_R_50_C4_1x_ovd_FSD.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "GeneralizedRCNN"
4 | BACKBONE:
5 | NAME: "build_clip_resnet_backbone" #"build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone"
6 | FREEZE_AT: 2
7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
8 | MASK_ON: True
9 | RESNETS:
10 | DEPTH: 50
11 | OUT_FEATURES: ["res4"]
12 | NORM: FrozenBN
13 | STEM_OUT_CHANNELS: 64
14 | RES2_OUT_CHANNELS: 256
15 | RPN:
16 | HEAD_NAME: StandardRPNHead
17 | IN_FEATURES: ["res4"]
18 | ROI_HEADS:
19 | NAME: "CLIPRes5ROIHeads" # "Res5ROIHeads" # "StandardROIHeads"
20 | IN_FEATURES: ["res4"]
21 | NUM_CLASSES: 48
22 | ROI_BOX_HEAD:
23 | NAME: ""
24 | NUM_FC: 0
25 | POOLER_RESOLUTION: 14
26 | ROI_MASK_HEAD:
27 | NAME: "MaskRCNNConvUpsampleHead"
28 | NUM_CONV: 0
29 | POOLER_RESOLUTION: 14
30 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 116.280, 123.675] #
31 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] #
32 | INPUT:
33 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
34 | DATASETS:
35 | TRAIN: ("coco_2017_ovd_b_train",)
36 | TEST: ("coco_2017_ovd_b_test",)
37 | TEST:
38 | EVAL_PERIOD: 50000
39 | SOLVER:
40 | IMS_PER_BATCH: 16
41 | BASE_LR: 0.02
42 | STEPS: (60000, 80000)
43 | MAX_ITER: 90000
44 | INPUT:
45 | MIN_SIZE_TRAIN_SAMPLING: choice
46 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
47 | MAX_SIZE_TRAIN: 1333
48 | MIN_SIZE_TEST: 800
49 | MAX_SIZE_TEST: 1333
50 | FORMAT: "RGB" # "BGR"
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 101
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-DilatedC5.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 101
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 101
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.py:
--------------------------------------------------------------------------------
1 | from ..common.train import train
2 | from ..common.optim import SGD as optimizer
3 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
4 | from ..common.data.coco import dataloader
5 | from ..common.models.mask_rcnn_c4 import model
6 |
7 | model.backbone.freeze_at = 2
8 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_HEADS:
8 | NUM_CLASSES: 48
9 | DATASETS:
10 | TRAIN: ("coco_2017_ovd_b_train",)
11 | TEST: ("coco_2017_ovd_all_test",)
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_coco65.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_HEADS:
8 | NUM_CLASSES: 65
9 | DATASETS:
10 | TRAIN: ("coco_2017_ovd_all_train",)
11 | TEST: ("coco_2017_ovd_all_test",)
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-DilatedC5.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-DilatedC5.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.py:
--------------------------------------------------------------------------------
1 | from ..common.optim import SGD as optimizer
2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
3 | from ..common.data.coco import dataloader
4 | from ..common.models.mask_rcnn_fpn import model
5 | from ..common.train import train
6 |
7 | model.backbone.bottom_up.freeze_at = 2
8 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x_giou.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | RPN:
8 | BBOX_REG_LOSS_TYPE: "giou"
9 | BBOX_REG_LOSS_WEIGHT: 2.0
10 | ROI_BOX_HEAD:
11 | BBOX_REG_LOSS_TYPE: "giou"
12 | BBOX_REG_LOSS_WEIGHT: 10.0
13 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x_ovd_FSD.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_HEADS:
8 | NUM_CLASSES: 48
9 | DATASETS:
10 | TRAIN: ("coco_2017_ovd_b_train",)
11 | TEST: ("coco_2017_ovd_b_test",)
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x_ovd_coco65.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_HEADS:
8 | NUM_CLASSES: 65
9 | DATASETS:
10 | TRAIN: ("coco_2017_ovd_all_train",)
11 | TEST: ("coco_2017_ovd_all_test",)
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | MASK_ON: True
4 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
5 | PIXEL_STD: [57.375, 57.120, 58.395]
6 | RESNETS:
7 | STRIDE_IN_1X1: False # this is a C2 model
8 | NUM_GROUPS: 32
9 | WIDTH_PER_GROUP: 8
10 | DEPTH: 101
11 | SOLVER:
12 | STEPS: (210000, 250000)
13 | MAX_ITER: 270000
14 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_regnetx_4gf_dds_fpn_1x.py:
--------------------------------------------------------------------------------
1 | from ..common.optim import SGD as optimizer
2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
3 | from ..common.data.coco import dataloader
4 | from ..common.models.mask_rcnn_fpn import model
5 | from ..common.train import train
6 |
7 | from detectron2.config import LazyCall as L
8 | from detectron2.modeling.backbone import RegNet
9 | from detectron2.modeling.backbone.regnet import SimpleStem, ResBottleneckBlock
10 |
11 |
12 | # Replace default ResNet with RegNetX-4GF from the DDS paper. Config source:
13 | # https://github.com/facebookresearch/pycls/blob/2c152a6e5d913e898cca4f0a758f41e6b976714d/configs/dds_baselines/regnetx/RegNetX-4.0GF_dds_8gpu.yaml#L4-L9 # noqa
14 | model.backbone.bottom_up = L(RegNet)(
15 | stem_class=SimpleStem,
16 | stem_width=32,
17 | block_class=ResBottleneckBlock,
18 | depth=23,
19 | w_a=38.65,
20 | w_0=96,
21 | w_m=2.43,
22 | group_width=40,
23 | freeze_at=2,
24 | norm="FrozenBN",
25 | out_features=["s1", "s2", "s3", "s4"],
26 | )
27 | model.pixel_std = [57.375, 57.120, 58.395]
28 |
29 | optimizer.weight_decay = 5e-5
30 | train.init_checkpoint = (
31 | "https://dl.fbaipublicfiles.com/pycls/dds_baselines/160906383/RegNetX-4.0GF_dds_8gpu.pyth"
32 | )
33 | # RegNets benefit from enabling cudnn benchmark mode
34 | train.cudnn_benchmark = True
35 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_regnety_4gf_dds_fpn_1x.py:
--------------------------------------------------------------------------------
1 | from ..common.optim import SGD as optimizer
2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
3 | from ..common.data.coco import dataloader
4 | from ..common.models.mask_rcnn_fpn import model
5 | from ..common.train import train
6 |
7 | from detectron2.config import LazyCall as L
8 | from detectron2.modeling.backbone import RegNet
9 | from detectron2.modeling.backbone.regnet import SimpleStem, ResBottleneckBlock
10 |
11 |
12 | # Replace default ResNet with RegNetY-4GF from the DDS paper. Config source:
13 | # https://github.com/facebookresearch/pycls/blob/2c152a6e5d913e898cca4f0a758f41e6b976714d/configs/dds_baselines/regnety/RegNetY-4.0GF_dds_8gpu.yaml#L4-L10 # noqa
14 | model.backbone.bottom_up = L(RegNet)(
15 | stem_class=SimpleStem,
16 | stem_width=32,
17 | block_class=ResBottleneckBlock,
18 | depth=22,
19 | w_a=31.41,
20 | w_0=96,
21 | w_m=2.24,
22 | group_width=64,
23 | se_ratio=0.25,
24 | freeze_at=2,
25 | norm="FrozenBN",
26 | out_features=["s1", "s2", "s3", "s4"],
27 | )
28 | model.pixel_std = [57.375, 57.120, 58.395]
29 |
30 | optimizer.weight_decay = 5e-5
31 | train.init_checkpoint = (
32 | "https://dl.fbaipublicfiles.com/pycls/dds_baselines/160906838/RegNetY-4.0GF_dds_8gpu.pyth"
33 | )
34 | # RegNets benefit from enabling cudnn benchmark mode
35 | train.cudnn_benchmark = True
36 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv0.5-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 101
7 | ROI_HEADS:
8 | NUM_CLASSES: 1230
9 | SCORE_THRESH_TEST: 0.0001
10 | INPUT:
11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
12 | DATASETS:
13 | TRAIN: ("lvis_v0.5_train",)
14 | TEST: ("lvis_v0.5_val",)
15 | TEST:
16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
17 | DATALOADER:
18 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
19 | REPEAT_THRESHOLD: 0.001
20 |
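The LVIS configs here pair SAMPLER_TRAIN: "RepeatFactorTrainingSampler" with REPEAT_THRESHOLD: 0.001 to oversample images that contain rare categories. In the LVIS repeat-factor scheme used by detectron2, a category that appears in a fraction f of training images gets a repeat factor of max(1, sqrt(t / f)) with t = REPEAT_THRESHOLD, and each image is repeated according to the rarest category it contains. A rough illustration of the per-category factor (the function name is just for this sketch):

    import math

    def category_repeat_factor(category_image_freq, repeat_threshold=0.001):
        # LVIS-style repeat factor: r(c) = max(1, sqrt(t / f(c))).
        # e.g. a category seen in 0.01% of images gets sqrt(0.001 / 0.0001) ~= 3.16x,
        # while categories above the threshold keep a factor of 1 (no oversampling).
        return max(1.0, math.sqrt(repeat_threshold / category_image_freq))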
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv0.5-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_HEADS:
8 | NUM_CLASSES: 1230
9 | SCORE_THRESH_TEST: 0.0001
10 | INPUT:
11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
12 | DATASETS:
13 | TRAIN: ("lvis_v0.5_train",)
14 | TEST: ("lvis_v0.5_val",)
15 | TEST:
16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
17 | DATALOADER:
18 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
19 | REPEAT_THRESHOLD: 0.001
20 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
4 | PIXEL_STD: [57.375, 57.120, 58.395]
5 | MASK_ON: True
6 | RESNETS:
7 | STRIDE_IN_1X1: False # this is a C2 model
8 | NUM_GROUPS: 32
9 | WIDTH_PER_GROUP: 8
10 | DEPTH: 101
11 | ROI_HEADS:
12 | NUM_CLASSES: 1230
13 | SCORE_THRESH_TEST: 0.0001
14 | INPUT:
15 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
16 | DATASETS:
17 | TRAIN: ("lvis_v0.5_train",)
18 | TEST: ("lvis_v0.5_val",)
19 | TEST:
20 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
21 | DATALOADER:
22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
23 | REPEAT_THRESHOLD: 0.001
24 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "CLIPFastRCNN"
4 | BACKBONE:
5 | NAME: "build_clip_resnet_backbone"
6 | FREEZE_AT: 2
7 | WEIGHTS: ""
8 | MASK_ON: True
9 | RESNETS:
10 | DEPTH: 50
11 | OUT_FEATURES: ["res4"]
12 | NORM: FrozenBN
13 | STEM_OUT_CHANNELS: 64
14 | RES2_OUT_CHANNELS: 256
15 | RPN:
16 | HEAD_NAME: StandardRPNHead
17 | IN_FEATURES: ["res4"]
18 | ROI_HEADS:
19 | NAME: "CLIPRes5ROIHeads"
20 | IN_FEATURES: ["res4"]
21 | NUM_CLASSES: 866 # 1203
22 | SCORE_THRESH_TEST: 0.02
23 | ROI_BOX_HEAD:
24 | NAME: ""
25 | NUM_FC: 0
26 | POOLER_RESOLUTION: 14
27 | CLS_AGNOSTIC_BBOX_REG: True
28 | ROI_MASK_HEAD:
29 | NAME: "MaskRCNNConvUpsampleHead"
30 | NUM_CONV: 0
31 | POOLER_RESOLUTION: 14
32 | CLS_AGNOSTIC_MASK: True
33 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
34 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
35 | CLIP:
36 | CROP_REGION_TYPE: "RPN"
37 | USE_TEXT_EMB_CLASSIFIER: True
38 | CLSS_TEMP: 0.01
39 | NO_BOX_DELTA: False
40 | BG_CLS_LOSS_WEIGHT: 0.8
41 | MULTIPLY_RPN_SCORE: True
42 | INPUT:
43 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
44 | DATASETS:
45 | TRAIN: ("lvis_v1_train",)
46 | TEST: ("lvis_v1_val",)
47 | TEST:
48 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
49 | EVAL_PERIOD: 25000
50 | SOLVER:
51 | IMS_PER_BATCH: 16
52 | BASE_LR: 0.002
53 | STEPS: (120000, 160000)
54 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
55 | WARMUP_ITERS: 5000
56 | DATALOADER:
57 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
58 | REPEAT_THRESHOLD: 0.001
59 | INPUT:
60 | MIN_SIZE_TRAIN_SAMPLING: choice
61 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
62 | MAX_SIZE_TRAIN: 1333
63 | MIN_SIZE_TEST: 800
64 | MAX_SIZE_TEST: 1333
65 | FORMAT: "RGB"
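This config classifies regions with CLIP text embeddings (USE_TEXT_EMB_CLASSIFIER: True) rather than a learned classification layer, scaling similarities by CLSS_TEMP: 0.01. The usual CLIP-style scoring step looks roughly like the sketch below, assuming unit-normalized features; the names are illustrative rather than the project's own:

    import torch
    import torch.nn.functional as F

    def text_embedding_scores(region_feats, text_embs, temperature=0.01):
        # region_feats: (N, D) pooled region features; text_embs: (C, D) class text embeddings.
        region_feats = F.normalize(region_feats, dim=-1)
        text_embs = F.normalize(text_embs, dim=-1)
        logits = region_feats @ text_embs.t() / temperature  # cosine similarity / temperature
        return logits.softmax(dim=-1)

With MULTIPLY_RPN_SCORE: True, the resulting class scores are then combined with the RPN objectness scores at inference, as the flag name suggests.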
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_custom_img.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4.yaml"
2 | MODEL:
3 | MASK_ON: False
4 | ROI_HEADS:
5 | NUM_CLASSES: 1203
6 | NMS_THRESH_TEST: 0.3
7 | CLIP:
8 | NO_BOX_DELTA: True
9 | OFFLINE_RPN_NMS_THRESH: 0.9
10 | VIS: True # Note: visualize the scores before multiplying by RPN scores, if any
11 | DATASETS:
12 | TRAIN: ("lvis_v1_train_custom_img",)
13 | TEST: ("lvis_v1_val_custom_img",)
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4.yaml"
2 | MODEL:
3 | MASK_ON: False
4 | ROI_HEADS:
5 | NUM_CLASSES: 1203
6 | NMS_THRESH_TEST: 0.5
7 | CLIP:
8 | NO_BOX_DELTA: True
9 | OFFLINE_RPN_NMS_THRESH: 0.9
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf_clipWeights.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4.yaml"
2 | MODEL:
3 | MASK_ON: False
4 | BACKBONE:
5 | NAME: "build_clip_resnet_backbone_from_pretrain"
6 | ROI_HEADS:
7 | NUM_CLASSES: 1203
8 | NMS_THRESH_TEST: 0.5
9 | CLIP:
10 | NO_BOX_DELTA: True
11 | OFFLINE_RPN_NMS_THRESH: 0.9
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/customized/ovd_lvis_box_PLs_periodic_boxConf.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4.yaml"
2 | MODEL:
3 | ROI_HEADS:
4 | NUM_CLASSES: 1203 # base + novel categories
5 | OVD:
6 | WITH_PSEUDO_LABELS: True
7 | #
8 | USE_ADAPTIVE_THRES: True
9 | PL_NMS_THRES: 0.5
10 | PL_THRESHOLD: 0.925
11 | MIN_AVG_PLS: 2.0
12 | MAX_AVG_PLS: 3.0
13 | ADAPTIVE_THRES_DELTA: 0.005
14 | RPN_FUSION_METHOD: "avg_logits"
15 | # CATEGORY_INFO: "datasets/lvis_ovd_continue_cat_ids.json"
16 | # periodic update
17 | USE_PERIODIC_UPDATE: True
18 | PERIODIC_STEPS: (120000, 160000)
19 | # box reg
20 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes
21 | DATASETS:
22 | TRAIN: ("lvis_v1_train_base_1203cats",)
23 | TEST: ("lvis_v1_val",)
24 | SOLVER:
25 | IMS_PER_BATCH: 16
26 | BASE_LR: 0.002
27 | STEPS: (120000, 160000)
28 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
29 | WARMUP_ITERS: 5000
30 | CHECKPOINT_PERIOD: 20000
31 | TEST:
32 | EVAL_PERIOD: 20000
33 | OUTPUT_DIR: output/ovd_lvis_ft_PLs_per4kUpdate_boxConf
34 |
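The MODEL.OVD block drives online pseudo labels (PLs) for the novel categories: PL_THRESHOLD is the score cutoff for accepting PLs, PL_NMS_THRES deduplicates them, and PERIODIC_STEPS schedules refreshes of the PL set at those iterations when USE_PERIODIC_UPDATE is on. The USE_ADAPTIVE_THRES / MIN_AVG_PLS / MAX_AVG_PLS / ADAPTIVE_THRES_DELTA keys read as keeping the average number of PLs per image inside a target band by nudging the threshold; the real update rule lives in the sas_det code, so the following is only a guessed sketch of that behavior, not the project's implementation:

    def adjust_pl_threshold(threshold, avg_pls_per_image,
                            min_avg=2.0, max_avg=3.0, delta=0.005):
        # Hypothetical adaptive rule: too many pseudo labels per image -> raise the
        # score threshold; too few -> lower it; otherwise keep it unchanged.
        if avg_pls_per_image > max_avg:
            return threshold + delta
        if avg_pls_per_image < min_avg:
            return threshold - delta
        return threshold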
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/customized/ovd_lvis_fCLIP_PLs_clsBoxConf.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "MyCLIPFastRCNN"
4 | ROI_HEADS:
5 | NUM_CLASSES: 1203 # base + novel categories
6 | OVD:
7 | WITH_PSEUDO_LABELS: True
8 | #
9 | USE_ADAPTIVE_THRES: True
10 | PL_NMS_THRES: 0.5
11 | PL_THRESHOLD: 0.925
12 | MIN_AVG_PLS: 1.0
13 | MAX_AVG_PLS: 3.0
14 | ADAPTIVE_THRES_DELTA: 0.005
15 | RPN_FUSION_METHOD: "avg_logits"
16 | CATEGORY_INFO: "datasets/lvis_ovd_continue_cat_ids.json"
17 | # periodic update
18 | USE_PERIODIC_UPDATE: True
19 | PERIODIC_STEPS: (40000, 80000, 120000, 160000)
20 | # box reg
21 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes
22 | DATASETS:
23 | TRAIN: ("lvis_v1_train_base_1203cats",)
24 | TEST: ("lvis_v1_val",)
25 | SOLVER:
26 | IMS_PER_BATCH: 16
27 | BASE_LR: 0.002
28 | STEPS: (120000, 160000)
29 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
30 | WARMUP_ITERS: 5000
31 | CHECKPOINT_PERIOD: 20000
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/customized/ovd_lvis_frozen_CLIP_RPN.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "MyCLIPFastRCNN"
4 | # IGNORE_CLS_LOSS: True
5 | CLIP:
6 | FREEZE_BACKBONE: True
7 | SOLVER:
8 | IMS_PER_BATCH: 16
9 | BASE_LR: 0.002
10 | STEPS: (60000, 80000)
11 | MAX_ITER: 90000
12 | WARMUP_ITERS: 5000
13 | CHECKPOINT_PERIOD: 10000
14 | TEST:
15 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
16 | EVAL_PERIOD: 20000
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_CLIP_R_50_C4_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "GeneralizedRCNN"
4 | BACKBONE:
5 | NAME: "build_clip_resnet_backbone" #"build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone"
6 | FREEZE_AT: 2
7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
8 | MASK_ON: True
9 | RESNETS:
10 | DEPTH: 50
11 | OUT_FEATURES: ["res4"]
12 | NORM: FrozenBN
13 | STEM_OUT_CHANNELS: 64
14 | RES2_OUT_CHANNELS: 256
15 | RPN:
16 | HEAD_NAME: StandardRPNHead
17 | IN_FEATURES: ["res4"]
18 | ROI_HEADS:
19 | NAME: "CLIPRes5ROIHeads" # "Res5ROIHeads" # "StandardROIHeads"
20 | IN_FEATURES: ["res4"]
21 | NUM_CLASSES: 1203
22 | SCORE_THRESH_TEST: 0.0001
23 | ROI_BOX_HEAD:
24 | NAME: ""
25 | NUM_FC: 0
26 | POOLER_RESOLUTION: 14
27 | ROI_MASK_HEAD:
28 | NAME: "MaskRCNNConvUpsampleHead"
29 | NUM_CONV: 0
30 | POOLER_RESOLUTION: 14
31 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 116.280, 123.675] #
32 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] #
33 | INPUT:
34 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
35 | DATASETS:
36 | TRAIN: ("lvis_v1_train",)
37 | TEST: ("lvis_v1_val",)
38 | TEST:
39 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
40 | EVAL_PERIOD: 25000
41 | SOLVER:
42 | IMS_PER_BATCH: 16
43 | BASE_LR: 0.02
44 | STEPS: (120000, 160000) # (140000,) #
45 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
46 | DATALOADER:
47 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
48 | REPEAT_THRESHOLD: 0.001
49 | INPUT:
50 | MIN_SIZE_TRAIN_SAMPLING: choice
51 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
52 | MAX_SIZE_TRAIN: 1333
53 | MIN_SIZE_TEST: 800
54 | MAX_SIZE_TEST: 1333
55 | FORMAT: "RGB" # "BGR"
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_CLIP_R_50_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "GeneralizedRCNN"
4 | BACKBONE:
5 | NAME: "build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone"
6 | FREEZE_AT: 2
7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
8 | MASK_ON: True
9 | RESNETS:
10 | DEPTH: 50
11 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
12 | NORM: FrozenBN
13 | STEM_OUT_CHANNELS: 64
14 | RES2_OUT_CHANNELS: 256
15 | FPN:
16 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
17 | OUT_CHANNELS: 256
18 | FUSE_TYPE: sum
19 | RPN:
20 | HEAD_NAME: StandardRPNHead
21 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
22 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
23 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level
24 | # Detectron1 uses 2000 proposals per-batch,
25 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
26 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
27 | POST_NMS_TOPK_TRAIN: 1000
28 | POST_NMS_TOPK_TEST: 1000
29 | ROI_HEADS:
30 | NAME: "StandardROIHeads"
31 | IN_FEATURES: ["p2", "p3", "p4", "p5"]
32 | NUM_CLASSES: 1203
33 | SCORE_THRESH_TEST: 0.0001
34 | ROI_BOX_HEAD:
35 | NAME: "FastRCNNConvFCHead"
36 | NUM_FC: 2
37 | POOLER_RESOLUTION: 7
38 | ROI_MASK_HEAD:
39 | NAME: "MaskRCNNConvUpsampleHead"
40 | NUM_CONV: 4
41 | POOLER_RESOLUTION: 14
42 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 116.280, 123.675] #
43 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] #
44 | INPUT:
45 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
46 | DATASETS:
47 | TRAIN: ("lvis_v1_train",)
48 | TEST: ("lvis_v1_val",)
49 | TEST:
50 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
51 | EVAL_PERIOD: 50000
52 | SOLVER:
53 | IMS_PER_BATCH: 16
54 | BASE_LR: 0.02
55 | STEPS: (120000, 160000) # (140000,) #
56 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
57 | DATALOADER:
58 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
59 | REPEAT_THRESHOLD: 0.001
60 | INPUT:
61 | MIN_SIZE_TRAIN_SAMPLING: choice
62 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
63 | MAX_SIZE_TRAIN: 1333
64 | MIN_SIZE_TEST: 800
65 | MAX_SIZE_TEST: 1333
66 | FORMAT: "RGB" # "BGR"
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 101
7 | ROI_HEADS:
8 | NUM_CLASSES: 1203
9 | SCORE_THRESH_TEST: 0.0001
10 | INPUT:
11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
12 | DATASETS:
13 | TRAIN: ("lvis_v1_train",)
14 | TEST: ("lvis_v1_val",)
15 | TEST:
16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
17 | SOLVER:
18 | STEPS: (120000, 160000)
19 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
20 | DATALOADER:
21 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
22 | REPEAT_THRESHOLD: 0.001
23 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_HEADS:
8 | NUM_CLASSES: 1203
9 | SCORE_THRESH_TEST: 0.0001
10 | INPUT:
11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
12 | DATASETS:
13 | TRAIN: ("lvis_v1_train",)
14 | TEST: ("lvis_v1_val",)
15 | TEST:
16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
17 | EVAL_PERIOD: 50000
18 | SOLVER:
19 | STEPS: (120000, 160000)
20 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
21 | DATALOADER:
22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
23 | REPEAT_THRESHOLD: 0.001
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_HEADS:
8 | NUM_CLASSES: 1203
9 | SCORE_THRESH_TEST: 0.0001
10 | INPUT:
11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
12 | DATASETS:
13 | TRAIN: ("lvis_v1_train",)
14 | TEST: ("lvis_v1_val",)
15 | TEST:
16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
17 | EVAL_PERIOD: 50000
18 | SOLVER:
19 | STEPS: (120000, 160000)
20 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
21 | DATALOADER:
22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
23 | REPEAT_THRESHOLD: 0.001
24 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_2x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_HEADS:
8 | NUM_CLASSES: 1203
9 | SCORE_THRESH_TEST: 0.0001
10 | INPUT:
11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
12 | DATASETS:
13 | TRAIN: ("lvis_v1_train",)
14 | TEST: ("lvis_v1_val",)
15 | TEST:
16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
17 | EVAL_PERIOD: 50000
18 | SOLVER:
19 | STEPS: (240000, 320000) #(120000, 160000)
20 | MAX_ITER: 360000 # 360000 * 16 / 100000 ~ 57.6 epochs
21 | DATALOADER:
22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
23 | REPEAT_THRESHOLD: 0.001
24 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
4 | PIXEL_STD: [57.375, 57.120, 58.395]
5 | MASK_ON: True
6 | RESNETS:
7 | STRIDE_IN_1X1: False # this is a C2 model
8 | NUM_GROUPS: 32
9 | WIDTH_PER_GROUP: 8
10 | DEPTH: 101
11 | ROI_HEADS:
12 | NUM_CLASSES: 1203
13 | SCORE_THRESH_TEST: 0.0001
14 | INPUT:
15 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
16 | DATASETS:
17 | TRAIN: ("lvis_v1_train",)
18 | TEST: ("lvis_v1_val",)
19 | SOLVER:
20 | STEPS: (120000, 160000)
21 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
22 | TEST:
23 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
24 | DATALOADER:
25 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
26 | REPEAT_THRESHOLD: 0.001
27 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_HEADS:
8 | NAME: CascadeROIHeads
9 | ROI_BOX_HEAD:
10 | CLS_AGNOSTIC_BBOX_REG: True
11 | RPN:
12 | POST_NMS_TOPK_TRAIN: 2000
13 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_HEADS:
8 | NAME: CascadeROIHeads
9 | ROI_BOX_HEAD:
10 | CLS_AGNOSTIC_BBOX_REG: True
11 | RPN:
12 | POST_NMS_TOPK_TRAIN: 2000
13 | SOLVER:
14 | STEPS: (210000, 250000)
15 | MAX_ITER: 270000
16 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | MASK_ON: True
4 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k"
5 | RESNETS:
6 | STRIDE_IN_1X1: False # this is a C2 model
7 | NUM_GROUPS: 32
8 | WIDTH_PER_GROUP: 8
9 | DEPTH: 152
10 | DEFORM_ON_PER_STAGE: [False, True, True, True]
11 | ROI_HEADS:
12 | NAME: "CascadeROIHeads"
13 | ROI_BOX_HEAD:
14 | NAME: "FastRCNNConvFCHead"
15 | NUM_CONV: 4
16 | NUM_FC: 1
17 | NORM: "GN"
18 | CLS_AGNOSTIC_BBOX_REG: True
19 | ROI_MASK_HEAD:
20 | NUM_CONV: 8
21 | NORM: "GN"
22 | RPN:
23 | POST_NMS_TOPK_TRAIN: 2000
24 | SOLVER:
25 | IMS_PER_BATCH: 128
26 | STEPS: (35000, 45000)
27 | MAX_ITER: 50000
28 | BASE_LR: 0.16
29 | INPUT:
30 | MIN_SIZE_TRAIN: (640, 864)
31 | MIN_SIZE_TRAIN_SAMPLING: "range"
32 | MAX_SIZE_TRAIN: 1440
33 | CROP:
34 | ENABLED: True
35 | TEST:
36 | EVAL_PERIOD: 2500
37 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | ROI_BOX_HEAD:
8 | CLS_AGNOSTIC_BBOX_REG: True
9 | ROI_MASK_HEAD:
10 | CLS_AGNOSTIC_MASK: True
11 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
8 | DEFORM_MODULATED: False
9 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
8 | DEFORM_MODULATED: False
9 | SOLVER:
10 | STEPS: (210000, 250000)
11 | MAX_ITER: 270000
12 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-50-GN"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | NORM: "GN"
8 | STRIDE_IN_1X1: False
9 | FPN:
10 | NORM: "GN"
11 | ROI_BOX_HEAD:
12 | NAME: "FastRCNNConvFCHead"
13 | NUM_CONV: 4
14 | NUM_FC: 1
15 | NORM: "GN"
16 | ROI_MASK_HEAD:
17 | NORM: "GN"
18 | SOLVER:
19 | # 3x schedule
20 | STEPS: (210000, 250000)
21 | MAX_ITER: 270000
22 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | NORM: "SyncBN"
8 | STRIDE_IN_1X1: True
9 | FPN:
10 | NORM: "SyncBN"
11 | ROI_BOX_HEAD:
12 | NAME: "FastRCNNConvFCHead"
13 | NUM_CONV: 4
14 | NUM_FC: 1
15 | NORM: "SyncBN"
16 | ROI_MASK_HEAD:
17 | NORM: "SyncBN"
18 | SOLVER:
19 | # 3x schedule
20 | STEPS: (210000, 250000)
21 | MAX_ITER: 270000
22 | TEST:
23 | PRECISE_BN:
24 | ENABLED: True
25 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/mmdet_mask_rcnn_R_50_FPN_1x.py:
--------------------------------------------------------------------------------
1 | # An example config to train a mmdetection model using detectron2.
2 |
3 | from ..common.data.coco import dataloader
4 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
5 | from ..common.optim import SGD as optimizer
6 | from ..common.train import train
7 |
8 | from detectron2.modeling.mmdet_wrapper import MMDetDetector
9 | from detectron2.config import LazyCall as L
10 |
11 | model = L(MMDetDetector)(
12 | detector=dict(
13 | type="MaskRCNN",
14 | pretrained="torchvision://resnet50",
15 | backbone=dict(
16 | type="ResNet",
17 | depth=50,
18 | num_stages=4,
19 | out_indices=(0, 1, 2, 3),
20 | frozen_stages=1,
21 | norm_cfg=dict(type="BN", requires_grad=True),
22 | norm_eval=True,
23 | style="pytorch",
24 | ),
25 | neck=dict(type="FPN", in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5),
26 | rpn_head=dict(
27 | type="RPNHead",
28 | in_channels=256,
29 | feat_channels=256,
30 | anchor_generator=dict(
31 | type="AnchorGenerator",
32 | scales=[8],
33 | ratios=[0.5, 1.0, 2.0],
34 | strides=[4, 8, 16, 32, 64],
35 | ),
36 | bbox_coder=dict(
37 | type="DeltaXYWHBBoxCoder",
38 | target_means=[0.0, 0.0, 0.0, 0.0],
39 | target_stds=[1.0, 1.0, 1.0, 1.0],
40 | ),
41 | loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=1.0),
42 | loss_bbox=dict(type="L1Loss", loss_weight=1.0),
43 | ),
44 | roi_head=dict(
45 | type="StandardRoIHead",
46 | bbox_roi_extractor=dict(
47 | type="SingleRoIExtractor",
48 | roi_layer=dict(type="RoIAlign", output_size=7, sampling_ratio=0),
49 | out_channels=256,
50 | featmap_strides=[4, 8, 16, 32],
51 | ),
52 | bbox_head=dict(
53 | type="Shared2FCBBoxHead",
54 | in_channels=256,
55 | fc_out_channels=1024,
56 | roi_feat_size=7,
57 | num_classes=80,
58 | bbox_coder=dict(
59 | type="DeltaXYWHBBoxCoder",
60 | target_means=[0.0, 0.0, 0.0, 0.0],
61 | target_stds=[0.1, 0.1, 0.2, 0.2],
62 | ),
63 | reg_class_agnostic=False,
64 | loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=False, loss_weight=1.0),
65 | loss_bbox=dict(type="L1Loss", loss_weight=1.0),
66 | ),
67 | mask_roi_extractor=dict(
68 | type="SingleRoIExtractor",
69 | roi_layer=dict(type="RoIAlign", output_size=14, sampling_ratio=0),
70 | out_channels=256,
71 | featmap_strides=[4, 8, 16, 32],
72 | ),
73 | mask_head=dict(
74 | type="FCNMaskHead",
75 | num_convs=4,
76 | in_channels=256,
77 | conv_out_channels=256,
78 | num_classes=80,
79 | loss_mask=dict(type="CrossEntropyLoss", use_mask=True, loss_weight=1.0),
80 | ),
81 | ),
82 | # model training and testing settings
83 | train_cfg=dict(
84 | rpn=dict(
85 | assigner=dict(
86 | type="MaxIoUAssigner",
87 | pos_iou_thr=0.7,
88 | neg_iou_thr=0.3,
89 | min_pos_iou=0.3,
90 | match_low_quality=True,
91 | ignore_iof_thr=-1,
92 | ),
93 | sampler=dict(
94 | type="RandomSampler",
95 | num=256,
96 | pos_fraction=0.5,
97 | neg_pos_ub=-1,
98 | add_gt_as_proposals=False,
99 | ),
100 | allowed_border=-1,
101 | pos_weight=-1,
102 | debug=False,
103 | ),
104 | rpn_proposal=dict(
105 | nms_pre=2000,
106 | max_per_img=1000,
107 | nms=dict(type="nms", iou_threshold=0.7),
108 | min_bbox_size=0,
109 | ),
110 | rcnn=dict(
111 | assigner=dict(
112 | type="MaxIoUAssigner",
113 | pos_iou_thr=0.5,
114 | neg_iou_thr=0.5,
115 | min_pos_iou=0.5,
116 | match_low_quality=True,
117 | ignore_iof_thr=-1,
118 | ),
119 | sampler=dict(
120 | type="RandomSampler",
121 | num=512,
122 | pos_fraction=0.25,
123 | neg_pos_ub=-1,
124 | add_gt_as_proposals=True,
125 | ),
126 | mask_size=28,
127 | pos_weight=-1,
128 | debug=False,
129 | ),
130 | ),
131 | test_cfg=dict(
132 | rpn=dict(
133 | nms_pre=1000,
134 | max_per_img=1000,
135 | nms=dict(type="nms", iou_threshold=0.7),
136 | min_bbox_size=0,
137 | ),
138 | rcnn=dict(
139 | score_thr=0.05,
140 | nms=dict(type="nms", iou_threshold=0.5),
141 | max_per_img=100,
142 | mask_thr_binary=0.5,
143 | ),
144 | ),
145 | ),
146 | pixel_mean=[123.675, 116.280, 103.530],
147 | pixel_std=[58.395, 57.120, 57.375],
148 | )
149 |
150 | dataloader.train.mapper.image_format = "RGB" # torchvision pretrained model
151 | train.init_checkpoint = None # pretrained model is loaded inside backbone
152 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml:
--------------------------------------------------------------------------------
1 | # A large PanopticFPN for demo purposes.
2 | # Use GN on backbone to support semantic seg.
3 | # Use Cascade + Deform Conv to improve localization.
4 | _BASE_: "../COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml"
5 | MODEL:
6 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-101-GN"
7 | RESNETS:
8 | DEPTH: 101
9 | NORM: "GN"
10 | DEFORM_ON_PER_STAGE: [False, True, True, True]
11 | STRIDE_IN_1X1: False
12 | FPN:
13 | NORM: "GN"
14 | ROI_HEADS:
15 | NAME: CascadeROIHeads
16 | ROI_BOX_HEAD:
17 | CLS_AGNOSTIC_BBOX_REG: True
18 | ROI_MASK_HEAD:
19 | NORM: "GN"
20 | RPN:
21 | POST_NMS_TOPK_TRAIN: 2000
22 | SOLVER:
23 | STEPS: (105000, 125000)
24 | MAX_ITER: 135000
25 | IMS_PER_BATCH: 32
26 | BASE_LR: 0.04
27 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml"
2 | MODEL:
3 | # Train from random initialization.
4 | WEIGHTS: ""
5 | # It makes sense to divide by STD when training from scratch
6 | # But it seems to make no difference to the results, and C2's models didn't do this.
7 | # So we keep things consistent with C2.
8 | # PIXEL_STD: [57.375, 57.12, 58.395]
9 | MASK_ON: True
10 | BACKBONE:
11 | FREEZE_AT: 0
12 | # NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
13 | # to learn what you need for training from scratch.
14 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml"
2 | MODEL:
3 | PIXEL_STD: [57.375, 57.12, 58.395]
4 | WEIGHTS: ""
5 | MASK_ON: True
6 | RESNETS:
7 | STRIDE_IN_1X1: False
8 | BACKBONE:
9 | FREEZE_AT: 0
10 | SOLVER:
11 | # 9x schedule
12 | IMS_PER_BATCH: 64 # 4x the standard
13 | STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k
14 | MAX_ITER: 202500 # 90k * 9 / 4
15 | BASE_LR: 0.08
16 | TEST:
17 | EVAL_PERIOD: 2500
18 | # NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
19 | # to learn what you need for training from scratch.
20 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "mask_rcnn_R_50_FPN_3x_syncbn.yaml"
2 | MODEL:
3 | PIXEL_STD: [57.375, 57.12, 58.395]
4 | WEIGHTS: ""
5 | MASK_ON: True
6 | RESNETS:
7 | STRIDE_IN_1X1: False
8 | BACKBONE:
9 | FREEZE_AT: 0
10 | SOLVER:
11 | # 9x schedule
12 | IMS_PER_BATCH: 64 # 4x the standard
13 | STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k
14 | MAX_ITER: 202500 # 90k * 9 / 4
15 | BASE_LR: 0.08
16 | TEST:
17 | EVAL_PERIOD: 2500
18 | # NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
19 | # to learn what you need for training from scratch.
20 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/semantic_R_50_FPN_1x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "SemanticSegmentor"
4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
5 | RESNETS:
6 | DEPTH: 50
7 | DATASETS:
8 | TRAIN: ("coco_2017_train_panoptic_stuffonly",)
9 | TEST: ("coco_2017_val_panoptic_stuffonly",)
10 | INPUT:
11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
12 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/Misc/torchvision_imagenet_R_50.py:
--------------------------------------------------------------------------------
1 | """
2 | An example config file to train an ImageNet classifier with detectron2.
3 | Model and dataloader both come from torchvision.
4 | This shows how to use detectron2 as a general engine for any new models and tasks.
5 | To run, use the following command:
6 |
7 | python tools/lazyconfig_train_net.py --config-file configs/Misc/torchvision_imagenet_R_50.py \
8 | --num-gpus 8 dataloader.train.dataset.root=/path/to/imagenet/
9 | """
10 |
11 |
12 | import torch
13 | from torch import nn
14 | from torch.nn import functional as F
15 | from omegaconf import OmegaConf
16 | import torchvision
17 | from torchvision.transforms import transforms as T
18 | from torchvision.models.resnet import ResNet, Bottleneck
19 | from fvcore.common.param_scheduler import MultiStepParamScheduler
20 |
21 | from detectron2.solver import WarmupParamScheduler
22 | from detectron2.solver.build import get_default_optimizer_params
23 | from detectron2.config import LazyCall as L
24 | from detectron2.model_zoo import get_config
25 | from detectron2.data.samplers import TrainingSampler, InferenceSampler
26 | from detectron2.evaluation import DatasetEvaluator
27 | from detectron2.utils import comm
28 |
29 |
30 | def build_data_loader(dataset, batch_size, num_workers, training=True):
31 | return torch.utils.data.DataLoader(
32 | dataset,
33 | sampler=(TrainingSampler if training else InferenceSampler)(len(dataset)),
34 | batch_size=batch_size,
35 | num_workers=num_workers,
36 | pin_memory=True,
37 | )
38 |
39 |
40 | class ClassificationNet(nn.Module):
41 | def __init__(self, model: nn.Module):
42 | super().__init__()
43 | self.model = model
44 |
45 | @property
46 | def device(self):
47 | return list(self.model.parameters())[0].device
48 |
49 | def forward(self, inputs):
50 | image, label = inputs
51 | pred = self.model(image.to(self.device))
52 | if self.training:
53 | label = label.to(self.device)
54 | return F.cross_entropy(pred, label)
55 | else:
56 | return pred
57 |
58 |
59 | class ClassificationAcc(DatasetEvaluator):
60 | def reset(self):
61 | self.corr = self.total = 0
62 |
63 | def process(self, inputs, outputs):
64 | image, label = inputs
65 | self.corr += (outputs.argmax(dim=1).cpu() == label.cpu()).sum().item()
66 | self.total += len(label)
67 |
68 | def evaluate(self):
69 | all_corr_total = comm.all_gather([self.corr, self.total])
70 | corr = sum(x[0] for x in all_corr_total)
71 | total = sum(x[1] for x in all_corr_total)
72 | return {"accuracy": corr / total}
73 |
74 |
75 | dataloader = OmegaConf.create()
76 | dataloader.train = L(build_data_loader)(
77 | dataset=L(torchvision.datasets.ImageNet)(
78 | root="/path/to/imagenet",
79 | split="train",
80 | transform=L(T.Compose)(
81 | transforms=[
82 | L(T.RandomResizedCrop)(size=224),
83 | L(T.RandomHorizontalFlip)(),
84 | T.ToTensor(),
85 | L(T.Normalize)(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
86 | ]
87 | ),
88 | ),
89 | batch_size=256 // 8,
90 | num_workers=4,
91 | training=True,
92 | )
93 |
94 | dataloader.test = L(build_data_loader)(
95 | dataset=L(torchvision.datasets.ImageNet)(
96 | root="${...train.dataset.root}",
97 | split="val",
98 | transform=L(T.Compose)(
99 | transforms=[
100 | L(T.Resize)(size=256),
101 | L(T.CenterCrop)(size=224),
102 | T.ToTensor(),
103 | L(T.Normalize)(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
104 | ]
105 | ),
106 | ),
107 | batch_size=256 // 8,
108 | num_workers=4,
109 | training=False,
110 | )
111 |
112 | dataloader.evaluator = L(ClassificationAcc)()
113 |
114 | model = L(ClassificationNet)(
115 | model=(ResNet)(block=Bottleneck, layers=[3, 4, 6, 3], zero_init_residual=True)
116 | )
117 |
118 |
119 | optimizer = L(torch.optim.SGD)(
120 | params=L(get_default_optimizer_params)(),
121 | lr=0.1,
122 | momentum=0.9,
123 | weight_decay=1e-4,
124 | )
125 |
126 | lr_multiplier = L(WarmupParamScheduler)(
127 | scheduler=L(MultiStepParamScheduler)(
128 | values=[1.0, 0.1, 0.01, 0.001], milestones=[30, 60, 90, 100]
129 | ),
130 | warmup_length=1 / 100,
131 | warmup_factor=0.1,
132 | )
133 |
134 |
135 | train = get_config("common/train.py").train
136 | train.init_checkpoint = None
137 | train.max_iter = 100 * 1281167 // 256
138 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/README.md:
--------------------------------------------------------------------------------
1 | This directory provides definitions for a few common models, dataloaders, schedulers,
2 | and optimizers that are often used in training.
3 | The definitions of these objects are provided in the form of lazy instantiation:
4 | their arguments can be edited by users before constructing the objects.
5 |
6 | They can be imported, or loaded by `model_zoo.get_config` API in users' own configs.
7 |
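For illustration, loading and customizing these configs through the model_zoo.get_config API might look like the sketch below (the override value is arbitrary, and instantiation assumes the COCO datasets are already registered locally):

    from detectron2.config import instantiate
    from detectron2.model_zoo import get_config

    # Load lazily-defined pieces from this directory.
    train = get_config("common/train.py").train
    dataloader = get_config("common/data/coco.py").dataloader

    # Arguments remain plain config fields until instantiation.
    dataloader.train.total_batch_size = 8          # example override
    train_loader = instantiate(dataloader.train)   # objects are only built here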
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/coco_schedule.py:
--------------------------------------------------------------------------------
1 | from fvcore.common.param_scheduler import MultiStepParamScheduler
2 |
3 | from detectron2.config import LazyCall as L
4 | from detectron2.solver import WarmupParamScheduler
5 |
6 |
7 | def default_X_scheduler(num_X):
8 | """
9 | Returns the config for a default multi-step LR scheduler such as "1x", "3x",
10 | commonly referred to in papers, where every 1x has the total length of 1440k
11 | training images (~12 COCO epochs). LR is decayed twice at the end of training
12 | following the strategy defined in "Rethinking ImageNet Pretraining", Sec 4.
13 |
14 | Args:
15 | num_X: a positive real number
16 |
17 | Returns:
18 | DictConfig: configs that define the multiplier for LR during training
19 | """
20 | # total number of iterations assuming 16 batch size, using 1440000/16=90000
21 | total_steps_16bs = num_X * 90000
22 |
23 | if num_X <= 2:
24 | scheduler = L(MultiStepParamScheduler)(
25 | values=[1.0, 0.1, 0.01],
26 | # note that scheduler is scale-invariant. This is equivalent to
27 | # milestones=[6, 8, 9]
28 | milestones=[60000, 80000, 90000],
29 | )
30 | else:
31 | scheduler = L(MultiStepParamScheduler)(
32 | values=[1.0, 0.1, 0.01],
33 | milestones=[total_steps_16bs - 60000, total_steps_16bs - 20000, total_steps_16bs],
34 | )
35 | return L(WarmupParamScheduler)(
36 | scheduler=scheduler,
37 | warmup_length=1000 / total_steps_16bs,
38 | warmup_method="linear",
39 | warmup_factor=0.001,
40 | )
41 |
42 |
43 | lr_multiplier_1x = default_X_scheduler(1)
44 | lr_multiplier_2x = default_X_scheduler(2)
45 | lr_multiplier_3x = default_X_scheduler(3)
46 | lr_multiplier_6x = default_X_scheduler(6)
47 | lr_multiplier_9x = default_X_scheduler(9)
48 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/data/coco.py:
--------------------------------------------------------------------------------
1 | from omegaconf import OmegaConf
2 |
3 | import detectron2.data.transforms as T
4 | from detectron2.config import LazyCall as L
5 | from detectron2.data import (
6 | DatasetMapper,
7 | build_detection_test_loader,
8 | build_detection_train_loader,
9 | get_detection_dataset_dicts,
10 | )
11 | from detectron2.evaluation import COCOEvaluator
12 |
13 | dataloader = OmegaConf.create()
14 |
15 | dataloader.train = L(build_detection_train_loader)(
16 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"),
17 | mapper=L(DatasetMapper)(
18 | is_train=True,
19 | augmentations=[
20 | L(T.ResizeShortestEdge)(
21 | short_edge_length=(640, 672, 704, 736, 768, 800),
22 | sample_style="choice",
23 | max_size=1333,
24 | ),
25 | L(T.RandomFlip)(horizontal=True),
26 | ],
27 | image_format="BGR",
28 | use_instance_mask=True,
29 | ),
30 | total_batch_size=16,
31 | num_workers=4,
32 | )
33 |
34 | dataloader.test = L(build_detection_test_loader)(
35 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False),
36 | mapper=L(DatasetMapper)(
37 | is_train=False,
38 | augmentations=[
39 | L(T.ResizeShortestEdge)(short_edge_length=800, max_size=1333),
40 | ],
41 | image_format="${...train.mapper.image_format}",
42 | ),
43 | num_workers=4,
44 | )
45 |
46 | dataloader.evaluator = L(COCOEvaluator)(
47 | dataset_name="${..test.dataset.names}",
48 | )
49 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/data/coco_keypoint.py:
--------------------------------------------------------------------------------
1 | from detectron2.data.detection_utils import create_keypoint_hflip_indices
2 |
3 | from .coco import dataloader
4 |
5 | dataloader.train.dataset.min_keypoints = 1
6 | dataloader.train.dataset.names = "keypoints_coco_2017_train"
7 | dataloader.test.dataset.names = "keypoints_coco_2017_val"
8 |
9 | dataloader.train.mapper.update(
10 | use_instance_mask=False,
11 | use_keypoint=True,
12 | keypoint_hflip_indices=create_keypoint_hflip_indices(dataloader.train.dataset.names),
13 | )
14 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/data/coco_panoptic_separated.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.evaluation import (
3 | COCOEvaluator,
4 | COCOPanopticEvaluator,
5 | DatasetEvaluators,
6 | SemSegEvaluator,
7 | )
8 |
9 | from .coco import dataloader
10 |
11 | dataloader.train.dataset.names = "coco_2017_train_panoptic_separated"
12 | dataloader.train.dataset.filter_empty = False
13 | dataloader.test.dataset.names = "coco_2017_val_panoptic_separated"
14 |
15 |
16 | dataloader.evaluator = [
17 | L(COCOEvaluator)(
18 | dataset_name="${...test.dataset.names}",
19 | ),
20 | L(SemSegEvaluator)(
21 | dataset_name="${...test.dataset.names}",
22 | ),
23 | L(COCOPanopticEvaluator)(
24 | dataset_name="${...test.dataset.names}",
25 | ),
26 | ]
27 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/models/cascade_rcnn.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling.box_regression import Box2BoxTransform
4 | from detectron2.modeling.matcher import Matcher
5 | from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads
6 |
7 | from .mask_rcnn_fpn import model
8 |
9 | # arguments that don't exist for Cascade R-CNN
10 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]]
11 |
12 | model.roi_heads.update(
13 | _target_=CascadeROIHeads,
14 | box_heads=[
15 | L(FastRCNNConvFCHead)(
16 | input_shape=ShapeSpec(channels=256, height=7, width=7),
17 | conv_dims=[],
18 | fc_dims=[1024, 1024],
19 | )
20 | for k in range(3)
21 | ],
22 | box_predictors=[
23 | L(FastRCNNOutputLayers)(
24 | input_shape=ShapeSpec(channels=1024),
25 | test_score_thresh=0.05,
26 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
27 | cls_agnostic_bbox_reg=True,
28 | num_classes="${...num_classes}",
29 | )
30 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
31 | ],
32 | proposal_matchers=[
33 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
34 | for th in [0.5, 0.6, 0.7]
35 | ],
36 | )
37 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/models/keypoint_rcnn_fpn.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling.poolers import ROIPooler
4 | from detectron2.modeling.roi_heads import KRCNNConvDeconvUpsampleHead
5 |
6 | from .mask_rcnn_fpn import model
7 |
8 | [model.roi_heads.pop(x) for x in ["mask_in_features", "mask_pooler", "mask_head"]]
9 |
10 | model.roi_heads.update(
11 | num_classes=1,
12 | keypoint_in_features=["p2", "p3", "p4", "p5"],
13 | keypoint_pooler=L(ROIPooler)(
14 | output_size=14,
15 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
16 | sampling_ratio=0,
17 | pooler_type="ROIAlignV2",
18 | ),
19 | keypoint_head=L(KRCNNConvDeconvUpsampleHead)(
20 | input_shape=ShapeSpec(channels=256, width=14, height=14),
21 | num_keypoints=17,
22 | conv_dims=[512] * 8,
23 | loss_normalizer="visible",
24 | ),
25 | )
26 |
27 | # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2.
28 | # 1000 proposals per-image is found to hurt box AP.
29 | # Therefore we increase it to 1500 per-image.
30 | model.proposal_generator.post_nms_topk = (1500, 1000)
31 |
32 | # Keypoint AP degrades (though box AP improves) when using plain L1 loss
33 | model.roi_heads.box_predictor.smooth_l1_beta = 0.5
34 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/models/mask_rcnn_c4.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling.meta_arch import GeneralizedRCNN
4 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
5 | from detectron2.modeling.backbone import BasicStem, BottleneckBlock, ResNet
6 | from detectron2.modeling.box_regression import Box2BoxTransform
7 | from detectron2.modeling.matcher import Matcher
8 | from detectron2.modeling.poolers import ROIPooler
9 | from detectron2.modeling.proposal_generator import RPN, StandardRPNHead
10 | from detectron2.modeling.roi_heads import (
11 | FastRCNNOutputLayers,
12 | MaskRCNNConvUpsampleHead,
13 | Res5ROIHeads,
14 | )
15 |
16 | model = L(GeneralizedRCNN)(
17 | backbone=L(ResNet)(
18 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
19 | stages=L(ResNet.make_default_stages)(
20 | depth=50,
21 | stride_in_1x1=True,
22 | norm="FrozenBN",
23 | ),
24 | out_features=["res4"],
25 | ),
26 | proposal_generator=L(RPN)(
27 | in_features=["res4"],
28 | head=L(StandardRPNHead)(in_channels=1024, num_anchors=15),
29 | anchor_generator=L(DefaultAnchorGenerator)(
30 | sizes=[[32, 64, 128, 256, 512]],
31 | aspect_ratios=[0.5, 1.0, 2.0],
32 | strides=[16],
33 | offset=0.0,
34 | ),
35 | anchor_matcher=L(Matcher)(
36 | thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True
37 | ),
38 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
39 | batch_size_per_image=256,
40 | positive_fraction=0.5,
41 | pre_nms_topk=(12000, 6000),
42 | post_nms_topk=(2000, 1000),
43 | nms_thresh=0.7,
44 | ),
45 | roi_heads=L(Res5ROIHeads)(
46 | num_classes=80,
47 | batch_size_per_image=512,
48 | positive_fraction=0.25,
49 | proposal_matcher=L(Matcher)(
50 | thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False
51 | ),
52 | in_features=["res4"],
53 | pooler=L(ROIPooler)(
54 | output_size=14,
55 | scales=(1.0 / 16,),
56 | sampling_ratio=0,
57 | pooler_type="ROIAlignV2",
58 | ),
59 | res5=L(ResNet.make_stage)(
60 | block_class=BottleneckBlock,
61 | num_blocks=3,
62 | stride_per_block=[2, 1, 1],
63 | in_channels=1024,
64 | bottleneck_channels=512,
65 | out_channels=2048,
66 | norm="FrozenBN",
67 | stride_in_1x1=True,
68 | ),
69 | box_predictor=L(FastRCNNOutputLayers)(
70 | input_shape=L(ShapeSpec)(channels="${...res5.out_channels}", height=1, width=1),
71 | test_score_thresh=0.05,
72 | box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)),
73 | num_classes="${..num_classes}",
74 | ),
75 | mask_head=L(MaskRCNNConvUpsampleHead)(
76 | input_shape=L(ShapeSpec)(
77 | channels="${...res5.out_channels}",
78 | width="${...pooler.output_size}",
79 | height="${...pooler.output_size}",
80 | ),
81 | num_classes="${..num_classes}",
82 | conv_dims=[256],
83 | ),
84 | ),
85 | pixel_mean=[103.530, 116.280, 123.675],
86 | pixel_std=[1.0, 1.0, 1.0],
87 | input_format="BGR",
88 | )
89 |
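Strings such as "${..num_classes}" and "${...res5.out_channels}" are OmegaConf relative interpolations: one leading dot refers to the node containing the value and each extra dot walks one level up, so they resolve to roi_heads.num_classes and roi_heads.res5.out_channels when the config is accessed. A minimal sketch (not part of the repo), assuming the file is loaded with LazyConfig:

# Sketch only: relative interpolations resolve lazily on access.
from detectron2.config import LazyConfig

cfg = LazyConfig.load("sas_det/configs/regionclip/common/models/mask_rcnn_c4.py")
print(cfg.model.roi_heads.box_predictor.num_classes)           # 80, via "${..num_classes}"
print(cfg.model.roi_heads.box_predictor.input_shape.channels)  # 2048, via "${...res5.out_channels}"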
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/models/mask_rcnn_fpn.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling.meta_arch import GeneralizedRCNN
4 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
5 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool
6 | from detectron2.modeling.backbone import BasicStem, FPN, ResNet
7 | from detectron2.modeling.box_regression import Box2BoxTransform
8 | from detectron2.modeling.matcher import Matcher
9 | from detectron2.modeling.poolers import ROIPooler
10 | from detectron2.modeling.proposal_generator import RPN, StandardRPNHead
11 | from detectron2.modeling.roi_heads import (
12 | StandardROIHeads,
13 | FastRCNNOutputLayers,
14 | MaskRCNNConvUpsampleHead,
15 | FastRCNNConvFCHead,
16 | )
17 |
18 | model = L(GeneralizedRCNN)(
19 | backbone=L(FPN)(
20 | bottom_up=L(ResNet)(
21 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
22 | stages=L(ResNet.make_default_stages)(
23 | depth=50,
24 | stride_in_1x1=True,
25 | norm="FrozenBN",
26 | ),
27 | out_features=["res2", "res3", "res4", "res5"],
28 | ),
29 | in_features="${.bottom_up.out_features}",
30 | out_channels=256,
31 | top_block=L(LastLevelMaxPool)(),
32 | ),
33 | proposal_generator=L(RPN)(
34 | in_features=["p2", "p3", "p4", "p5", "p6"],
35 | head=L(StandardRPNHead)(in_channels=256, num_anchors=3),
36 | anchor_generator=L(DefaultAnchorGenerator)(
37 | sizes=[[32], [64], [128], [256], [512]],
38 | aspect_ratios=[0.5, 1.0, 2.0],
39 | strides=[4, 8, 16, 32, 64],
40 | offset=0.0,
41 | ),
42 | anchor_matcher=L(Matcher)(
43 | thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True
44 | ),
45 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
46 | batch_size_per_image=256,
47 | positive_fraction=0.5,
48 | pre_nms_topk=(2000, 1000),
49 | post_nms_topk=(1000, 1000),
50 | nms_thresh=0.7,
51 | ),
52 | roi_heads=L(StandardROIHeads)(
53 | num_classes=80,
54 | batch_size_per_image=512,
55 | positive_fraction=0.25,
56 | proposal_matcher=L(Matcher)(
57 | thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False
58 | ),
59 | box_in_features=["p2", "p3", "p4", "p5"],
60 | box_pooler=L(ROIPooler)(
61 | output_size=7,
62 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
63 | sampling_ratio=0,
64 | pooler_type="ROIAlignV2",
65 | ),
66 | box_head=L(FastRCNNConvFCHead)(
67 | input_shape=ShapeSpec(channels=256, height=7, width=7),
68 | conv_dims=[],
69 | fc_dims=[1024, 1024],
70 | ),
71 | box_predictor=L(FastRCNNOutputLayers)(
72 | input_shape=ShapeSpec(channels=1024),
73 | test_score_thresh=0.05,
74 | box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)),
75 | num_classes="${..num_classes}",
76 | ),
77 | mask_in_features=["p2", "p3", "p4", "p5"],
78 | mask_pooler=L(ROIPooler)(
79 | output_size=14,
80 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
81 | sampling_ratio=0,
82 | pooler_type="ROIAlignV2",
83 | ),
84 | mask_head=L(MaskRCNNConvUpsampleHead)(
85 | input_shape=ShapeSpec(channels=256, width=14, height=14),
86 | num_classes="${..num_classes}",
87 | conv_dims=[256, 256, 256, 256, 256],
88 | ),
89 | ),
90 | pixel_mean=[103.530, 116.280, 123.675],
91 | pixel_std=[1.0, 1.0, 1.0],
92 | input_format="BGR",
93 | )
94 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/models/panoptic_fpn.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import LazyCall as L
2 | from detectron2.layers import ShapeSpec
3 | from detectron2.modeling import PanopticFPN
4 | from detectron2.modeling.meta_arch.semantic_seg import SemSegFPNHead
5 |
6 | from .mask_rcnn_fpn import model
7 |
8 | model._target_ = PanopticFPN
9 | model.sem_seg_head = L(SemSegFPNHead)(
10 | input_shape={
11 | f: L(ShapeSpec)(stride=s, channels="${....backbone.out_channels}")
12 | for f, s in zip(["p2", "p3", "p4", "p5"], [4, 8, 16, 32])
13 | },
14 | ignore_value=255,
15 | num_classes=54, # COCO stuff + 1
16 | conv_dims=128,
17 | common_stride=4,
18 | loss_weight=0.5,
19 | norm="GN",
20 | )
21 |
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/models/retinanet.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from detectron2.config import LazyCall as L
4 | from detectron2.layers import ShapeSpec
5 | from detectron2.modeling.meta_arch import RetinaNet
6 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
7 | from detectron2.modeling.backbone.fpn import LastLevelP6P7
8 | from detectron2.modeling.backbone import BasicStem, FPN, ResNet
9 | from detectron2.modeling.box_regression import Box2BoxTransform
10 | from detectron2.modeling.matcher import Matcher
11 | from detectron2.modeling.meta_arch.retinanet import RetinaNetHead
12 |
13 | model = L(RetinaNet)(
14 | backbone=L(FPN)(
15 | bottom_up=L(ResNet)(
16 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
17 | stages=L(ResNet.make_default_stages)(
18 | depth=50,
19 | stride_in_1x1=True,
20 | norm="FrozenBN",
21 | ),
22 | out_features=["res3", "res4", "res5"],
23 | ),
24 | in_features=["res3", "res4", "res5"],
25 | out_channels=256,
26 | top_block=L(LastLevelP6P7)(in_channels=2048, out_channels="${..out_channels}"),
27 | ),
28 | head=L(RetinaNetHead)(
29 | input_shape=[ShapeSpec(channels=256)],
30 | num_classes="${..num_classes}",
31 | conv_dims=[256, 256, 256, 256],
32 | prior_prob=0.01,
33 | num_anchors=9,
34 | ),
35 | anchor_generator=L(DefaultAnchorGenerator)(
36 | sizes=[[x, x * 2 ** (1.0 / 3), x * 2 ** (2.0 / 3)] for x in [32, 64, 128, 256, 512]],
37 | aspect_ratios=[0.5, 1.0, 2.0],
38 | strides=[8, 16, 32, 64, 128],
39 | offset=0.0,
40 | ),
41 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
42 | anchor_matcher=L(Matcher)(
43 | thresholds=[0.4, 0.5], labels=[0, -1, 1], allow_low_quality_matches=True
44 | ),
45 | num_classes=80,
46 | head_in_features=["p3", "p4", "p5", "p6", "p7"],
47 | focal_loss_alpha=0.25,
48 | focal_loss_gamma=2.0,
49 | pixel_mean=[103.530, 116.280, 123.675],
50 | pixel_std=[1.0, 1.0, 1.0],
51 | input_format="BGR",
52 | )
53 |
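The anchor sizes expression above generates three scales per pyramid level, spaced a third of an octave apart (the base size times 2^0, 2^(1/3), 2^(2/3)). A short sketch (not part of the repo) expanding the values it yields:

# Sketch only: anchor sizes produced by the list comprehension above.
sizes = [[x, x * 2 ** (1.0 / 3), x * 2 ** (2.0 / 3)] for x in [32, 64, 128, 256, 512]]
for stride, per_level in zip([8, 16, 32, 64, 128], sizes):
    print(stride, [round(s, 1) for s in per_level])
# stride 8 -> [32, 40.3, 50.8], stride 16 -> [64, 80.6, 101.6], and so on up to stride 128.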
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/optim.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from detectron2.config import LazyCall as L
4 | from detectron2.solver.build import get_default_optimizer_params
5 |
6 | SGD = L(torch.optim.SGD)(
7 | params=L(get_default_optimizer_params)(
8 | # params.model is meant to be set to the model object, before instantiating
9 | # the optimizer.
10 | weight_decay_norm=0.0
11 | ),
12 | lr=0.02,
13 | momentum=0.9,
14 | weight_decay=1e-4,
15 | )
16 |
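As the inline comment says, params.model has to point at the model before the optimizer is instantiated; get_default_optimizer_params then builds the parameter groups from it. A minimal sketch (not part of the repo), assuming the standard LazyConfig/instantiate workflow:

# Sketch only: wire a model into the SGD config above, then instantiate it.
from detectron2.config import LazyConfig, instantiate

model = instantiate(LazyConfig.load("sas_det/configs/regionclip/common/models/mask_rcnn_c4.py").model)
optim_cfg = LazyConfig.load("sas_det/configs/regionclip/common/optim.py").SGD
optim_cfg.params.model = model      # consumed by get_default_optimizer_params
optimizer = instantiate(optim_cfg)  # torch.optim.SGD over the model's parameter groups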
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/common/train.py:
--------------------------------------------------------------------------------
1 | # Common training-related configs that are designed for "tools/lazyconfig_train_net.py"
2 | # You can use your own instead, together with your own train_net.py
3 | train = dict(
4 | output_dir="./output",
5 | init_checkpoint="detectron2://ImageNetPretrained/MSRA/R-50.pkl",
6 | bb_rpn_checkpoint="",
7 | max_iter=90000,
8 | amp=dict(enabled=False), # options for Automatic Mixed Precision
9 | ddp=dict( # options for DistributedDataParallel
10 | broadcast_buffers=False,
11 | find_unused_parameters=False,
12 | fp16_compression=False,
13 | ),
14 | checkpointer=dict(period=5000, max_to_keep=100), # options for PeriodicCheckpointer
15 | eval_period=5000,
16 | log_period=20,
17 | device="cuda"
18 | # ...
19 | )
20 |
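These keys are read by the lazyconfig-style training script; individual entries can be overridden at launch time. A short sketch (not part of the repo), assuming detectron2's LazyConfig override helper:

# Sketch only: load the train dict above and override entries.
from detectron2.config import LazyConfig

cfg = LazyConfig.load("sas_det/configs/regionclip/common/train.py")
cfg = LazyConfig.apply_overrides(cfg, ["train.max_iter=180000", "train.output_dir=./output/run1"])
print(cfg.train.max_iter, cfg.train.output_dir)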
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/pretrain/RegionCLIP_RN50.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "PretrainFastRCNN"
4 | BACKBONE:
5 | NAME: "build_clip_resnet_backbone"
6 | FREEZE_AT: 2
7 | WEIGHTS: ""
8 | MASK_ON: False
9 | RESNETS:
10 | DEPTH: 50
11 | OUT_FEATURES: ["res4"]
12 | NORM: FrozenBN
13 | STEM_OUT_CHANNELS: 64
14 | RES2_OUT_CHANNELS: 256
15 | ROI_HEADS:
16 | NAME: "PretrainRes5ROIHeads"
17 | IN_FEATURES: ["res4"]
18 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
19 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
20 | CLIP:
21 | CLSS_TEMP: 0.01
22 | CROP_REGION_TYPE: "RPN"
23 | OFFLINE_RPN_NMS_THRESH: 0.5
24 | GATHER_GPUS: True
25 | CONCEPT_THRES: 0.1
26 | PRETRAIN_RPN_REGIONS: 300
27 | PRETRAIN_SAMPLE_REGIONS: 100
28 | PRETRAIN_IMG_TXT_LEVEL: True
29 | PRETRAIN_ONLY_EOT: True
30 | TEACHER_RESNETS_DEPTH: 50
31 | TEACHER_POOLER_RESOLUTION: 14
32 | DATASETS:
33 | TRAIN: ("imgtxtpairs",)
34 | FACTORY_TRAIN: ("CLIPImgTxtPairTSVDataset",)
35 | PATH_TRAIN: ("./datasets/coco/val2017",) # ("/tmp/datasets/CC3M",)
36 | TEST: ()
37 | DATALOADER:
38 | ASPECT_RATIO_GROUPING: False
39 | NUM_WORKERS: 4
40 | TEST:
41 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
42 | EVAL_PERIOD: 2500000
43 | SOLVER:
44 | IMS_PER_BATCH: 96 # 32 gpus
45 | BASE_LR: 0.002
46 | WEIGHT_DECAY: 0.0001
47 | STEPS: (300000, 525000)
48 | MAX_ITER: 600000
49 | CLIP_GRADIENTS:
50 | ENABLED: True
51 | CLIP_TYPE: "norm"
52 | CLIP_VALUE: 5.0
53 | INPUT:
54 | MIN_SIZE_TRAIN_SAMPLING: choice
55 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
56 | MAX_SIZE_TRAIN: 1333
57 | MIN_SIZE_TEST: 800
58 | MAX_SIZE_TEST: 1333
59 | FORMAT: "RGB"
60 | AUG: # Data Augmentation from MSR-CLIP
61 | TRAIN:
62 | IMAGE_SIZE: [800,]
63 | MAX_SIZE: 1333
64 | TEST:
65 | IMAGE_SIZE: [800,]
66 | MAX_SIZE: 1333
67 | INTERPOLATION: 3
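This pretraining config is yacs-style YAML with project-specific keys (MODEL.CLIP.*, DATASETS.FACTORY_TRAIN, ...) that are not in detectron2's default schema, so it cannot be merged into a vanilla get_cfg() node without first extending the defaults (see sas_det/config.py). The _BASE_ chain can still be inspected on its own; a minimal sketch, not part of the repo, assuming the fvcore-style load_yaml_with_base helper:

# Sketch only: resolve the _BASE_ chain of this YAML without the extended default config.
from detectron2.config import CfgNode

merged = CfgNode.load_yaml_with_base(
    "sas_det/configs/regionclip/pretrain/RegionCLIP_RN50.yaml", allow_unsafe=True
)
print(merged["MODEL"]["META_ARCHITECTURE"])  # PretrainFastRCNN
print(merged["MODEL"]["CLIP"]["CLSS_TEMP"])  # 0.01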
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./RegionCLIP_RN50.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "WeakPretrainFastRCNN"
4 | BACKBONE:
5 | NAME: "build_clip_resnet_backbone_from_pretrain"
6 | FREEZE_AT: 2
7 | CLIP:
8 | CROP_REGION_TYPE: "RPN"
9 | OFFLINE_RPN_NMS_THRESH: 0.3 # will affect the eval performance
10 | # GATHER_GPUS: True
11 | PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST
12 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection
13 | # for ZS inference
14 | NO_BOX_DELTA: True # no box refinement
15 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth"
16 | USE_TEXT_EMB_CLASSIFIER: True
17 | MULTIPLY_RPN_SCORE: True
18 | WEAK_LOSS:
19 | WEAK_LOSS_WEIGHT: 0.01
20 | IMAGE_LOSS_WEIGHT: 0.1
21 | BOX_SELECT_THRES: 0.97 # threshold in box selection
22 | # for ZS inference
23 | ROI_HEADS:
24 | NAME: "CLIPRes5ROIHeads" # pretrain roi head
25 | IN_FEATURES: ["res4"]
26 | NUM_CLASSES: 1 # box-only roi head, used in pretraining to set up self.cls_score
27 | # for ZS inference
28 | NMS_THRESH_TEST: 0.5
29 | # for ZS inference
30 | ROI_BOX_HEAD:
31 | NAME: ""
32 | NUM_FC: 0
33 | # POOLER_RESOLUTION: 14
34 | CLS_AGNOSTIC_BBOX_REG: True
35 | DATASETS:
36 | # TRAIN: ("coco_zeroshot_train_del", "coco_caption_nouns_train_4764tags",)
37 | TRAIN: ("coco_caption_nouns_train_4764tags",)
38 | TEST: ("coco_generalized_del_val",)
39 | INPUT:
40 | CUSTOM_AUG: ResizeShortestEdge
41 | MIN_SIZE_TRAIN_SAMPLING: range
42 | MIN_SIZE_TRAIN: (400, 400)
43 | MAX_SIZE_TRAIN: 667
44 | DATALOADER:
45 | # SAMPLER_TRAIN: "MultiDatasetSampler"
46 | # DATASET_RATIO: [1, 4]
47 | # USE_DIFF_BS_SIZE: True # if use build_custom_augmentation
48 | # DATASET_BS: [2, 8]
49 | # USE_RFS: [False, False]
50 | # DATASET_MIN_SIZES: [[800, 800], [400, 400]]
51 | # DATASET_MAX_SIZES: [1333, 667]
52 | # DATASET_MIN_SIZES: [[800, 800], [400, 400]]
53 | # DATASET_MAX_SIZES: [1333, 667]
54 | FILTER_EMPTY_ANNOTATIONS: False
55 | DATASET_ANN: ['caption',]
56 | # MULTI_DATASET_GROUPING: True
57 | # DATASET_ANN: ['box', 'caption']
58 | # NUM_WORKERS: 8
59 | TEST:
60 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300
61 | EVAL_PERIOD: 10000
62 | SOLVER:
63 | IMS_PER_BATCH: 96 # 32 gpus
64 | BASE_LR: 0.002
65 | WEIGHT_DECAY: 0.0001
66 | STEPS: (60000, 80000)
67 | MAX_ITER: 90000
68 | CHECKPOINT_PERIOD: 20000
69 | CLIP_GRADIENTS:
70 | ENABLED: True
71 | CLIP_TYPE: "norm"
72 | CLIP_VALUE: 5.0
73 | FIND_UNUSED_PARAM: True
74 | WITH_IMAGE_LABELS: True # load image tags
75 | OUTPUT_DIR: output/r50_onlinePL_pre
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL_box_weak.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./RegionCLIP_RN50.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "WeakPretrainFastRCNN"
4 | IGNORE_CLS_LOSS: True # disable weak loss
5 | BACKBONE:
6 | NAME: "build_clip_resnet_backbone_from_pretrain"
7 | FREEZE_AT: 2
8 | ROI_HEADS:
9 | NAME: "CLIPRes5ROIHeads" # pretrain roi head
10 | IN_FEATURES: ["res4"]
11 | NUM_CLASSES: 1 # box-only roi head, used in pretraining to set up self.cls_score
12 | # for ZS inference
13 | NMS_THRESH_TEST: 0.5
14 | # for ZS inference
15 | ROI_BOX_HEAD:
16 | NAME: ""
17 | NUM_FC: 0
18 | # POOLER_RESOLUTION: 14
19 | CLS_AGNOSTIC_BBOX_REG: True
20 | CLIP:
21 | CROP_REGION_TYPE: "RPN"
22 | OFFLINE_RPN_NMS_THRESH: 0.7 # will affect the eval performance
23 | # GATHER_GPUS: True
24 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST
25 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection
26 | # for ZS inference
27 | NO_BOX_DELTA: False # pretrain roi head
28 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth"
29 | USE_TEXT_EMB_CLASSIFIER: True
30 | MULTIPLY_RPN_SCORE: True
31 | WEAK_LOSS:
32 | WEAK_LOSS_WEIGHT: 0.01
33 | IMAGE_LOSS_WEIGHT: 0.1
34 | BOX_SELECT_THRES: 0.97 # threshold in box selection
35 | DATASETS:
36 | TRAIN: ("lvis_v1_train_base_box_only", "coco_caption_nouns_train_4764tags",)
37 | TEST: ("coco_generalized_del_val",)
38 | INPUT:
39 | CUSTOM_AUG: ResizeShortestEdge
40 | MIN_SIZE_TRAIN_SAMPLING: range
41 | MIN_SIZE_TRAIN: (800, 800)
42 | DATALOADER:
43 | SAMPLER_TRAIN: "MultiDatasetSampler"
44 | DATASET_RATIO: [1, 8]
45 | USE_DIFF_BS_SIZE: True
46 | DATASET_BS: [2, 16]
47 | USE_RFS: [False, False]
48 | DATASET_MIN_SIZES: [[800, 800], [400, 400]]
49 | DATASET_MAX_SIZES: [1333, 667]
50 | FILTER_EMPTY_ANNOTATIONS: False
51 | MULTI_DATASET_GROUPING: True
52 | DATASET_ANN: ['box', 'caption']
53 | NUM_WORKERS: 8
54 | TEST:
55 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300
56 | EVAL_PERIOD: 10000
57 | SOLVER:
58 | IMS_PER_BATCH: 96 # 32 gpus
59 | BASE_LR: 0.002
60 | WEIGHT_DECAY: 0.0001
61 | STEPS: (60000, 80000)
62 | MAX_ITER: 90000
63 | CHECKPOINT_PERIOD: 20000
64 | CLIP_GRADIENTS:
65 | ENABLED: True
66 | CLIP_TYPE: "norm"
67 | CLIP_VALUE: 5.0
68 | FIND_UNUSED_PARAM: True
69 | WITH_IMAGE_LABELS: True # load image tags
70 | OUTPUT_DIR: output/r50_pre_onlinePL_box_weak
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL_box_weak_cc3m.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./RegionCLIP_RN50.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "WeakPretrainFastRCNN"
4 | IGNORE_CLS_LOSS: True # disable weak loss
5 | BACKBONE:
6 | NAME: "build_clip_resnet_backbone_from_pretrain"
7 | FREEZE_AT: 2
8 | ROI_HEADS:
9 | NAME: "CLIPRes5ROIHeads" # pretrain roi head
10 | IN_FEATURES: ["res4"]
11 | NUM_CLASSES: 1 # box-only roi head, used in pretraining to set up self.cls_score
12 | # for ZS inference
13 | NMS_THRESH_TEST: 0.5
14 | # for ZS inference
15 | ROI_BOX_HEAD:
16 | NAME: ""
17 | NUM_FC: 0
18 | # POOLER_RESOLUTION: 14
19 | CLS_AGNOSTIC_BBOX_REG: True
20 | CLIP:
21 | CROP_REGION_TYPE: "RPN"
22 | OFFLINE_RPN_NMS_THRESH: 0.7 # will affect the eval performance
23 | # GATHER_GPUS: True
24 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST
25 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection
26 | # for ZS inference
27 | NO_BOX_DELTA: False # pretrain roi head
28 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth"
29 | USE_TEXT_EMB_CLASSIFIER: True
30 | MULTIPLY_RPN_SCORE: True
31 | WEAK_LOSS:
32 | WEAK_LOSS_WEIGHT: 0.01
33 | IMAGE_LOSS_WEIGHT: 0.1
34 | BOX_SELECT_THRES: 0.97 # threshold in box selection
35 | DATASETS:
36 | TRAIN: ("lvis_v1_train_base_box_only", "cc3m_v1_nouns_train_4764tags",)
37 | TEST: ("coco_generalized_del_val",)
38 | INPUT:
39 | CUSTOM_AUG: ResizeShortestEdge
40 | MIN_SIZE_TRAIN_SAMPLING: range
41 | MIN_SIZE_TRAIN: (800, 800)
42 | DATALOADER:
43 | SAMPLER_TRAIN: "MultiDatasetSampler"
44 | DATASET_RATIO: [1, 8]
45 | USE_DIFF_BS_SIZE: True
46 | DATASET_BS: [2, 16]
47 | USE_RFS: [False, False]
48 | DATASET_MIN_SIZES: [[800, 800], [400, 400]]
49 | DATASET_MAX_SIZES: [1333, 667]
50 | FILTER_EMPTY_ANNOTATIONS: False
51 | MULTI_DATASET_GROUPING: True
52 | DATASET_ANN: ['box', 'caption']
53 | NUM_WORKERS: 8
54 | TEST:
55 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300
56 | EVAL_PERIOD: 10000
57 | SOLVER:
58 | IMS_PER_BATCH: 96 # 32 gpus
59 | BASE_LR: 0.002
60 | WEIGHT_DECAY: 0.0001
61 | STEPS: (240000, 320000)
62 | MAX_ITER: 360000
63 | CHECKPOINT_PERIOD: 40000
64 | CLIP_GRADIENTS:
65 | ENABLED: True
66 | CLIP_TYPE: "norm"
67 | CLIP_VALUE: 5.0
68 | FIND_UNUSED_PARAM: True
69 | WITH_IMAGE_LABELS: True # load image tags
70 | OUTPUT_DIR: output/r50_pre_onlinePL_box_emaWeak_cc3m
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL_box_weak_locNarr.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./RegionCLIP_RN50.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "WeakPretrainFastRCNN"
4 | IGNORE_CLS_LOSS: True # disable weak loss
5 | BACKBONE:
6 | NAME: "build_clip_resnet_backbone_from_pretrain"
7 | FREEZE_AT: 2
8 | ROI_HEADS:
9 | NAME: "CLIPRes5ROIHeads" # pretrain roi head
10 | IN_FEATURES: ["res4"]
11 | NUM_CLASSES: 1 # box-only roi head, used in pretraining to set up self.cls_score
12 | # for ZS inference
13 | NMS_THRESH_TEST: 0.5
14 | # for ZS inference
15 | ROI_BOX_HEAD:
16 | NAME: ""
17 | NUM_FC: 0
18 | # POOLER_RESOLUTION: 14
19 | CLS_AGNOSTIC_BBOX_REG: True
20 | CLIP:
21 | CROP_REGION_TYPE: "RPN"
22 | OFFLINE_RPN_NMS_THRESH: 0.7 # will affect the eval performance
23 | # GATHER_GPUS: True
24 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST
25 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection
26 | # for ZS inference
27 | NO_BOX_DELTA: False # pretrain roi head
28 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth"
29 | USE_TEXT_EMB_CLASSIFIER: True
30 | MULTIPLY_RPN_SCORE: True
31 | WEAK_LOSS:
32 | WEAK_LOSS_WEIGHT: 0.01
33 | IMAGE_LOSS_WEIGHT: 0.1
34 | BOX_SELECT_THRES: 0.97 # threshold in box selection
35 | DATASETS:
36 | TRAIN: ("lvis_v1_train_base_box_only", "loc_narr_nouns_train_4764tags",)
37 | TEST: ("coco_generalized_del_val",)
38 | INPUT:
39 | CUSTOM_AUG: ResizeShortestEdge
40 | MIN_SIZE_TRAIN_SAMPLING: range
41 | MIN_SIZE_TRAIN: (800, 800)
42 | DATALOADER:
43 | SAMPLER_TRAIN: "MultiDatasetSampler"
44 | DATASET_RATIO: [1, 8]
45 | USE_DIFF_BS_SIZE: True
46 | DATASET_BS: [2, 16]
47 | USE_RFS: [False, False]
48 | DATASET_MIN_SIZES: [[800, 800], [400, 400]]
49 | DATASET_MAX_SIZES: [1333, 667]
50 | FILTER_EMPTY_ANNOTATIONS: False
51 | MULTI_DATASET_GROUPING: True
52 | DATASET_ANN: ['box', 'caption']
53 | NUM_WORKERS: 8
54 | TEST:
55 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300
56 | EVAL_PERIOD: 10000
57 | SOLVER:
58 | IMS_PER_BATCH: 96 # 32 gpus
59 | BASE_LR: 0.002
60 | WEIGHT_DECAY: 0.0001
61 | STEPS: (240000, 320000)
62 | MAX_ITER: 360000
63 | CHECKPOINT_PERIOD: 40000
64 | CLIP_GRADIENTS:
65 | ENABLED: True
66 | CLIP_TYPE: "norm"
67 | CLIP_VALUE: 5.0
68 | FIND_UNUSED_PARAM: True
69 | WITH_IMAGE_LABELS: True # load image tags
70 | OUTPUT_DIR: output/r50_pre_onlinePL_box_emaWeak_cc3m
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/pretrain/RegionCLIP_RN50x4.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-C4.yaml"
2 | MODEL:
3 | META_ARCHITECTURE: "PretrainFastRCNN"
4 | BACKBONE:
5 | NAME: "build_clip_resnet_backbone"
6 | FREEZE_AT: 2
7 | WEIGHTS: ""
8 | MASK_ON: False
9 | RESNETS:
10 | DEPTH: 200
11 | OUT_FEATURES: ["res4"]
12 | NORM: FrozenBN
13 | STEM_OUT_CHANNELS: 64
14 | RES2_OUT_CHANNELS: 256
15 | ROI_HEADS:
16 | NAME: "PretrainRes5ROIHeads"
17 | IN_FEATURES: ["res4"]
18 | ROI_BOX_HEAD:
19 | POOLER_RESOLUTION: 18
20 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
21 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
22 | CLIP:
23 | CLSS_TEMP: 0.01
24 | CROP_REGION_TYPE: "RPN"
25 | OFFLINE_RPN_NMS_THRESH: 0.5
26 | GATHER_GPUS: True
27 | CONCEPT_THRES: 0.1
28 | PRETRAIN_RPN_REGIONS: 300
29 | PRETRAIN_SAMPLE_REGIONS: 100
30 | PRETRAIN_IMG_TXT_LEVEL: True
31 | PRETRAIN_ONLY_EOT: True
32 | TEACHER_RESNETS_DEPTH: 200
33 | TEACHER_POOLER_RESOLUTION: 18
34 | # INPUT:
35 | # MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
36 | DATASETS:
37 | TRAIN: ("imgtxtpairs",)
38 | FACTORY_TRAIN: ("CLIPImgTxtPairTSVDataset",)
39 | PATH_TRAIN: ("/home/v-yiwuzhong/projects/azureblobs/vlpdatasets/coco-caption/val2017",) # ("/tmp/datasets/CC3M",)
40 | TEST: ()
41 | DATALOADER:
42 | ASPECT_RATIO_GROUPING: False
43 | NUM_WORKERS: 4
44 | TEST:
45 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
46 | EVAL_PERIOD: 2500000
47 | SOLVER:
48 | IMS_PER_BATCH: 96 # 32 gpus
49 | BASE_LR: 0.002
50 | WEIGHT_DECAY: 0.0001
51 | STEPS: (300000, 525000)
52 | MAX_ITER: 600000
53 | CLIP_GRADIENTS:
54 | ENABLED: True
55 | CLIP_TYPE: "norm"
56 | CLIP_VALUE: 5.0
57 | INPUT:
58 | MIN_SIZE_TRAIN_SAMPLING: choice
59 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
60 | MAX_SIZE_TRAIN: 1333
61 | MIN_SIZE_TEST: 800
62 | MAX_SIZE_TEST: 1333
63 | FORMAT: "RGB"
64 | AUG: # Data Augmentation from MSR-CLIP
65 | TRAIN:
66 | IMAGE_SIZE: [800,]
67 | MAX_SIZE: 1333
68 | TEST:
69 | IMAGE_SIZE: [800,]
70 | MAX_SIZE: 1333
71 | INTERPOLATION: 3
--------------------------------------------------------------------------------
/sas_det/configs/regionclip/pretrain/RegionCLIP_RN50x4_onlinePL_boxWeak.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "./RegionCLIP_RN50_onlinePL_box_weak.yaml"
2 | MODEL:
3 | RESNETS:
4 | DEPTH: 200
5 | ROI_BOX_HEAD:
6 | POOLER_RESOLUTION: 18
7 | CLIP:
8 | TEACHER_RESNETS_DEPTH: 200
9 | TEACHER_POOLER_RESOLUTION: 18
10 | TEXT_EMB_DIM: 640
11 | # TEXT_EMB_PATH: None # for classifier, not used in pretraining if MODEL.IGNORE_CLS_LOSS is True
12 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb_rn50x4.pth" # use emb from r50x4
13 | OUTPUT_DIR: output/r50x4_pre_onlinePL_boxWeak
14 |
15 |
--------------------------------------------------------------------------------
/sas_det/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import ovd_register as _ovd_register # ensure the builtin datasets are registered
3 |
4 | __all__ = [k for k in globals().keys() if not k.startswith("_")]
5 |
--------------------------------------------------------------------------------
/sas_det/data/coco_zeroshot_categories.py:
--------------------------------------------------------------------------------
1 | # COCO categories for zero-shot setting
2 | # 65 categories in total: 48 base categories are used for training, and 17 unseen categories are used only in testing
3 | # from http://ankan.umiacs.io/files/mscoco_seen_classes.json, http://ankan.umiacs.io/files/mscoco_unseen_classes.json
4 |
5 | # 17 class names in order, obtained from load_coco_json() function
6 | COCO_UNSEEN_CLS = ['airplane', 'bus', 'cat', 'dog', 'cow', 'elephant', 'umbrella', \
7 | 'tie', 'snowboard', 'skateboard', 'cup', 'knife', 'cake', 'couch', 'keyboard', \
8 | 'sink', 'scissors']
9 |
10 | # 48 class names in order, obtained from load_coco_json() function
11 | COCO_SEEN_CLS = ['person', 'bicycle', 'car', 'motorcycle', 'train', 'truck', \
12 | 'boat', 'bench', 'bird', 'horse', 'sheep', 'bear', 'zebra', 'giraffe', \
13 | 'backpack', 'handbag', 'suitcase', 'frisbee', 'skis', 'kite', 'surfboard', \
14 | 'bottle', 'fork', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', \
15 | 'broccoli', 'carrot', 'pizza', 'donut', 'chair', 'bed', 'toilet', 'tv', \
16 | 'laptop', 'mouse', 'remote', 'microwave', 'oven', 'toaster', \
17 | 'refrigerator', 'book', 'clock', 'vase', 'toothbrush']
18 |
19 | # 65 class names in order, obtained from load_coco_json() function
20 | COCO_OVD_ALL_CLS = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', \
21 | 'bus', 'train', 'truck', 'boat', 'bench', 'bird', 'cat', 'dog', 'horse', \
22 | 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', \
23 | 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'kite', 'skateboard', \
24 | 'surfboard', 'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', \
25 | 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza', 'donut', 'cake', \
26 | 'chair', 'couch', 'bed', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', \
27 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', \
28 | 'scissors', 'toothbrush']
29 |
30 | # 80 class names
31 | COCO_80_ALL_CLS = {1: 'person',
32 | 2: 'bicycle',
33 | 3: 'car',
34 | 4: 'motorcycle',
35 | 5: 'airplane',
36 | 6: 'bus',
37 | 7: 'train',
38 | 8: 'truck',
39 | 9: 'boat',
40 | 10: 'traffic light',
41 | 11: 'fire hydrant',
42 | 12: 'stop sign',
43 | 13: 'parking meter',
44 | 14: 'bench',
45 | 15: 'bird',
46 | 16: 'cat',
47 | 17: 'dog',
48 | 18: 'horse',
49 | 19: 'sheep',
50 | 20: 'cow',
51 | 21: 'elephant',
52 | 22: 'bear',
53 | 23: 'zebra',
54 | 24: 'giraffe',
55 | 25: 'backpack',
56 | 26: 'umbrella',
57 | 27: 'handbag',
58 | 28: 'tie',
59 | 29: 'suitcase',
60 | 30: 'frisbee',
61 | 31: 'skis',
62 | 32: 'snowboard',
63 | 33: 'sports ball',
64 | 34: 'kite',
65 | 35: 'baseball bat',
66 | 36: 'baseball glove',
67 | 37: 'skateboard',
68 | 38: 'surfboard',
69 | 39: 'tennis racket',
70 | 40: 'bottle',
71 | 41: 'wine glass',
72 | 42: 'cup',
73 | 43: 'fork',
74 | 44: 'knife',
75 | 45: 'spoon',
76 | 46: 'bowl',
77 | 47: 'banana',
78 | 48: 'apple',
79 | 49: 'sandwich',
80 | 50: 'orange',
81 | 51: 'broccoli',
82 | 52: 'carrot',
83 | 53: 'hot dog',
84 | 54: 'pizza',
85 | 55: 'donut',
86 | 56: 'cake',
87 | 57: 'chair',
88 | 58: 'couch',
89 | 59: 'potted plant',
90 | 60: 'bed',
91 | 61: 'dining table',
92 | 62: 'toilet',
93 | 63: 'tv',
94 | 64: 'laptop',
95 | 65: 'mouse',
96 | 66: 'remote',
97 | 67: 'keyboard',
98 | 68: 'cell phone',
99 | 69: 'microwave',
100 | 70: 'oven',
101 | 71: 'toaster',
102 | 72: 'sink',
103 | 73: 'refrigerator',
104 | 74: 'book',
105 | 75: 'clock',
106 | 76: 'vase',
107 | 77: 'scissors',
108 | 78: 'teddy bear',
109 | 79: 'hair drier',
110 | 80: 'toothbrush'}
111 |
112 | if __name__ == "__main__":
113 | # from https://github.com/alirezazareian/ovr-cnn/blob/master/ipynb/001.ipynb
114 | # Create zero-shot setting data split in COCO
115 | import json
116 | import ipdb
117 |
118 | with open('./datasets/coco/annotations/instances_train2017.json', 'r') as fin:
119 | coco_train_anno_all = json.load(fin)
120 |
121 | with open('./datasets/coco/annotations/instances_train2017.json', 'r') as fin:
122 | coco_train_anno_seen = json.load(fin)
123 |
124 | with open('./datasets/coco/annotations/instances_train2017.json', 'r') as fin:
125 | coco_train_anno_unseen = json.load(fin)
126 |
127 | with open('./datasets/coco/annotations/instances_val2017.json', 'r') as fin:
128 | coco_val_anno_all = json.load(fin)
129 |
130 | with open('./datasets/coco/annotations/instances_val2017.json', 'r') as fin:
131 | coco_val_anno_seen = json.load(fin)
132 |
133 | with open('./datasets/coco/annotations/instances_val2017.json', 'r') as fin:
134 | coco_val_anno_unseen = json.load(fin)
135 |
136 | labels_seen = COCO_SEEN_CLS
137 | labels_unseen = COCO_UNSEEN_CLS
138 | labels_all = [item['name'] for item in coco_val_anno_all['categories']] # 80 class names
139 | # len(labels_seen), len(labels_unseen)
140 | # set(labels_seen) - set(labels_all)
141 | # set(labels_unseen) - set(labels_all)
142 |
143 | class_id_to_split = {} # {1: 'seen', 2: 'seen', 3: 'seen', 4: 'seen', 5: 'unseen',...}
144 | class_name_to_split = {} # {'person': 'seen', 'bicycle': 'seen', 'car': 'seen', 'motorcycle': 'seen', 'airplane': 'unseen',...}
145 | for item in coco_val_anno_all['categories']:
146 | if item['name'] in labels_seen:
147 | class_id_to_split[item['id']] = 'seen'
148 | class_name_to_split[item['name']] = 'seen'
149 | elif item['name'] in labels_unseen:
150 | class_id_to_split[item['id']] = 'unseen'
151 | class_name_to_split[item['name']] = 'unseen'
152 |
153 | # class_name_to_emb = {}
154 | # with open('../datasets/coco/zero-shot/glove.6B.300d.txt', 'r') as fin:
155 | # for row in fin:
156 | # row_tk = row.split()
157 | # if row_tk[0] in class_name_to_split:
158 | # class_name_to_emb[row_tk[0]] = [float(num) for num in row_tk[1:]]
159 | # len(class_name_to_emb), len(class_name_to_split)
160 |
161 | def filter_annotation(anno_dict, split_name_list):
162 | """
163 | COCO annotations have fields: dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])
164 | This function (1) filters the category metadata (list) in 'categories';
165 | (2) filters instance annotations in 'annotations'; (3) filters image metadata (list) in 'images'.
166 | """
167 | filtered_categories = []
168 | for item in anno_dict['categories']:
169 | if class_id_to_split.get(item['id']) in split_name_list:
170 | #item['embedding'] = class_name_to_emb[item['name']]
171 | item['split'] = class_id_to_split.get(item['id'])
172 | filtered_categories.append(item)
173 | anno_dict['categories'] = filtered_categories
174 |
175 | filtered_images = []
176 | filtered_annotations = []
177 | useful_image_ids = set()
178 | for item in anno_dict['annotations']:
179 | if class_id_to_split.get(item['category_id']) in split_name_list:
180 | filtered_annotations.append(item)
181 | useful_image_ids.add(item['image_id'])
182 | for item in anno_dict['images']:
183 | if item['id'] in useful_image_ids:
184 | filtered_images.append(item)
185 | anno_dict['annotations'] = filtered_annotations
186 | anno_dict['images'] = filtered_images
187 |
188 | filter_annotation(coco_train_anno_seen, ['seen'])
189 | filter_annotation(coco_train_anno_unseen, ['unseen'])
190 | filter_annotation(coco_train_anno_all, ['seen', 'unseen'])
191 | filter_annotation(coco_val_anno_seen, ['seen'])
192 | filter_annotation(coco_val_anno_unseen, ['unseen'])
193 | filter_annotation(coco_val_anno_all, ['seen', 'unseen'])
194 |
195 | with open('./datasets/coco/annotations/ovd_ins_train2017_b.json', 'w') as fout:
196 | json.dump(coco_train_anno_seen, fout)
197 | with open('./datasets/coco/annotations/ovd_ins_train2017_t.json', 'w') as fout:
198 | json.dump(coco_train_anno_unseen, fout)
199 | with open('./datasets/coco/annotations/ovd_ins_train2017_all.json', 'w') as fout:
200 | json.dump(coco_train_anno_all, fout)
201 | with open('./datasets/coco/annotations/ovd_ins_val2017_b.json', 'w') as fout:
202 | json.dump(coco_val_anno_seen, fout)
203 | with open('./datasets/coco/annotations/ovd_ins_val2017_t.json', 'w') as fout:
204 | json.dump(coco_val_anno_unseen, fout)
205 | with open('./datasets/coco/annotations/ovd_ins_val2017_all.json', 'w') as fout:
206 | json.dump(coco_val_anno_all, fout)
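A quick consistency check on the splits defined above: the 48 seen and 17 unseen names should partition the 65-class OVD vocabulary. A sketch, not part of the repo:

# Sketch only: sanity-check the category splits in this file.
from sas_det.data.coco_zeroshot_categories import COCO_SEEN_CLS, COCO_UNSEEN_CLS, COCO_OVD_ALL_CLS

assert len(COCO_SEEN_CLS) == 48 and len(COCO_UNSEEN_CLS) == 17 and len(COCO_OVD_ALL_CLS) == 65
assert set(COCO_SEEN_CLS) | set(COCO_UNSEEN_CLS) == set(COCO_OVD_ALL_CLS)
assert not set(COCO_SEEN_CLS) & set(COCO_UNSEEN_CLS)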
--------------------------------------------------------------------------------
/sas_det/data/ovd_register.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 |
4 |
5 | """
6 | This file registers pre-defined datasets at hard-coded paths, and their metadata.
7 |
8 | We hard-code metadata for common datasets. This will enable:
9 | 1. Consistency check when loading the datasets
10 | 2. Use models on these standard datasets directly and run demos,
11 | without having to download the dataset annotations
12 |
13 | We hard-code some paths to the dataset that's assumed to
14 | exist in "./datasets/".
15 |
16 | Users SHOULD NOT use this file to create new datasets / metadata for new datasets.
17 | To add new datasets, refer to the tutorial "docs/DATASETS.md".
18 | """
19 |
20 | import os
21 |
22 | from detectron2.data import DatasetCatalog, MetadataCatalog
23 |
24 | from detectron2.data.datasets.builtin_meta import ADE20K_SEM_SEG_CATEGORIES, _get_builtin_metadata
25 | # from .cityscapes import load_cityscapes_instances, load_cityscapes_semantic
26 | # from .cityscapes_panoptic import register_all_cityscapes_panoptic
27 | from detectron2.data.datasets.coco import load_sem_seg, register_coco_instances
28 | # from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated
29 | # from detectron2.data.datasets.lvis import get_lvis_instances_meta, register_lvis_instances
30 | # from .pascal_voc import register_pascal_voc
31 |
32 | from .lvis import get_lvis_instances_meta, register_lvis_instances_w_PLs, register_lvis_instances
33 |
34 | # ==== Predefined datasets and splits for COCO ==========
35 |
36 | _PREDEFINED_SPLITS_COCO = {}
37 | # _PREDEFINED_SPLITS_COCO["coco"] = {
38 | # "coco_2014_train": ("coco/train2014", "coco/annotations/instances_train2014.json"),
39 | # "coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"),
40 | # "coco_2014_minival": ("coco/val2014", "coco/annotations/instances_minival2014.json"),
41 | # "coco_2014_minival_100": ("coco/val2014", "coco/annotations/instances_minival2014_100.json"),
42 | # "coco_2014_valminusminival": (
43 | # "coco/val2014",
44 | # "coco/annotations/instances_valminusminival2014.json",
45 | # ),
46 | # "coco_2017_train": ("coco/train2017", "coco/annotations/instances_train2017.json"),
47 | # "coco_2017_val": ("coco/val2017", "coco/annotations/instances_val2017.json"),
48 | # "coco_2017_test": ("coco/test2017", "coco/annotations/image_info_test2017.json"),
49 | # "coco_2017_test-dev": ("coco/test2017", "coco/annotations/image_info_test-dev2017.json"),
50 | # "coco_2017_val_100": ("coco/val2017", "coco/annotations/instances_val2017_100.json"),
51 | # }
52 | _PREDEFINED_SPLITS_COCO["coco_ovd"] = {
53 | "coco_2017_ovd_all_train": ("coco/train2017", "coco/annotations/ovd_ins_train2017_all.json"),
54 | "coco_2017_ovd_b_train": ("coco/train2017", "coco/annotations/ovd_ins_train2017_b.json"),
55 | "coco_2017_ovd_b_train_65cats": ("coco/train2017", "coco/annotations/ovd_ins_train2017_b_65cats.json"),
56 | "coco_2017_ovd_b_train_65cats_all_images": ("coco/train2017", "coco/annotations/ovd_ins_train2017_b_65cats_all_images.json"),
57 | "coco_2017_ovd_t_train": ("coco/train2017", "coco/annotations/ovd_ins_train2017_t.json"),
58 | #
59 | "coco_2017_ovd_all_test": ("coco/val2017", "coco/annotations/ovd_ins_val2017_all.json"),
60 | "coco_2017_ovd_b_test": ("coco/val2017", "coco/annotations/ovd_ins_val2017_b.json"),
61 | "coco_2017_ovd_t_test": ("coco/val2017", "coco/annotations/ovd_ins_val2017_t.json"),
62 | #
63 | "coco_2017_ovd_retain_val": ("coco/val2017", "coco/annotations/ovd_ins_val2017_retain_15.json"),
64 | }
65 |
66 |
67 | def register_all_coco(root):
68 | for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO.items():
69 | if dataset_name == 'coco_ovd': # for zero-shot split
70 | for key, (image_root, json_file) in splits_per_dataset.items():
71 | # Assume pre-defined datasets live in `./datasets`.
72 | register_coco_instances(
73 | key,
74 | {}, # empty metadata, it will be overwritten in load_coco_json() function
75 | os.path.join(root, json_file) if "://" not in json_file else json_file,
76 | os.path.join(root, image_root),
77 | )
78 | else: # default splits
79 | for key, (image_root, json_file) in splits_per_dataset.items():
80 | # Assume pre-defined datasets live in `./datasets`.
81 | register_coco_instances(
82 | key,
83 | _get_builtin_metadata(dataset_name),
84 | os.path.join(root, json_file) if "://" not in json_file else json_file,
85 | os.path.join(root, image_root),
86 | )
87 |
88 |
89 | # ==== Predefined datasets and splits for LVIS ==========
90 |
91 | _PREDEFINED_SPLITS_LVIS = {
92 | # # openset setting
93 | # "lvis_v1": {
94 | # "lvis_v1_train": ("coco/", "lvis/lvis_v1_train.json"),
95 | # "lvis_v1_train_p0": ("coco/", "lvis/lvis_v1_train_p0.json"),
96 | # "lvis_v1_train_p1": ("coco/", "lvis/lvis_v1_train_p1.json"),
97 | # "lvis_v1_train_p2": ("coco/", "lvis/lvis_v1_train_p2.json"),
98 | # "lvis_v1_train_p3": ("coco/", "lvis/lvis_v1_train_p3.json"),
99 | # #
100 | # "lvis_v1_val": ("coco/", "lvis/lvis_v1_val.json"),
101 | # "lvis_v1_test_dev": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"),
102 | # "lvis_v1_test_challenge": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"),
103 | # },
104 | # custom image setting
105 | "lvis_v1_custom_img": {
106 | "lvis_v1_train_custom_img": ("coco/", "lvis/lvis_v1_train.json"),
107 | "lvis_v1_val_custom_img": ("coco/", "lvis/lvis_v1_val.json"),
108 | "lvis_v1_test_dev_custom_img": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"),
109 | "lvis_v1_test_challenge_custom_img": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"),
110 | },
111 | # regular fully supervised setting
112 | "lvis_v1_fullysup": {
113 | "lvis_v1_train_fullysup": ("coco/", "lvis/lvis_v1_train.json"),
114 | "lvis_v1_val_fullysup": ("coco/", "lvis/lvis_v1_val.json"),
115 | "lvis_v1_test_dev_fullysup": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"),
116 | "lvis_v1_test_challenge_fullysup": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"),
117 | #
118 | "lvis_v1_train_base_1203cats": ("coco/", "lvis/lvis_v1_train_baseOnly.json"),
119 | "lvis_v1_val_1@10": ("coco/", "lvis/lvis_v1_val_1@10.json"),
120 | },
121 | # PLs for ensemble by zsy
122 | "lvis_v1_PLs": {
123 | "lvis_v1_train_base_PLs_r50x4": ("coco/", "lvis/regionclip_PLs/inst_train_defRegCLIPr50x4_PLs_93.json"),
124 | "lvis_v1_train_SASDet_r50x4_PLs": ("coco/", "lvis/regionclip_PLs/lvis_v1_train_SASDet_r50x4_PLs_t62.json"),
125 | "lvis_v1_o365_SASDet_r50x4_PLs": ("Objects365/train", "Objects365/regionclip_PLs/zsy_objv1_train_SASDet_r50x4_PLs_t83.json"),
126 | }
127 | }
128 |
129 |
130 | def register_all_lvis(root):
131 | for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items():
132 | if dataset_name == "lvis_v1_PLs":
133 | for key, (image_root, json_file) in splits_per_dataset.items():
134 | register_lvis_instances_w_PLs(
135 | key,
136 | get_lvis_instances_meta(dataset_name), # TODO: meta for PLs, category order is rearranged
137 | os.path.join(root, json_file) if "://" not in json_file else json_file,
138 | os.path.join(root, image_root),
139 | )
140 | else:
141 | for key, (image_root, json_file) in splits_per_dataset.items():
142 | if dataset_name == "lvis_v1":
143 | args = {'filter_open_cls': True, 'run_custom_img': False}
144 | elif dataset_name == 'lvis_v1_custom_img':
145 | args = {'filter_open_cls': False, 'run_custom_img': True}
146 | elif dataset_name == 'lvis_v1_fullysup':
147 | args = {'filter_open_cls': False, 'run_custom_img': False}
148 | register_lvis_instances(
149 | key,
150 | get_lvis_instances_meta(dataset_name),
151 | os.path.join(root, json_file) if "://" not in json_file else json_file,
152 | os.path.join(root, image_root),
153 | args,
154 | )
155 |
156 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
157 | register_all_coco(_root)
158 | register_all_lvis(_root)
159 |
160 | # # True for open source;
161 | # # Internally at fb, we register them elsewhere
162 | # if __name__.endswith(".builtin"):
163 | # # Assume pre-defined datasets live in `./datasets`.
164 | # _root = os.getenv("DETECTRON2_DATASETS", "datasets")
165 | # register_all_coco(_root)
166 | # register_all_lvis(_root)
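Because register_all_coco and register_all_lvis run at import time, importing sas_det.data (its __init__.py above pulls in this module) is enough to make the OVD splits queryable through detectron2's catalogs. A minimal sketch, not part of the repo, assuming the annotation files exist under ./datasets (or $DETECTRON2_DATASETS):

# Sketch only: query one of the splits registered above.
import sas_det.data  # noqa: F401  -- triggers register_all_coco / register_all_lvis

from detectron2.data import DatasetCatalog, MetadataCatalog

dicts = DatasetCatalog.get("coco_2017_ovd_all_test")  # list of dicts in detectron2 format
print(len(dicts), MetadataCatalog.get("coco_2017_ovd_all_test").thing_classes[:5])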
--------------------------------------------------------------------------------
/sas_det/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .cityscapes_evaluation import CityscapesInstanceEvaluator, CityscapesSemSegEvaluator
3 | from .coco_evaluation import COCOEvaluator
4 | from .rotated_coco_evaluation import RotatedCOCOEvaluator
5 | from .evaluator import DatasetEvaluator, DatasetEvaluators, inference_context, inference_on_dataset
6 | from .lvis_evaluation import LVISEvaluator
7 | from .panoptic_evaluation import COCOPanopticEvaluator
8 | from .pascal_voc_evaluation import PascalVOCDetectionEvaluator
9 | from .sem_seg_evaluation import SemSegEvaluator
10 | from .testing import print_csv_format, verify_results
11 |
12 | __all__ = [k for k in globals().keys() if not k.startswith("_")]
13 |
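The classes re-exported here follow detectron2's evaluator interface (reset / process / evaluate), which inference_on_dataset drives (see evaluator.py below). A minimal sketch of a custom evaluator, not part of the repo:

# Sketch only: a toy evaluator that counts predicted instances per image.
from collections import OrderedDict
from sas_det.evaluation import DatasetEvaluator

class InstanceCounter(DatasetEvaluator):
    def reset(self):
        self._counts = []

    def process(self, inputs, outputs):
        for _, output in zip(inputs, outputs):
            if "instances" in output:
                self._counts.append(len(output["instances"]))

    def evaluate(self):
        mean = sum(self._counts) / max(len(self._counts), 1)
        return OrderedDict({"counting": {"mean_instances": mean}})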
--------------------------------------------------------------------------------
/sas_det/evaluation/cityscapes_evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import glob
3 | import logging
4 | import numpy as np
5 | import os
6 | import tempfile
7 | from collections import OrderedDict
8 | import torch
9 | from PIL import Image
10 |
11 | from detectron2.data import MetadataCatalog
12 | from detectron2.utils import comm
13 | from detectron2.utils.file_io import PathManager
14 |
15 | from .evaluator import DatasetEvaluator
16 |
17 |
18 | class CityscapesEvaluator(DatasetEvaluator):
19 | """
20 | Base class for evaluation using cityscapes API.
21 | """
22 |
23 | def __init__(self, dataset_name):
24 | """
25 | Args:
26 | dataset_name (str): the name of the dataset.
27 | It must have the following metadata associated with it:
28 | "thing_classes", "gt_dir".
29 | """
30 | self._metadata = MetadataCatalog.get(dataset_name)
31 | self._cpu_device = torch.device("cpu")
32 | self._logger = logging.getLogger(__name__)
33 |
34 | def reset(self):
35 | self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_")
36 | self._temp_dir = self._working_dir.name
37 | # All workers will write to the same results directory
38 | # TODO this does not work in distributed training
39 | self._temp_dir = comm.all_gather(self._temp_dir)[0]
40 | if self._temp_dir != self._working_dir.name:
41 | self._working_dir.cleanup()
42 | self._logger.info(
43 | "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir)
44 | )
45 |
46 |
47 | class CityscapesInstanceEvaluator(CityscapesEvaluator):
48 | """
49 | Evaluate instance segmentation results on cityscapes dataset using cityscapes API.
50 |
51 | Note:
52 | * It does not work in multi-machine distributed training.
53 | * It contains a synchronization, therefore has to be used on all ranks.
54 | * Only the main process runs evaluation.
55 | """
56 |
57 | def process(self, inputs, outputs):
58 | from cityscapesscripts.helpers.labels import name2label
59 |
60 | for input, output in zip(inputs, outputs):
61 | file_name = input["file_name"]
62 | basename = os.path.splitext(os.path.basename(file_name))[0]
63 | pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt")
64 |
65 | if "instances" in output:
66 | output = output["instances"].to(self._cpu_device)
67 | num_instances = len(output)
68 | with open(pred_txt, "w") as fout:
69 | for i in range(num_instances):
70 | pred_class = output.pred_classes[i]
71 | classes = self._metadata.thing_classes[pred_class]
72 | class_id = name2label[classes].id
73 | score = output.scores[i]
74 | mask = output.pred_masks[i].numpy().astype("uint8")
75 | png_filename = os.path.join(
76 | self._temp_dir, basename + "_{}_{}.png".format(i, classes)
77 | )
78 |
79 | Image.fromarray(mask * 255).save(png_filename)
80 | fout.write(
81 | "{} {} {}\n".format(os.path.basename(png_filename), class_id, score)
82 | )
83 | else:
84 | # Cityscapes requires a prediction file for every ground truth image.
85 | with open(pred_txt, "w") as fout:
86 | pass
87 |
88 | def evaluate(self):
89 | """
90 | Returns:
91 | dict: has a key "segm", whose value is a dict of "AP" and "AP50".
92 | """
93 | comm.synchronize()
94 | if comm.get_rank() > 0:
95 | return
96 | import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval
97 |
98 | self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
99 |
100 | # set some global states in cityscapes evaluation API, before evaluating
101 | cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
102 | cityscapes_eval.args.predictionWalk = None
103 | cityscapes_eval.args.JSONOutput = False
104 | cityscapes_eval.args.colorized = False
105 | cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json")
106 |
107 | # These lines are adopted from
108 | # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa
109 | gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
110 | groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png"))
111 | assert len(
112 | groundTruthImgList
113 | ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
114 | cityscapes_eval.args.groundTruthSearch
115 | )
116 | predictionImgList = []
117 | for gt in groundTruthImgList:
118 | predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args))
119 | results = cityscapes_eval.evaluateImgLists(
120 | predictionImgList, groundTruthImgList, cityscapes_eval.args
121 | )["averages"]
122 |
123 | ret = OrderedDict()
124 | ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100}
125 | self._working_dir.cleanup()
126 | return ret
127 |
128 |
129 | class CityscapesSemSegEvaluator(CityscapesEvaluator):
130 | """
131 | Evaluate semantic segmentation results on cityscapes dataset using cityscapes API.
132 |
133 | Note:
134 | * It does not work in multi-machine distributed training.
135 | * It contains a synchronization, therefore has to be used on all ranks.
136 | * Only the main process runs evaluation.
137 | """
138 |
139 | def process(self, inputs, outputs):
140 | from cityscapesscripts.helpers.labels import trainId2label
141 |
142 | for input, output in zip(inputs, outputs):
143 | file_name = input["file_name"]
144 | basename = os.path.splitext(os.path.basename(file_name))[0]
145 | pred_filename = os.path.join(self._temp_dir, basename + "_pred.png")
146 |
147 | output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy()
148 | pred = 255 * np.ones(output.shape, dtype=np.uint8)
149 | for train_id, label in trainId2label.items():
150 | if label.ignoreInEval:
151 | continue
152 | pred[output == train_id] = label.id
153 | Image.fromarray(pred).save(pred_filename)
154 |
155 | def evaluate(self):
156 | comm.synchronize()
157 | if comm.get_rank() > 0:
158 | return
159 | # Load the Cityscapes eval script *after* setting the required env var,
160 | # since the script reads CITYSCAPES_DATASET into global variables at load time.
161 | import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval
162 |
163 | self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
164 |
165 | # set some global states in cityscapes evaluation API, before evaluating
166 | cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
167 | cityscapes_eval.args.predictionWalk = None
168 | cityscapes_eval.args.JSONOutput = False
169 | cityscapes_eval.args.colorized = False
170 |
171 | # These lines are adopted from
172 | # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa
173 | gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
174 | groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png"))
175 | assert len(
176 | groundTruthImgList
177 | ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
178 | cityscapes_eval.args.groundTruthSearch
179 | )
180 | predictionImgList = []
181 | for gt in groundTruthImgList:
182 | predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt))
183 | results = cityscapes_eval.evaluateImgLists(
184 | predictionImgList, groundTruthImgList, cityscapes_eval.args
185 | )
186 | ret = OrderedDict()
187 | ret["sem_seg"] = {
188 | "IoU": 100.0 * results["averageScoreClasses"],
189 | "iIoU": 100.0 * results["averageScoreInstClasses"],
190 | "IoU_sup": 100.0 * results["averageScoreCategories"],
191 | "iIoU_sup": 100.0 * results["averageScoreInstCategories"],
192 | }
193 | self._working_dir.cleanup()
194 | return ret
195 |
--------------------------------------------------------------------------------
/sas_det/evaluation/evaluator.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import datetime
3 | import logging
4 | import time
5 | from collections import OrderedDict, abc
6 | from contextlib import ExitStack, contextmanager
7 | from typing import List, Union
8 | import torch
9 | from torch import nn
10 |
11 | from detectron2.utils.comm import get_world_size, is_main_process
12 | from detectron2.utils.logger import log_every_n_seconds
13 |
14 |
15 | class DatasetEvaluator:
16 | """
17 | Base class for a dataset evaluator.
18 |
19 | The function :func:`inference_on_dataset` runs the model over
20 | all samples in the dataset, and uses a DatasetEvaluator to process the inputs/outputs.
21 |
22 | This class will accumulate information of the inputs/outputs (by :meth:`process`),
23 | and produce evaluation results in the end (by :meth:`evaluate`).
24 | """
25 |
26 | def reset(self):
27 | """
28 | Preparation for a new round of evaluation.
29 | Should be called before starting a round of evaluation.
30 | """
31 | pass
32 |
33 | def process(self, inputs, outputs):
34 | """
35 | Process the pair of inputs and outputs.
36 | If they contain batches, the pairs can be consumed one-by-one using `zip`:
37 |
38 | .. code-block:: python
39 |
40 | for input_, output in zip(inputs, outputs):
41 | # do evaluation on single input/output pair
42 | ...
43 |
44 | Args:
45 | inputs (list): the inputs that are used to call the model.
46 | outputs (list): the return value of `model(inputs)`
47 | """
48 | pass
49 |
50 | def evaluate(self):
51 | """
52 | Evaluate/summarize the performance, after processing all input/output pairs.
53 |
54 | Returns:
55 | dict:
56 | A new evaluator class can return a dict of arbitrary format
57 | as long as the user can process the results.
58 | In our train_net.py, we expect the following format:
59 |
60 | * key: the name of the task (e.g., bbox)
61 | * value: a dict of {metric name: score}, e.g.: {"AP50": 80}
62 | """
63 | pass
64 |
65 |
66 | class DatasetEvaluators(DatasetEvaluator):
67 | """
68 | Wrapper class to combine multiple :class:`DatasetEvaluator` instances.
69 |
70 | This class dispatches every evaluation call to
71 | all of its :class:`DatasetEvaluator`.
72 | """
73 |
74 | def __init__(self, evaluators):
75 | """
76 | Args:
77 | evaluators (list): the evaluators to combine.
78 | """
79 | super().__init__()
80 | self._evaluators = evaluators
81 |
82 | def reset(self):
83 | for evaluator in self._evaluators:
84 | evaluator.reset()
85 |
86 | def process(self, inputs, outputs):
87 | for evaluator in self._evaluators:
88 | evaluator.process(inputs, outputs)
89 |
90 | def evaluate(self):
91 | results = OrderedDict()
92 | for evaluator in self._evaluators:
93 | result = evaluator.evaluate()
94 | if is_main_process() and result is not None:
95 | for k, v in result.items():
96 | assert (
97 | k not in results
98 | ), "Different evaluators produce results with the same key {}".format(k)
99 | results[k] = v
100 | return results
101 |
102 |
103 | def inference_on_dataset(
104 | model, data_loader, evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None]
105 | ):
106 | """
107 | Run model on the data_loader and evaluate the metrics with evaluator.
108 | Also benchmark the inference speed of `model.__call__` accurately.
109 | The model will be used in eval mode.
110 |
111 | Args:
112 | model (callable): a callable which takes an object from
113 | `data_loader` and returns some outputs.
114 |
115 | If it's an nn.Module, it will be temporarily set to `eval` mode.
116 | If you wish to evaluate a model in `training` mode instead, you can
117 | wrap the given model and override its behavior of `.eval()` and `.train()`.
118 | data_loader: an iterable object with a length.
119 | The elements it generates will be the inputs to the model.
120 | evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark,
121 | but don't want to do any evaluation.
122 |
123 | Returns:
124 | The return value of `evaluator.evaluate()`
125 | """
126 | num_devices = get_world_size()
127 | logger = logging.getLogger(__name__)
128 | logger.info("Start inference on {} batches".format(len(data_loader)))
129 |
130 | total = len(data_loader) # inference data loader must have a fixed length
131 | if evaluator is None:
132 | # create a no-op evaluator
133 | evaluator = DatasetEvaluators([])
134 | if isinstance(evaluator, abc.MutableSequence):
135 | evaluator = DatasetEvaluators(evaluator)
136 | evaluator.reset()
137 |
138 | num_warmup = min(5, total - 1)
139 | start_time = time.perf_counter()
140 | total_data_time = 0
141 | total_compute_time = 0
142 | total_eval_time = 0
143 | with ExitStack() as stack:
144 | if isinstance(model, nn.Module):
145 | stack.enter_context(inference_context(model))
146 | stack.enter_context(torch.no_grad())
147 |
148 | start_data_time = time.perf_counter()
149 | for idx, inputs in enumerate(data_loader):
150 | total_data_time += time.perf_counter() - start_data_time
151 | if idx == num_warmup:
152 | start_time = time.perf_counter()
153 | total_data_time = 0
154 | total_compute_time = 0
155 | total_eval_time = 0
156 |
157 | start_compute_time = time.perf_counter()
158 | outputs = model(inputs)
159 | if torch.cuda.is_available():
160 | torch.cuda.synchronize()
161 | total_compute_time += time.perf_counter() - start_compute_time
162 |
163 | start_eval_time = time.perf_counter()
164 | evaluator.process(inputs, outputs)
165 | total_eval_time += time.perf_counter() - start_eval_time
166 |
167 | iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
168 | data_seconds_per_iter = total_data_time / iters_after_start
169 | compute_seconds_per_iter = total_compute_time / iters_after_start
170 | eval_seconds_per_iter = total_eval_time / iters_after_start
171 | total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
172 | if idx >= num_warmup * 2 or compute_seconds_per_iter > 5:
173 | eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1)))
174 | log_every_n_seconds(
175 | logging.INFO,
176 | (
177 | f"Inference done {idx + 1}/{total}. "
178 | f"Dataloading: {data_seconds_per_iter:.4f} s / iter. "
179 | f"Inference: {compute_seconds_per_iter:.4f} s / iter. "
180 | f"Eval: {eval_seconds_per_iter:.4f} s / iter. "
181 | f"Total: {total_seconds_per_iter:.4f} s / iter. "
182 | f"ETA={eta}"
183 | ),
184 | n=5,
185 | )
186 | start_data_time = time.perf_counter()
187 |
188 | # Measure the time only for this worker (before the synchronization barrier)
189 | total_time = time.perf_counter() - start_time
190 | total_time_str = str(datetime.timedelta(seconds=total_time))
191 | # NOTE this format is parsed by grep
192 | logger.info(
193 | "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
194 | total_time_str, total_time / (total - num_warmup), num_devices
195 | )
196 | )
197 | total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
198 | logger.info(
199 | "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
200 | total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
201 | )
202 | )
203 |
204 | results = evaluator.evaluate()
205 | # An evaluator may return None when not in main process.
206 | # Replace it by an empty dict instead to make it easier for downstream code to handle
207 | if results is None:
208 | results = {}
209 | return results
210 |
211 |
212 | @contextmanager
213 | def inference_context(model):
214 | """
215 | A context where the model is temporarily changed to eval mode,
216 | and restored to previous mode afterwards.
217 |
218 | Args:
219 | model: a torch Module
220 | """
221 | training_mode = model.training
222 | model.eval()
223 | yield
224 | model.train(training_mode)
225 |
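A minimal sketch of the reset()/process()/evaluate() contract described above, assuming the model's outputs are detectron2-style dicts with an "instances" field; the class name and metric key are illustrative and not part of this repo:

from collections import OrderedDict

class InstanceCountEvaluator(DatasetEvaluator):
    """Toy evaluator: reports the average number of predicted instances per image."""

    def reset(self):
        self._num_images = 0
        self._num_instances = 0

    def process(self, inputs, outputs):
        for _, output in zip(inputs, outputs):
            self._num_images += 1
            if "instances" in output:
                self._num_instances += len(output["instances"])

    def evaluate(self):
        avg = self._num_instances / max(self._num_images, 1)
        return OrderedDict({"count": {"instances_per_image": avg}})

# results = inference_on_dataset(model, data_loader, InstanceCountEvaluator())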
--------------------------------------------------------------------------------
/sas_det/evaluation/fast_eval_api.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import copy
3 | import logging
4 | import numpy as np
5 | import time
6 | from pycocotools.cocoeval import COCOeval
7 |
8 | from detectron2 import _C
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | class COCOeval_opt(COCOeval):
14 | """
15 | This is a slightly modified version of the original COCO API, where the functions evaluateImg()
16 | and accumulate() are implemented in C++ to speed up evaluation.
17 | """
18 |
19 | def evaluate(self):
20 | """
21 | Run per-image evaluation on the given images and store results in self._evalImgs_cpp, a
22 | data structure that isn't readable from Python but is used by a C++ implementation of
23 | accumulate(). Unlike the original COCO Python API, we don't populate the data structure
24 | self.evalImgs because it is a computational bottleneck.
25 | :return: None
26 | """
27 | tic = time.time()
28 |
29 | p = self.params
30 | # add backward compatibility if useSegm is specified in params
31 | if p.useSegm is not None:
32 | p.iouType = "segm" if p.useSegm == 1 else "bbox"
33 | logger.info("Evaluate annotation type *{}*".format(p.iouType))
34 | p.imgIds = list(np.unique(p.imgIds))
35 | if p.useCats:
36 | p.catIds = list(np.unique(p.catIds))
37 | p.maxDets = sorted(p.maxDets)
38 | self.params = p
39 |
40 | self._prepare() # bottleneck
41 |
42 | # loop through images, area range, max detection number
43 | catIds = p.catIds if p.useCats else [-1]
44 |
45 | if p.iouType == "segm" or p.iouType == "bbox":
46 | computeIoU = self.computeIoU
47 | elif p.iouType == "keypoints":
48 | computeIoU = self.computeOks
49 | self.ious = {
50 | (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds
51 | } # bottleneck
52 |
53 | maxDet = p.maxDets[-1]
54 |
55 | # <<<< Beginning of code differences with original COCO API
56 | def convert_instances_to_cpp(instances, is_det=False):
57 | # Convert annotations for a list of instances in an image to a format that's fast
58 | # to access in C++
59 | instances_cpp = []
60 | for instance in instances:
61 | instance_cpp = _C.InstanceAnnotation(
62 | int(instance["id"]),
63 | instance["score"] if is_det else instance.get("score", 0.0),
64 | instance["area"],
65 | bool(instance.get("iscrowd", 0)),
66 | bool(instance.get("ignore", 0)),
67 | )
68 | instances_cpp.append(instance_cpp)
69 | return instances_cpp
70 |
71 | # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
72 | ground_truth_instances = [
73 | [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
74 | for imgId in p.imgIds
75 | ]
76 | detected_instances = [
77 | [convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) for catId in p.catIds]
78 | for imgId in p.imgIds
79 | ]
80 | ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
81 |
82 | if not p.useCats:
83 | # For each image, flatten per-category lists into a single list
84 | ground_truth_instances = [[[o for c in i for o in c]] for i in ground_truth_instances]
85 | detected_instances = [[[o for c in i for o in c]] for i in detected_instances]
86 |
87 | # Call C++ implementation of self.evaluateImgs()
88 | self._evalImgs_cpp = _C.COCOevalEvaluateImages(
89 | p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances
90 | )
91 | self._evalImgs = None
92 |
93 | self._paramsEval = copy.deepcopy(self.params)
94 | toc = time.time()
95 | logger.info("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic))
96 | # >>>> End of code differences with original COCO API
97 |
98 | def accumulate(self):
99 | """
100 | Accumulate per image evaluation results and store the result in self.eval. Does not
101 | support changing parameter settings from those used by self.evaluate()
102 | """
103 | logger.info("Accumulating evaluation results...")
104 | tic = time.time()
105 | assert hasattr(
106 | self, "_evalImgs_cpp"
107 | ), "evaluate() must be called before accmulate() is called."
108 |
109 | self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)
110 |
111 | # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
112 | self.eval["recall"] = np.array(self.eval["recall"]).reshape(
113 | self.eval["counts"][:1] + self.eval["counts"][2:]
114 | )
115 |
116 | # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
117 | # num_area_ranges X num_max_detections
118 | self.eval["precision"] = np.array(self.eval["precision"]).reshape(self.eval["counts"])
119 | self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])
120 | toc = time.time()
121 | logger.info("COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic))
122 |
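COCOeval_opt keeps the evaluate()/accumulate()/summarize() interface of pycocotools' COCOeval, so it can be used as a drop-in replacement. A minimal usage sketch; the json file names below are placeholders:

from pycocotools.coco import COCO

coco_gt = COCO("instances_val.json")          # ground-truth annotations, COCO format
coco_dt = coco_gt.loadRes("detections.json")  # detections in COCO result format

coco_eval = COCOeval_opt(coco_gt, coco_dt, iouType="bbox")
coco_eval.evaluate()    # per-image evaluation, stored in the C++ data structure
coco_eval.accumulate()  # C++ accumulation over images, categories and area ranges
coco_eval.summarize()   # standard COCO AP printout, inherited from COCOeval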
--------------------------------------------------------------------------------
/sas_det/evaluation/panoptic_evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import contextlib
3 | import io
4 | import itertools
5 | import json
6 | import logging
7 | import numpy as np
8 | import os
9 | import tempfile
10 | from collections import OrderedDict
11 | from typing import Optional
12 | from PIL import Image
13 | from tabulate import tabulate
14 |
15 | from detectron2.data import MetadataCatalog
16 | from detectron2.utils import comm
17 | from detectron2.utils.file_io import PathManager
18 |
19 | from .evaluator import DatasetEvaluator
20 |
21 | logger = logging.getLogger(__name__)
22 |
23 |
24 | class COCOPanopticEvaluator(DatasetEvaluator):
25 | """
26 | Evaluate Panoptic Quality metrics on COCO using PanopticAPI.
27 | It saves panoptic segmentation predictions in `output_dir`.
28 |
29 | It contains a synchronize call and has to be called from all workers.
30 | """
31 |
32 | def __init__(self, dataset_name: str, output_dir: Optional[str] = None):
33 | """
34 | Args:
35 | dataset_name: name of the dataset
36 | output_dir: output directory to save results for evaluation.
37 | """
38 | self._metadata = MetadataCatalog.get(dataset_name)
39 | self._thing_contiguous_id_to_dataset_id = {
40 | v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
41 | }
42 | self._stuff_contiguous_id_to_dataset_id = {
43 | v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items()
44 | }
45 |
46 | self._output_dir = output_dir
47 | if self._output_dir is not None:
48 | PathManager.mkdirs(self._output_dir)
49 |
50 | def reset(self):
51 | self._predictions = []
52 |
53 | def _convert_category_id(self, segment_info):
54 | isthing = segment_info.pop("isthing", None)
55 | if isthing is None:
56 | # the model produces panoptic category id directly. No more conversion needed
57 | return segment_info
58 | if isthing is True:
59 | segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[
60 | segment_info["category_id"]
61 | ]
62 | else:
63 | segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[
64 | segment_info["category_id"]
65 | ]
66 | return segment_info
67 |
68 | def process(self, inputs, outputs):
69 | from panopticapi.utils import id2rgb
70 |
71 | for input, output in zip(inputs, outputs):
72 | panoptic_img, segments_info = output["panoptic_seg"]
73 | panoptic_img = panoptic_img.cpu().numpy()
74 | if segments_info is None:
75 | # If "segments_info" is None, we assume "panoptic_img" is a
76 | # H*W int32 image storing the panoptic_id in the format of
77 | # category_id * label_divisor + instance_id. We reserve -1 for
78 | # VOID label, and add 1 to panoptic_img since the official
79 | # evaluation script uses 0 for VOID label.
80 | label_divisor = self._metadata.label_divisor
81 | segments_info = []
82 | for panoptic_label in np.unique(panoptic_img):
83 | if panoptic_label == -1:
84 | # VOID region.
85 | continue
86 | pred_class = panoptic_label // label_divisor
87 | isthing = (
88 | pred_class in self._metadata.thing_dataset_id_to_contiguous_id.values()
89 | )
90 | segments_info.append(
91 | {
92 | "id": int(panoptic_label) + 1,
93 | "category_id": int(pred_class),
94 | "isthing": bool(isthing),
95 | }
96 | )
97 | # Official evaluation script uses 0 for VOID label.
98 | panoptic_img += 1
99 |
100 | file_name = os.path.basename(input["file_name"])
101 | file_name_png = os.path.splitext(file_name)[0] + ".png"
102 | with io.BytesIO() as out:
103 | Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG")
104 | segments_info = [self._convert_category_id(x) for x in segments_info]
105 | self._predictions.append(
106 | {
107 | "image_id": input["image_id"],
108 | "file_name": file_name_png,
109 | "png_string": out.getvalue(),
110 | "segments_info": segments_info,
111 | }
112 | )
113 |
114 | def evaluate(self):
115 | comm.synchronize()
116 |
117 | self._predictions = comm.gather(self._predictions)
118 | self._predictions = list(itertools.chain(*self._predictions))
119 | if not comm.is_main_process():
120 | return
121 |
122 | # PanopticApi requires local files
123 | gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
124 | gt_folder = PathManager.get_local_path(self._metadata.panoptic_root)
125 |
126 | with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
127 | logger.info("Writing all panoptic predictions to {} ...".format(pred_dir))
128 | for p in self._predictions:
129 | with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
130 | f.write(p.pop("png_string"))
131 |
132 | with open(gt_json, "r") as f:
133 | json_data = json.load(f)
134 | json_data["annotations"] = self._predictions
135 |
136 | output_dir = self._output_dir or pred_dir
137 | predictions_json = os.path.join(output_dir, "predictions.json")
138 | with PathManager.open(predictions_json, "w") as f:
139 | f.write(json.dumps(json_data))
140 |
141 | from panopticapi.evaluation import pq_compute
142 |
143 | with contextlib.redirect_stdout(io.StringIO()):
144 | pq_res = pq_compute(
145 | gt_json,
146 | PathManager.get_local_path(predictions_json),
147 | gt_folder=gt_folder,
148 | pred_folder=pred_dir,
149 | )
150 |
151 | res = {}
152 | res["PQ"] = 100 * pq_res["All"]["pq"]
153 | res["SQ"] = 100 * pq_res["All"]["sq"]
154 | res["RQ"] = 100 * pq_res["All"]["rq"]
155 | res["PQ_th"] = 100 * pq_res["Things"]["pq"]
156 | res["SQ_th"] = 100 * pq_res["Things"]["sq"]
157 | res["RQ_th"] = 100 * pq_res["Things"]["rq"]
158 | res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
159 | res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
160 | res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]
161 |
162 | results = OrderedDict({"panoptic_seg": res})
163 | _print_panoptic_results(pq_res)
164 |
165 | return results
166 |
167 |
168 | def _print_panoptic_results(pq_res):
169 | headers = ["", "PQ", "SQ", "RQ", "#categories"]
170 | data = []
171 | for name in ["All", "Things", "Stuff"]:
172 | row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]]
173 | data.append(row)
174 | table = tabulate(
175 | data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center"
176 | )
177 | logger.info("Panoptic Evaluation Results:\n" + table)
178 |
179 |
180 | if __name__ == "__main__":
181 | from detectron2.utils.logger import setup_logger
182 |
183 | logger = setup_logger()
184 | import argparse
185 |
186 | parser = argparse.ArgumentParser()
187 | parser.add_argument("--gt-json")
188 | parser.add_argument("--gt-dir")
189 | parser.add_argument("--pred-json")
190 | parser.add_argument("--pred-dir")
191 | args = parser.parse_args()
192 |
193 | from panopticapi.evaluation import pq_compute
194 |
195 | with contextlib.redirect_stdout(io.StringIO()):
196 | pq_res = pq_compute(
197 | args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir
198 | )
199 | _print_panoptic_results(pq_res)
200 |
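For a coco_panoptic_seg dataset this evaluator is usually combined with the instance-level evaluator, mirroring the dispatch in test_net.py. A sketch, assuming a registered panoptic dataset; the dataset name and output directory are placeholders:

from sas_det.evaluation import COCOEvaluator, COCOPanopticEvaluator, DatasetEvaluators

output_dir = "./output/inference"
evaluator = DatasetEvaluators([
    COCOEvaluator("coco_2017_val_panoptic_separated", output_dir=output_dir),
    COCOPanopticEvaluator("coco_2017_val_panoptic_separated", output_dir=output_dir),
])
# results = inference_on_dataset(model, data_loader, evaluator)
# results["panoptic_seg"] holds PQ/SQ/RQ for All, Things and Stuff.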
--------------------------------------------------------------------------------
/sas_det/evaluation/rotated_coco_evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import itertools
3 | import json
4 | import numpy as np
5 | import os
6 | import torch
7 | from pycocotools.cocoeval import COCOeval, maskUtils
8 |
9 | from detectron2.structures import BoxMode, RotatedBoxes, pairwise_iou_rotated
10 | from detectron2.utils.file_io import PathManager
11 |
12 | from .coco_evaluation import COCOEvaluator
13 |
14 |
15 | class RotatedCOCOeval(COCOeval):
16 | @staticmethod
17 | def is_rotated(box_list):
18 | if type(box_list) == np.ndarray:
19 | return box_list.shape[1] == 5
20 | elif type(box_list) == list:
21 | if box_list == []: # cannot decide the box_dim
22 | return False
23 | return np.all(
24 | np.array(
25 | [
26 | (len(obj) == 5) and ((type(obj) == list) or (type(obj) == np.ndarray))
27 | for obj in box_list
28 | ]
29 | )
30 | )
31 | return False
32 |
33 | @staticmethod
34 | def boxlist_to_tensor(boxlist, output_box_dim):
35 | if type(boxlist) == np.ndarray:
36 | box_tensor = torch.from_numpy(boxlist)
37 | elif type(boxlist) == list:
38 | if boxlist == []:
39 | return torch.zeros((0, output_box_dim), dtype=torch.float32)
40 | else:
41 | box_tensor = torch.FloatTensor(boxlist)
42 | else:
43 | raise Exception("Unrecognized boxlist type")
44 |
45 | input_box_dim = box_tensor.shape[1]
46 | if input_box_dim != output_box_dim:
47 | if input_box_dim == 4 and output_box_dim == 5:
48 | box_tensor = BoxMode.convert(box_tensor, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS)
49 | else:
50 | raise Exception(
51 | "Unable to convert from {}-dim box to {}-dim box".format(
52 | input_box_dim, output_box_dim
53 | )
54 | )
55 | return box_tensor
56 |
57 | def compute_iou_dt_gt(self, dt, gt, is_crowd):
58 | if self.is_rotated(dt) or self.is_rotated(gt):
59 | # TODO: take is_crowd into consideration
60 | assert all(c == 0 for c in is_crowd)
61 | dt = RotatedBoxes(self.boxlist_to_tensor(dt, output_box_dim=5))
62 | gt = RotatedBoxes(self.boxlist_to_tensor(gt, output_box_dim=5))
63 | return pairwise_iou_rotated(dt, gt)
64 | else:
65 | # This is the same as the classical COCO evaluation
66 | return maskUtils.iou(dt, gt, is_crowd)
67 |
68 | def computeIoU(self, imgId, catId):
69 | p = self.params
70 | if p.useCats:
71 | gt = self._gts[imgId, catId]
72 | dt = self._dts[imgId, catId]
73 | else:
74 | gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
75 | dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
76 | if len(gt) == 0 and len(dt) == 0:
77 | return []
78 | inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
79 | dt = [dt[i] for i in inds]
80 | if len(dt) > p.maxDets[-1]:
81 | dt = dt[0 : p.maxDets[-1]]
82 |
83 | assert p.iouType == "bbox", "unsupported iouType for iou computation"
84 |
85 | g = [g["bbox"] for g in gt]
86 | d = [d["bbox"] for d in dt]
87 |
88 | # compute iou between each dt and gt region
89 | iscrowd = [int(o["iscrowd"]) for o in gt]
90 |
91 | # Note: this function is copied from cocoeval.py in cocoapi
92 | # and the major difference is here.
93 | ious = self.compute_iou_dt_gt(d, g, iscrowd)
94 | return ious
95 |
96 |
97 | class RotatedCOCOEvaluator(COCOEvaluator):
98 | """
99 | Evaluate object proposal/instance detection outputs using COCO-like metrics and APIs,
100 | with rotated boxes support.
101 | Note: this uses IOU only and does not consider angle differences.
102 | """
103 |
104 | def process(self, inputs, outputs):
105 | """
106 | Args:
107 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
108 | It is a list of dict. Each dict corresponds to an image and
109 | contains keys like "height", "width", "file_name", "image_id".
110 | outputs: the outputs of a COCO model. It is a list of dicts with key
111 | "instances" that contains :class:`Instances`.
112 | """
113 | for input, output in zip(inputs, outputs):
114 | prediction = {"image_id": input["image_id"]}
115 |
116 | if "instances" in output:
117 | instances = output["instances"].to(self._cpu_device)
118 |
119 | prediction["instances"] = self.instances_to_json(instances, input["image_id"])
120 | if "proposals" in output:
121 | prediction["proposals"] = output["proposals"].to(self._cpu_device)
122 | self._predictions.append(prediction)
123 |
124 | def instances_to_json(self, instances, img_id):
125 | num_instance = len(instances)
126 | if num_instance == 0:
127 | return []
128 |
129 | boxes = instances.pred_boxes.tensor.numpy()
130 | if boxes.shape[1] == 4:
131 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
132 | boxes = boxes.tolist()
133 | scores = instances.scores.tolist()
134 | classes = instances.pred_classes.tolist()
135 |
136 | results = []
137 | for k in range(num_instance):
138 | result = {
139 | "image_id": img_id,
140 | "category_id": classes[k],
141 | "bbox": boxes[k],
142 | "score": scores[k],
143 | }
144 |
145 | results.append(result)
146 | return results
147 |
148 | def _eval_predictions(self, predictions, img_ids=None): # img_ids: unused
149 | """
150 | Evaluate predictions on the given tasks.
151 | Fill self._results with the metrics of the tasks.
152 | """
153 | self._logger.info("Preparing results for COCO format ...")
154 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
155 |
156 | # unmap the category ids for COCO
157 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
158 | reverse_id_mapping = {
159 | v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
160 | }
161 | for result in coco_results:
162 | result["category_id"] = reverse_id_mapping[result["category_id"]]
163 |
164 | if self._output_dir:
165 | file_path = os.path.join(self._output_dir, "coco_instances_results.json")
166 | self._logger.info("Saving results to {}".format(file_path))
167 | with PathManager.open(file_path, "w") as f:
168 | f.write(json.dumps(coco_results))
169 | f.flush()
170 |
171 | if not self._do_evaluation:
172 | self._logger.info("Annotations are not available for evaluation.")
173 | return
174 |
175 | self._logger.info("Evaluating predictions ...")
176 |
177 | assert self._tasks is None or set(self._tasks) == {
178 | "bbox"
179 | }, "[RotatedCOCOEvaluator] Only bbox evaluation is supported"
180 | coco_eval = (
181 | self._evaluate_predictions_on_coco(self._coco_api, coco_results)
182 | if len(coco_results) > 0
183 | else None # cocoapi does not handle empty results very well
184 | )
185 |
186 | task = "bbox"
187 | res = self._derive_coco_results(
188 | coco_eval, task, class_names=self._metadata.get("thing_classes")
189 | )
190 | self._results[task] = res
191 |
192 | def _evaluate_predictions_on_coco(self, coco_gt, coco_results):
193 | """
194 | Evaluate the coco results using COCOEval API.
195 | """
196 | assert len(coco_results) > 0
197 |
198 | coco_dt = coco_gt.loadRes(coco_results)
199 |
200 | # Only bbox is supported for now
201 | coco_eval = RotatedCOCOeval(coco_gt, coco_dt, iouType="bbox")
202 |
203 | coco_eval.evaluate()
204 | coco_eval.accumulate()
205 | coco_eval.summarize()
206 |
207 | return coco_eval
208 |
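The key difference from standard COCO evaluation is the IoU kernel: compute_iou_dt_gt switches to pairwise_iou_rotated whenever either side uses 5-dim (cx, cy, w, h, angle) boxes. A standalone sketch of that call, with made-up box values:

import torch
from detectron2.structures import RotatedBoxes, pairwise_iou_rotated

# Two boxes with the same center and size, differing only by a 30-degree rotation.
dt = RotatedBoxes(torch.tensor([[50.0, 50.0, 20.0, 10.0, 0.0]]))
gt = RotatedBoxes(torch.tensor([[50.0, 50.0, 20.0, 10.0, 30.0]]))

ious = pairwise_iou_rotated(dt, gt)  # tensor of shape (num_dt, num_gt)
print(ious)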
--------------------------------------------------------------------------------
/sas_det/evaluation/sem_seg_evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import itertools
3 | import json
4 | import logging
5 | import numpy as np
6 | import os
7 | from collections import OrderedDict
8 | import PIL.Image as Image
9 | import pycocotools.mask as mask_util
10 | import torch
11 |
12 | from detectron2.data import DatasetCatalog, MetadataCatalog
13 | from detectron2.utils.comm import all_gather, is_main_process, synchronize
14 | from detectron2.utils.file_io import PathManager
15 |
16 | from .evaluator import DatasetEvaluator
17 |
18 |
19 | class SemSegEvaluator(DatasetEvaluator):
20 | """
21 | Evaluate semantic segmentation metrics.
22 | """
23 |
24 | def __init__(
25 | self,
26 | dataset_name,
27 | distributed=True,
28 | output_dir=None,
29 | *,
30 | num_classes=None,
31 | ignore_label=None,
32 | ):
33 | """
34 | Args:
35 | dataset_name (str): name of the dataset to be evaluated.
36 | distributed (bool): if True, will collect results from all ranks for evaluation.
37 | Otherwise, will evaluate the results in the current process.
38 | output_dir (str): an output directory to dump results.
39 | num_classes, ignore_label: deprecated arguments
40 | """
41 | self._logger = logging.getLogger(__name__)
42 | if num_classes is not None:
43 | self._logger.warn(
44 | "SemSegEvaluator(num_classes) is deprecated! It should be obtained from metadata."
45 | )
46 | if ignore_label is not None:
47 | self._logger.warn(
48 | "SemSegEvaluator(ignore_label) is deprecated! It should be obtained from metadata."
49 | )
50 | self._dataset_name = dataset_name
51 | self._distributed = distributed
52 | self._output_dir = output_dir
53 |
54 | self._cpu_device = torch.device("cpu")
55 |
56 | self.input_file_to_gt_file = {
57 | dataset_record["file_name"]: dataset_record["sem_seg_file_name"]
58 | for dataset_record in DatasetCatalog.get(dataset_name)
59 | }
60 |
61 | meta = MetadataCatalog.get(dataset_name)
62 | # Dict that maps contiguous training ids to COCO category ids
63 | try:
64 | c2d = meta.stuff_dataset_id_to_contiguous_id
65 | self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()}
66 | except AttributeError:
67 | self._contiguous_id_to_dataset_id = None
68 | self._class_names = meta.stuff_classes
69 | self._num_classes = len(meta.stuff_classes)
70 | if num_classes is not None:
71 | assert self._num_classes == num_classes, f"{self._num_classes} != {num_classes}"
72 | self._ignore_label = ignore_label if ignore_label is not None else meta.ignore_label
73 |
74 | def reset(self):
75 | self._conf_matrix = np.zeros((self._num_classes + 1, self._num_classes + 1), dtype=np.int64)
76 | self._predictions = []
77 |
78 | def process(self, inputs, outputs):
79 | """
80 | Args:
81 | inputs: the inputs to a model.
82 | It is a list of dicts. Each dict corresponds to an image and
83 | contains keys like "height", "width", "file_name".
84 | outputs: the outputs of a model. It is either list of semantic segmentation predictions
85 | (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
86 | segmentation prediction in the same format.
87 | """
88 | for input, output in zip(inputs, outputs):
89 | output = output["sem_seg"].argmax(dim=0).to(self._cpu_device)
90 | pred = np.array(output, dtype=int)
91 | with PathManager.open(self.input_file_to_gt_file[input["file_name"]], "rb") as f:
92 | gt = np.array(Image.open(f), dtype=int)
93 |
94 | gt[gt == self._ignore_label] = self._num_classes
95 |
96 | self._conf_matrix += np.bincount(
97 | (self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
98 | minlength=self._conf_matrix.size,
99 | ).reshape(self._conf_matrix.shape)
100 |
101 | self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"]))
102 |
103 | def evaluate(self):
104 | """
105 | Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):
106 |
107 | * Mean intersection-over-union averaged across classes (mIoU)
108 | * Frequency Weighted IoU (fwIoU)
109 | * Mean pixel accuracy averaged across classes (mACC)
110 | * Pixel Accuracy (pACC)
111 | """
112 | if self._distributed:
113 | synchronize()
114 | conf_matrix_list = all_gather(self._conf_matrix)
115 | self._predictions = all_gather(self._predictions)
116 | self._predictions = list(itertools.chain(*self._predictions))
117 | if not is_main_process():
118 | return
119 |
120 | self._conf_matrix = np.zeros_like(self._conf_matrix)
121 | for conf_matrix in conf_matrix_list:
122 | self._conf_matrix += conf_matrix
123 |
124 | if self._output_dir:
125 | PathManager.mkdirs(self._output_dir)
126 | file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
127 | with PathManager.open(file_path, "w") as f:
128 | f.write(json.dumps(self._predictions))
129 |
130 | acc = np.full(self._num_classes, np.nan, dtype=float)
131 | iou = np.full(self._num_classes, np.nan, dtype=float)
132 | tp = self._conf_matrix.diagonal()[:-1].astype(float)
133 | pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
134 | class_weights = pos_gt / np.sum(pos_gt)
135 | pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
136 | acc_valid = pos_gt > 0
137 | acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
138 | iou_valid = (pos_gt + pos_pred) > 0
139 | union = pos_gt + pos_pred - tp
140 | iou[acc_valid] = tp[acc_valid] / union[acc_valid]
141 | macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
142 | miou = np.sum(iou[acc_valid]) / np.sum(iou_valid)
143 | fiou = np.sum(iou[acc_valid] * class_weights[acc_valid])
144 | pacc = np.sum(tp) / np.sum(pos_gt)
145 |
146 | res = {}
147 | res["mIoU"] = 100 * miou
148 | res["fwIoU"] = 100 * fiou
149 | for i, name in enumerate(self._class_names):
150 | res["IoU-{}".format(name)] = 100 * iou[i]
151 | res["mACC"] = 100 * macc
152 | res["pACC"] = 100 * pacc
153 | for i, name in enumerate(self._class_names):
154 | res["ACC-{}".format(name)] = 100 * acc[i]
155 |
156 | if self._output_dir:
157 | file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
158 | with PathManager.open(file_path, "wb") as f:
159 | torch.save(res, f)
160 | results = OrderedDict({"sem_seg": res})
161 | self._logger.info(results)
162 | return results
163 |
164 | def encode_json_sem_seg(self, sem_seg, input_file_name):
165 | """
166 | Convert semantic segmentation to COCO stuff format with segments encoded as RLEs.
167 | See http://cocodataset.org/#format-results
168 | """
169 | json_list = []
170 | for label in np.unique(sem_seg):
171 | if self._contiguous_id_to_dataset_id is not None:
172 | assert (
173 | label in self._contiguous_id_to_dataset_id
174 | ), "Label {} is not in the metadata info for {}".format(label, self._dataset_name)
175 | dataset_id = self._contiguous_id_to_dataset_id[label]
176 | else:
177 | dataset_id = int(label)
178 | mask = (sem_seg == label).astype(np.uint8)
179 | mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0]
180 | mask_rle["counts"] = mask_rle["counts"].decode("utf-8")
181 | json_list.append(
182 | {"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle}
183 | )
184 | return json_list
185 |
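process() accumulates an (N+1) x (N+1) confusion matrix with a single np.bincount over (N+1) * pred + gt, where the extra row/column absorbs the ignore label. A toy sketch of the same trick on a 2-class, 2x2 image (values made up):

import numpy as np

num_classes = 2
pred = np.array([[0, 1], [1, 1]])  # predicted class per pixel
gt = np.array([[0, 1], [0, 1]])    # ground-truth class per pixel

conf = np.bincount(
    (num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
    minlength=(num_classes + 1) ** 2,
).reshape(num_classes + 1, num_classes + 1)
# conf[i, j] counts pixels predicted as class i whose ground truth is class j.

tp = conf.diagonal()[:-1]            # true positives per class -> [1, 2]
pos_gt = conf[:-1, :-1].sum(axis=0)  # ground-truth pixels per class -> [2, 2]
print(conf, tp, pos_gt)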
--------------------------------------------------------------------------------
/sas_det/evaluation/testing.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging
3 | import numpy as np
4 | import pprint
5 | import sys
6 | from collections.abc import Mapping
7 |
8 |
9 | def print_csv_format(results):
10 | """
11 | Print main metrics in a format similar to Detectron,
12 | so that they are easy to copypaste into a spreadsheet.
13 |
14 | Args:
15 | results (OrderedDict[dict]): task_name -> {metric -> score}
16 | unordered dict can also be printed, but in arbitrary order
17 | """
18 | assert isinstance(results, Mapping) or not len(results), results
19 | logger = logging.getLogger(__name__)
20 | for task, res in results.items():
21 | if isinstance(res, Mapping):
22 | # Don't print "AP-category" metrics since they are usually not tracked.
23 | important_res = [(k, v) for k, v in res.items() if "-" not in k]
24 | logger.info("copypaste: Task: {}".format(task))
25 | logger.info("copypaste: " + ",".join([k[0] for k in important_res]))
26 | logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res]))
27 | else:
28 | logger.info(f"copypaste: {task}={res}")
29 |
30 |
31 | def verify_results(cfg, results):
32 | """
33 | Args:
34 | results (OrderedDict[dict]): task_name -> {metric -> score}
35 |
36 | Returns:
37 | bool: whether the verification succeeds or not
38 | """
39 | expected_results = cfg.TEST.EXPECTED_RESULTS
40 | if not len(expected_results):
41 | return True
42 |
43 | ok = True
44 | for task, metric, expected, tolerance in expected_results:
45 | actual = results[task].get(metric, None)
46 | if actual is None:
47 | ok = False
48 | continue
49 | if not np.isfinite(actual):
50 | ok = False
51 | continue
52 | diff = abs(actual - expected)
53 | if diff > tolerance:
54 | ok = False
55 |
56 | logger = logging.getLogger(__name__)
57 | if not ok:
58 | logger.error("Result verification failed!")
59 | logger.error("Expected Results: " + str(expected_results))
60 | logger.error("Actual Results: " + pprint.pformat(results))
61 |
62 | sys.exit(1)
63 | else:
64 | logger.info("Results verification passed.")
65 | return ok
66 |
67 |
68 | def flatten_results_dict(results):
69 | """
70 | Expand a hierarchical dict of scalars into a flat dict of scalars.
71 | If results[k1][k2][k3] = v, the returned dict will have the entry
72 | {"k1/k2/k3": v}.
73 |
74 | Args:
75 | results (dict):
76 | """
77 | r = {}
78 | for k, v in results.items():
79 | if isinstance(v, Mapping):
80 | v = flatten_results_dict(v)
81 | for kk, vv in v.items():
82 | r[k + "/" + kk] = vv
83 | else:
84 | r[k] = v
85 | return r
86 |
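flatten_results_dict is small, but a one-line example makes the key naming clear (numbers made up):

results = {"bbox": {"AP": 40.1, "AP50": 60.3}, "segm": {"AP": 35.2}}
print(flatten_results_dict(results))
# -> {'bbox/AP': 40.1, 'bbox/AP50': 60.3, 'segm/AP': 35.2}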
--------------------------------------------------------------------------------
/sas_det/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) NEC Laboratories America, Inc.
2 |
3 | from .backbone import (
4 | build_clip_language_encoder,
5 | get_clip_tokenzier,
6 | get_clip_image_transform,
7 | )
8 |
9 | from .meta_arch import clip_rcnn as _
10 |
11 | from .roi_heads import (
12 | CLIPRes5ROIHeads,
13 | FastRCNNOutputLayers,
14 | )
15 |
--------------------------------------------------------------------------------
/sas_det/modeling/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) NEC Laboratories America, Inc.
2 |
3 | from .clip_backbone import (
4 | build_clip_language_encoder,
5 | get_clip_tokenzier,
6 | get_clip_image_transform,
7 | )
--------------------------------------------------------------------------------
/sas_det/modeling/roi_heads/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) NEC Laboratories America, Inc.
2 |
3 | from .clip_roi_heads import (
4 | CLIPRes5ROIHeads,
5 | # PretrainRes5ROIHeads,
6 | # CLIPStandardROIHeads,
7 | )
8 | from .clip_roi_heads import FastRCNNOutputLayers
9 |
10 | __all__ = list(globals().keys())
11 |
--------------------------------------------------------------------------------
/test_net.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # Copyright (c) NEC Laboratories America, Inc.
4 | """
5 | A main evaluation script.
6 |
7 | This script reads a given config file and runs the training or evaluation.
8 | It is an entry point that is made to train standard models in detectron2.
9 |
10 | In order to let one script support training of many models,
11 | this script contains logic that is specific to these built-in models and therefore
12 | may not be suitable for your own project.
13 | For example, your research project perhaps only needs a single "evaluator".
14 |
15 | Therefore, we recommend you use detectron2 as a library and take
16 | this file as an example of how to use the library.
17 | You may want to write your own script with your datasets and other customizations.
18 | """
19 |
20 | import logging
21 | import os
22 | from collections import OrderedDict
23 | import torch
24 |
25 | import detectron2.utils.comm as comm
26 | # from detectron2.checkpoint import DetectionCheckpointer
27 | from detectron2.config import get_cfg
28 | from detectron2.data import MetadataCatalog, build_detection_train_loader
29 | from detectron2.engine import default_argument_parser, default_setup, hooks, launch
30 | from detectron2.engine import DefaultTrainer # this may be modified by regionclip
31 | from detectron2.modeling import GeneralizedRCNNWithTTA
32 |
33 | from sas_det.evaluation import (
34 | CityscapesInstanceEvaluator,
35 | CityscapesSemSegEvaluator,
36 | COCOEvaluator,
37 | COCOPanopticEvaluator,
38 | DatasetEvaluators,
39 | LVISEvaluator,
40 | PascalVOCDetectionEvaluator,
41 | SemSegEvaluator,
42 | verify_results,
43 | )
44 | from sas_det.checkpoint import DetectionCheckpointer
45 | from sas_det import add_sas_det_config
46 |
47 | #os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
48 | import torch.multiprocessing
49 | torch.multiprocessing.set_sharing_strategy('file_system')
50 |
51 | class Trainer(DefaultTrainer):
52 | """
53 | We use the "DefaultTrainer" which contains pre-defined default logic for
54 | standard training workflow. It may not work for you, especially if you
55 | are working on a new research project. In that case you can write your
56 | own training loop. You can use "tools/plain_train_net.py" as an example.
57 | """
58 |
59 | @classmethod
60 | def build_evaluator(cls, cfg, dataset_name, output_folder=None):
61 | """
62 | Create evaluator(s) for a given dataset.
63 | This uses the special metadata "evaluator_type" associated with each builtin dataset.
64 | For your own dataset, you can simply create an evaluator manually in your
65 | script and do not have to worry about the hacky if-else logic here.
66 | """
67 | if output_folder is None:
68 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
69 | evaluator_list = []
70 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
71 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
72 | evaluator_list.append(
73 | SemSegEvaluator(
74 | dataset_name,
75 | distributed=True,
76 | output_dir=output_folder,
77 | )
78 | )
79 | if evaluator_type in ["coco", "coco_panoptic_seg"]:
80 | evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
81 | if evaluator_type == "coco_panoptic_seg":
82 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
83 | if evaluator_type == "cityscapes_instance":
84 | assert (
85 | torch.cuda.device_count() >= comm.get_rank()
86 | ), "CityscapesEvaluator currently do not work with multiple machines."
87 | return CityscapesInstanceEvaluator(dataset_name)
88 | if evaluator_type == "cityscapes_sem_seg":
89 | assert (
90 | torch.cuda.device_count() >= comm.get_rank()
91 | ), "CityscapesEvaluator currently do not work with multiple machines."
92 | return CityscapesSemSegEvaluator(dataset_name)
93 | elif evaluator_type == "pascal_voc":
94 | return PascalVOCDetectionEvaluator(dataset_name)
95 | elif evaluator_type == "lvis":
96 | return LVISEvaluator(dataset_name, output_dir=output_folder)
97 | if len(evaluator_list) == 0:
98 | raise NotImplementedError(
99 | "no Evaluator for the dataset {} with the type {}".format(
100 | dataset_name, evaluator_type
101 | )
102 | )
103 | elif len(evaluator_list) == 1:
104 | return evaluator_list[0]
105 | return DatasetEvaluators(evaluator_list)
106 |
107 | @classmethod
108 | def test_with_TTA(cls, cfg, model):
109 | logger = logging.getLogger("detectron2.trainer")
110 | # In the end of training, run an evaluation with TTA
111 | # Only support some R-CNN models.
112 | logger.info("Running inference with test-time augmentation ...")
113 | model = GeneralizedRCNNWithTTA(cfg, model)
114 | evaluators = [
115 | cls.build_evaluator(
116 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
117 | )
118 | for name in cfg.DATASETS.TEST
119 | ]
120 | res = cls.test(cfg, model, evaluators)
121 | res = OrderedDict({k + "_TTA": v for k, v in res.items()})
122 | return res
123 |
124 |
125 | def periodic_update_teacher(trainer):
126 | update_steps = trainer.cfg.MODEL.OVD.PERIODIC_STEPS
127 | cur_iters = trainer.iter
128 |
129 | if cur_iters in update_steps:
130 | model = trainer.model
131 | if isinstance(model, torch.nn.parallel.DistributedDataParallel):
132 | # wrapped by DistributedDataParallel
133 | model.module.periodic_update_pairs()
134 | else:
135 | model.periodic_update_pairs()
136 |
137 |
138 | def setup(args):
139 | """
140 | Create configs and perform basic setups.
141 | """
142 | cfg = get_cfg()
143 | add_sas_det_config(cfg) # sas_det configs
144 |
145 | cfg.merge_from_file(args.config_file)
146 | cfg.merge_from_list(args.opts)
147 | cfg.freeze()
148 | default_setup(cfg, args)
149 | return cfg
150 |
151 |
152 | def main(args):
153 | cfg = setup(args)
154 |
155 | assert args.eval_only, "This release supports evaluation only."
156 | if args.eval_only:
157 | model = Trainer.build_model(cfg)
158 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
159 | cfg.MODEL.WEIGHTS, resume=args.resume
160 | )
161 | if cfg.MODEL.META_ARCHITECTURE in ['CLIPRCNN', 'CLIPFastRCNN', 'PretrainFastRCNN', 'WeakPretrainFastRCNN'] \
162 | and cfg.MODEL.CLIP.BB_RPN_WEIGHTS is not None\
163 | and cfg.MODEL.CLIP.CROP_REGION_TYPE == 'RPN': # load 2nd pretrained model
164 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR, bb_rpn_weights=True).resume_or_load(
165 | cfg.MODEL.CLIP.BB_RPN_WEIGHTS, resume=False
166 | )
167 | res = Trainer.test(cfg, model)
168 | if cfg.TEST.AUG.ENABLED:
169 | res.update(Trainer.test_with_TTA(cfg, model))
170 | if comm.is_main_process():
171 | verify_results(cfg, res)
172 | return res
173 |
174 |
175 | if __name__ == "__main__":
176 | args = default_argument_parser().parse_args()
177 | print("Command Line Args:", args)
178 | launch(
179 | main,
180 | args.num_gpus,
181 | num_machines=args.num_machines,
182 | machine_rank=args.machine_rank,
183 | dist_url=args.dist_url,
184 | args=(args,),
185 | )
186 |
--------------------------------------------------------------------------------
/tools/offline_eval_onLVIS.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 |
4 | from lvis import LVIS
5 | from lvis import LVISEval, LVISResults
6 |
7 | if __name__ == '__main__':
8 | parser = argparse.ArgumentParser(description='evaluate PLs quality offline')
9 | parser.add_argument('gt_json', type=str, help='gt coco json file')
10 | parser.add_argument('pl_json', type=str, help='PL coco json file')
11 | args = parser.parse_args()
12 | # print(args)
13 |
14 | #############################################
15 | gt_LVISJson_file = args.gt_json
16 | pred_LvisJson_file = args.pl_json
17 |
18 | convert_to_result = True # True if the .json is in COCO dataset format (not COCO result format)
19 |
20 | #############################################
21 |
22 | # load image list in gt_json
23 | lvis_gt = LVIS(gt_LVISJson_file)
24 | gt_img_ids = set(lvis_gt.get_img_ids())
25 |
26 | if convert_to_result:
27 | PLData = json.load(open(pred_LvisJson_file, 'r'))
28 | PL_list = list()
29 | imageId_list = list()
30 | for anno in PLData['annotations']:
31 | cur_image_id = anno['image_id']
32 | ## eval only on PLs
33 | if ("thing_isNovel" in anno.keys()) and anno['thing_isNovel'] and (cur_image_id in gt_img_ids):
34 | data = {'image_id': cur_image_id,
35 | 'category_id': anno['category_id'],
36 | 'bbox': anno['bbox'],
37 | 'score': anno['confidence']}
38 | PL_list.append(data)
39 | imageId_list.append(cur_image_id)
40 | # ## eval on all data (GT + PLs)
41 | # if cur_image_id in gt_img_ids:
42 | # data = {'image_id': cur_image_id,
43 | # 'category_id': anno['category_id'],
44 | # 'bbox': anno['bbox'],
45 | # 'score': anno['confidence']}
46 | # PL_list.append(data)
47 | # imageId_list.append(cur_image_id)
48 |
49 | print( 'Total PL boxes num: %d, avg num: %.2f\n' % (len(PL_list), len(PL_list)/len(set(imageId_list))) )
50 | else:
51 | PL_list = json.load(open(pred_LvisJson_file, 'r'))
52 |
53 | # do evaluation
54 | lvis_results = LVISResults(lvis_gt, PL_list, max_dets=300)
55 | lvis_eval = LVISEval(lvis_gt, lvis_results, iou_type="bbox")
56 | lvis_eval.run()
57 | lvis_eval.print_results()
58 |
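When the input is a dataset-format json (rather than a result file), the conversion branch above expects each pseudo-label entry in "annotations" to carry a few extra fields. A sketch of one such entry; all values are made up:

example_pl_annotation = {
    "image_id": 397133,
    "category_id": 13,
    "bbox": [102.5, 47.0, 80.0, 120.0],  # XYWH in absolute pixels
    "confidence": 0.87,                  # used as the detection score
    "thing_isNovel": True,               # only novel-category PLs are kept for evaluation
}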
--------------------------------------------------------------------------------
/tools/offline_eval_onO365.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 |
4 | from pycocotools.coco import COCO
5 | from pycocotools.cocoeval import COCOeval
6 |
7 |
8 | if __name__ == '__main__':
9 | parser = argparse.ArgumentParser(description='evaluate PLs quality offline')
10 | parser.add_argument('gt_json', type=str, help='gt coco json file')
11 | parser.add_argument('pl_json', type=str, help='PL coco json file')
12 | parser.add_argument('-r', '--raw', action='store_true')
13 |
14 | args = parser.parse_args()
15 | # print(args)
16 |
17 | #############################################
18 | gt_COCOJson_file = args.gt_json
19 | pred_COCOJson_file = args.pl_json
20 | #############################################
21 |
22 | # load image list in gt_json
23 | GtData = json.load(open(gt_COCOJson_file, 'r'))
24 | gt_img_ids = [x['id'] for x in GtData['images']]
25 | gt_img_ids = set(gt_img_ids)
26 |
27 | PLData = json.load(open(pred_COCOJson_file, 'r'))
28 |
29 | if args.raw:
30 | PL_list = PLData
31 | imageId_list = gt_img_ids
32 | else:
33 | PL_list = list()
34 | imageId_list = list()
35 | for anno in PLData['annotations']:
36 | cur_image_id = anno['image_id']
37 |
38 | score = anno.get('confidence', None)
39 | if score is not None:
40 | # take all PLs, i.e. annotations that carry a confidence score
41 | data = {'image_id': cur_image_id,
42 | 'category_id': anno['category_id'],
43 | 'bbox': anno['bbox'],
44 | 'score': score}
45 | PL_list.append(data)
46 | imageId_list.append(cur_image_id)
47 |
48 | # if args.raw:
49 | # # take all annos from PLs
50 | # data = {'image_id': cur_image_id,
51 | # 'category_id': anno['category_id'],
52 | # 'bbox': anno['bbox'],
53 | # 'score': anno['confidence']}
54 | # PL_list.append(data)
55 | # imageId_list.append(cur_image_id)
56 | # else:
57 | # if ("thing_isNovel" in anno.keys()) and anno['thing_isNovel'] and (cur_image_id in gt_img_ids):
58 | # data = {'image_id': cur_image_id,
59 | # 'category_id': anno['category_id'],
60 | # 'bbox': anno['bbox'],
61 | # 'score': anno['confidence']}
62 | # PL_list.append(data)
63 | # imageId_list.append(cur_image_id)
64 |
65 | print( 'Total PL boxes num: %d, avg num: %.2f\n' % (len(PL_list), len(PL_list)/len(set(imageId_list))) )
66 |
67 | curSaveJson = './.temp.json'
68 | with open(curSaveJson, 'w') as outfile:
69 | json.dump(PL_list, outfile)
70 |
71 | cocoGt = COCO(gt_COCOJson_file)
72 | cocoDt = cocoGt.loadRes(curSaveJson)
73 |
74 | cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox')
75 | cocoEval.evaluate()
76 | cocoEval.accumulate()
77 | cocoEval.summarize()
78 |
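As a side note on the temp-file round trip above, pycocotools' COCO.loadRes also accepts an in-memory list of result dicts, so an equivalent sketch without the intermediate json would be:

cocoGt = COCO(gt_COCOJson_file)
cocoDt = cocoGt.loadRes(PL_list)  # list of {"image_id", "category_id", "bbox", "score"} dicts
cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox')
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()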
--------------------------------------------------------------------------------