├── .gitignore ├── LICENSE ├── README.md ├── datasets └── .gitkeep ├── pretrained_ckpt └── .gitkeep ├── sas_det ├── __init__.py ├── checkpoint │ ├── __init__.py │ ├── c2_model_loading.py │ ├── catalog.py │ ├── clip_model_loading.py │ └── detection_checkpoint.py ├── config.py ├── configs │ ├── ovd_coco_R50_C4_ensemble.yaml │ ├── ovd_coco_R50_C4_ensemble_PLs.yaml │ ├── ovd_lvis_R50_C4_SAS_Det_3x.yaml │ ├── ovd_lvis_R50_C4_ensemble_PLs.yaml │ └── regionclip │ │ ├── Base-RCNN-C4.yaml │ │ ├── Base-RCNN-DilatedC5.yaml │ │ ├── Base-RCNN-FPN.yaml │ │ ├── Base-RetinaNet.yaml │ │ ├── COCO-Detection │ │ ├── fast_rcnn_R_50_FPN_1x.yaml │ │ ├── faster_rcnn_R_101_C4_3x.yaml │ │ ├── faster_rcnn_R_101_DC5_3x.yaml │ │ ├── faster_rcnn_R_101_FPN_3x.yaml │ │ ├── faster_rcnn_R_50_C4_1x.yaml │ │ ├── faster_rcnn_R_50_C4_3x.yaml │ │ ├── faster_rcnn_R_50_DC5_1x.yaml │ │ ├── faster_rcnn_R_50_DC5_3x.yaml │ │ ├── faster_rcnn_R_50_FPN_1x.yaml │ │ ├── faster_rcnn_R_50_FPN_3x.yaml │ │ ├── faster_rcnn_X_101_32x8d_FPN_3x.yaml │ │ ├── retinanet_R_101_FPN_3x.yaml │ │ ├── retinanet_R_50_FPN_1x.py │ │ ├── retinanet_R_50_FPN_1x.yaml │ │ ├── retinanet_R_50_FPN_3x.yaml │ │ ├── rpn_R_50_C4_1x.yaml │ │ └── rpn_R_50_FPN_1x.yaml │ │ ├── COCO-InstanceSegmentation │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_coco80.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_testb.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_testt.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_zsinf_clipWeights.yaml │ │ ├── customized │ │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml │ │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_PLs_3x.yaml │ │ │ ├── CLIP_fast_rcnn_R_50_C4_ovd_vldet.yaml │ │ │ ├── ovd_coco_2x_PLs_per4k_clsBoxConf.yaml │ │ │ ├── ovd_coco_fCLIP_PLs_clsBoxConf.yaml │ │ │ ├── ovd_coco_fCLIP_offline_PLs.yaml │ │ │ └── ovd_coco_frozen_CLIP_RPN.yaml │ │ ├── mask_rcnn_CLIP_R_50_C4_1x.yaml │ │ ├── mask_rcnn_CLIP_R_50_C4_1x_ovd_FSD.yaml │ │ ├── mask_rcnn_R_101_C4_3x.yaml │ │ ├── mask_rcnn_R_101_DC5_3x.yaml │ │ ├── mask_rcnn_R_101_FPN_3x.yaml │ │ ├── mask_rcnn_R_50_C4_1x.py │ │ ├── mask_rcnn_R_50_C4_1x.yaml │ │ ├── mask_rcnn_R_50_C4_1x_ovd_FSD.yaml │ │ ├── mask_rcnn_R_50_C4_1x_ovd_coco65.yaml │ │ ├── mask_rcnn_R_50_C4_3x.yaml │ │ ├── mask_rcnn_R_50_DC5_1x.yaml │ │ ├── mask_rcnn_R_50_DC5_3x.yaml │ │ ├── mask_rcnn_R_50_FPN_1x.py │ │ ├── mask_rcnn_R_50_FPN_1x.yaml │ │ ├── mask_rcnn_R_50_FPN_1x_giou.yaml │ │ ├── mask_rcnn_R_50_FPN_1x_ovd_FSD.yaml │ │ ├── mask_rcnn_R_50_FPN_1x_ovd_coco65.yaml │ │ ├── mask_rcnn_R_50_FPN_3x.yaml │ │ ├── mask_rcnn_X_101_32x8d_FPN_3x.yaml │ │ ├── mask_rcnn_regnetx_4gf_dds_fpn_1x.py │ │ └── mask_rcnn_regnety_4gf_dds_fpn_1x.py │ │ ├── LVISv0.5-InstanceSegmentation │ │ ├── mask_rcnn_R_101_FPN_1x.yaml │ │ ├── mask_rcnn_R_50_FPN_1x.yaml │ │ └── mask_rcnn_X_101_32x8d_FPN_1x.yaml │ │ ├── LVISv1-InstanceSegmentation │ │ ├── CLIP_fast_rcnn_R_50_C4.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_custom_img.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_zsinf.yaml │ │ ├── CLIP_fast_rcnn_R_50_C4_zsinf_clipWeights.yaml │ │ ├── customized │ │ │ ├── ovd_lvis_box_PLs_periodic_boxConf.yaml │ │ │ ├── ovd_lvis_fCLIP_PLs_clsBoxConf.yaml │ │ │ └── ovd_lvis_frozen_CLIP_RPN.yaml │ │ ├── mask_rcnn_CLIP_R_50_C4_1x.yaml │ │ ├── mask_rcnn_CLIP_R_50_FPN_1x.yaml │ │ ├── mask_rcnn_R_101_FPN_1x.yaml │ │ ├── mask_rcnn_R_50_C4_1x.yaml │ │ ├── mask_rcnn_R_50_FPN_1x.yaml │ │ ├── mask_rcnn_R_50_FPN_2x.yaml │ │ └── mask_rcnn_X_101_32x8d_FPN_1x.yaml │ │ ├── Misc │ │ ├── cascade_mask_rcnn_R_50_FPN_1x.yaml │ │ ├── cascade_mask_rcnn_R_50_FPN_3x.yaml │ │ 
├── cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml │ │ ├── mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml │ │ ├── mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml │ │ ├── mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml │ │ ├── mask_rcnn_R_50_FPN_3x_gn.yaml │ │ ├── mask_rcnn_R_50_FPN_3x_syncbn.yaml │ │ ├── mmdet_mask_rcnn_R_50_FPN_1x.py │ │ ├── panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml │ │ ├── scratch_mask_rcnn_R_50_FPN_3x_gn.yaml │ │ ├── scratch_mask_rcnn_R_50_FPN_9x_gn.yaml │ │ ├── scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml │ │ ├── semantic_R_50_FPN_1x.yaml │ │ └── torchvision_imagenet_R_50.py │ │ ├── common │ │ ├── README.md │ │ ├── coco_schedule.py │ │ ├── data │ │ │ ├── coco.py │ │ │ ├── coco_keypoint.py │ │ │ └── coco_panoptic_separated.py │ │ ├── models │ │ │ ├── cascade_rcnn.py │ │ │ ├── keypoint_rcnn_fpn.py │ │ │ ├── mask_rcnn_c4.py │ │ │ ├── mask_rcnn_fpn.py │ │ │ ├── panoptic_fpn.py │ │ │ └── retinanet.py │ │ ├── optim.py │ │ └── train.py │ │ └── pretrain │ │ ├── RegionCLIP_RN50.yaml │ │ ├── RegionCLIP_RN50_onlinePL.yaml │ │ ├── RegionCLIP_RN50_onlinePL_box_weak.yaml │ │ ├── RegionCLIP_RN50_onlinePL_box_weak_cc3m.yaml │ │ ├── RegionCLIP_RN50_onlinePL_box_weak_locNarr.yaml │ │ ├── RegionCLIP_RN50x4.yaml │ │ └── RegionCLIP_RN50x4_onlinePL_boxWeak.yaml ├── data │ ├── __init__.py │ ├── coco_zeroshot_categories.py │ ├── lvis.py │ ├── lvis_v0_5_categories.py │ ├── lvis_v1_categories.py │ └── ovd_register.py ├── evaluation │ ├── __init__.py │ ├── cityscapes_evaluation.py │ ├── coco_evaluation.py │ ├── evaluator.py │ ├── fast_eval_api.py │ ├── lvis_evaluation.py │ ├── panoptic_evaluation.py │ ├── pascal_voc_evaluation.py │ ├── rotated_coco_evaluation.py │ ├── sem_seg_evaluation.py │ └── testing.py └── modeling │ ├── __init__.py │ ├── backbone │ ├── __init__.py │ ├── batch_norm.py │ └── clip_backbone.py │ ├── ensemble_fast_rcnn.py │ ├── ensemble_roi_heads.py │ ├── meta_arch │ └── clip_rcnn.py │ └── roi_heads │ ├── __init__.py │ ├── clip_fast_rcnn.py │ ├── clip_roi_heads.py │ └── soft_nms.py ├── test_net.py └── tools ├── gen_cat_text_emb.py ├── offline_eval_onLVIS.py └── offline_eval_onO365.py /.gitignore: -------------------------------------------------------------------------------- 1 | # macOS stuff 2 | *.DS_Store 3 | 4 | # Python caches 5 | **/__pycache__ 6 | 7 | # Ignore all output directories and experiment scripts in the individual projects 8 | /output/ 9 | /datasets 10 | /pretrained_ckpt/ 11 | /pretrained_ckpt 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2023 NEC Laboratories America, Inc. ("NECLA") 2 | 3 | 4 | 5 | This software and any and all related files/code/information is provided by 6 | NECLA to for non-commercial evaluation or research purposes subject to terms in a License agreement the Recipient has agreed to by Recipient’s signature. 7 | 8 | 9 | 10 | The license restriction includes, among other limitations, the Recipient to only evaluate this software and redistribute information related to this software only in the form of technical publications/papers, with no rights to assign a license to third parties or redistribute the software to others. 11 | 12 | 13 | 14 | 15 | IN NO EVENT SHALL NEC BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, 16 | SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 17 | USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF NEC HAS BEEN 18 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
19 | 20 | 21 | 22 | NEC SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND NEC HAS NO OBLIGATION TO PROVIDE MAINTENANCE, 23 | SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 24 | 25 | 26 | 27 | THE LICENSE FROM NEC FOR THE SOFTWARE REQUIRES THAT LICENSEE 28 | COMPLY WITH ANY AND ALL UNDERLYING COPYRIGHTS AND LICENSE RIGHTS 29 | IN THE SOFTWARE BY THIRD PARTIES. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Taming Self-Training for Open-Vocabulary Object Detection 2 | 3 | Official implementation of online self-training and a split-and-fusion (SAF) head for Open-Vocabulary Object Detection (OVD), SAS-Det for short. 4 | This project was previously named Improving Pseudo Labels for Open-Vocabulary Object Detection. 5 | 6 | [arXiv](https://arxiv.org/abs/2308.06412) 7 | 8 | 9 | ## Installation 10 | - Our project is developed on Detectron2. Please follow the official installation [instructions](https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md), or use the instructions below. 11 | ``` 12 | # create new environment 13 | conda create -n sas_det python=3.8 14 | conda activate sas_det 15 | 16 | # install pytorch 17 | conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch 18 | 19 | # install Detectron2 from a local clone 20 | git clone https://github.com/facebookresearch/detectron2.git 21 | python -m pip install -e detectron2 22 | ``` 23 | 24 | - Install CLIP 25 | ``` 26 | # install CLIP 27 | pip install scipy 28 | pip install ftfy regex tqdm 29 | pip install git+https://github.com/openai/CLIP.git 30 | ``` 31 | 32 | 33 | ## Datasets 34 | 35 | - Please follow RegionCLIP's [dataset instructions](https://github.com/microsoft/RegionCLIP/blob/main/datasets/README.md) to prepare the COCO and LVIS datasets. 36 | 37 | - Download the [metadata](https://drive.google.com/drive/u/1/folders/1R72q0Wg26-PQGqbaK3P3pT2vmGm9uzKU) for the datasets and put it in the folder `datasets` (i.e., `$DETECTRON2_DATASETS` used in the previous step); it is used in our evaluation and training. 38 | 39 | 40 | ## Download pretrained weights 41 | - Download [RegionCLIP's pretrained weights](https://drive.google.com/drive/folders/1hzrJBvcCrahoRcqJRqzkIGFO_HUSJIii). Check [here](https://github.com/microsoft/RegionCLIP/blob/main/docs/MODEL_ZOO.md#model-downloading) for more details. 42 | Create a new folder `pretrained_ckpt` to hold those weights. In this repository, `regionclip`, `concept_emb` and `rpn` will be used. 43 | 44 | - Download [our pretrained weights](https://drive.google.com/drive/u/1/folders/1TAr7nZSvpB6nCZCC6nXBw6xgmMmlL0X9) and put them in the corresponding folders in `pretrained_ckpt` (the expected layout is sketched in the evaluation section below). 45 | Our pretrained weights include: 46 | - `r50_3x_pre_RegCLIP_cocoRPN_2`: RPN weights pretrained only with COCO Base categories. This is used for experiments on COCO to avoid potential data leakage. 47 | - `concept_emb`: Complementary to RegionCLIP's `concept_emb`. 48 | 49 | ## Evaluation with released weights 50 | 51 | ### Results on COCO-OVD 52 | 53 |
| Configs | Novel AP | Base AP | Overall AP |
| --- | --- | --- | --- |
| w/o SAF head | 31.4 | 55.7 | 49.4 |
| with SAF head | 37.4 | 58.5 | 53.0 |
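All commands in this section read the released weights and category metadata from fixed relative paths under `pretrained_ckpt` and `datasets`. As a quick sanity check before running them, you can verify that the referenced files are in place. The snippet below is only illustrative: the file names are copied from the evaluation commands in this README (the LVIS text embeddings follow the same pattern), so adjust it to whatever you actually downloaded.

```python
# Sanity check (illustrative): paths referenced by the evaluation commands below.
from pathlib import Path

expected = [
    "pretrained_ckpt/sas_det/sas_det_coco_no_saf_head_baseline.pth",
    "pretrained_ckpt/sas_det/sas_det_coco.pth",
    "pretrained_ckpt/sas_det/sas_det_lvis_r50.pth",
    "pretrained_ckpt/sas_det/sas_det_lvis_r50x4.pth",
    "pretrained_ckpt/rpn/rpn_coco_48.pth",
    "pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth",
    "pretrained_ckpt/concept_emb/coco_48_base_cls_emb.pth",
    "pretrained_ckpt/concept_emb/coco_65_cls_emb.pth",
    "pretrained_ckpt/concept_emb/my_coco_48_base_17_cls_emb.pth",
    "datasets/coco_ovd_continue_cat_ids.json",
    "datasets/lvis_ovd_continue_cat_ids.json",
]
missing = [p for p in expected if not Path(p).exists()]
print("missing files:", missing or "none")
```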
75 | 76 | Evaluation without the SAF Head (baseline in the paper), 77 | 78 | 79 | ```bash 80 | python3 ./test_net.py \ 81 | --num-gpus 8 \ 82 | --eval-only \ 83 | --config-file ./sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml \ 84 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_coco_no_saf_head_baseline.pth \ 85 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \ 86 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \ 87 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \ 88 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \ 89 | MODEL.ROI_HEADS.SOFT_NMS_ENABLED True \ 90 | OUTPUT_DIR output/eval 91 | ``` 92 |
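Both COCO commands set `MODEL.ROI_HEADS.SOFT_NMS_ENABLED True`; the implementation used by this repo lives in `sas_det/modeling/roi_heads/soft_nms.py`. Purely for intuition, the sketch below shows a minimal Gaussian soft-NMS (Bodla et al., 2017), where overlapping boxes have their scores decayed rather than being removed outright. It is an illustration, not the code used here.

```python
import torch
from torchvision.ops import box_iou

def soft_nms(boxes, scores, sigma=0.5, score_thresh=0.001):
    """Minimal Gaussian soft-NMS sketch: decay, rather than drop, overlapping boxes."""
    scores = scores.clone()
    keep = []
    idxs = torch.arange(len(scores))
    while idxs.numel() > 0:
        top = torch.argmax(scores[idxs])            # position of the best remaining box
        best = idxs[top]
        keep.append(int(best))
        idxs = torch.cat([idxs[:top], idxs[top + 1:]])
        if idxs.numel() == 0:
            break
        ious = box_iou(boxes[best].unsqueeze(0), boxes[idxs]).squeeze(0)
        scores[idxs] *= torch.exp(-(ious ** 2) / sigma)   # decay scores of overlapping boxes
        idxs = idxs[scores[idxs] > score_thresh]          # prune near-zero scores
    return keep
```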
93 | 94 |
95 | 96 | Evaluation with the SAF Head, 97 | 98 | 99 | ```bash 100 | python3 ./test_net.py \ 101 | --num-gpus 8 \ 102 | --eval-only \ 103 | --config-file ./sas_det/configs/ovd_coco_R50_C4_ensemble_PLs.yaml \ 104 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_coco.pth \ 105 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \ 106 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \ 107 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb.pth \ 108 | MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/my_coco_48_base_17_cls_emb.pth \ 109 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \ 110 | MODEL.ROI_HEADS.SOFT_NMS_ENABLED True \ 111 | MODEL.ENSEMBLE.TEST_CATEGORY_INFO "./datasets/coco_ovd_continue_cat_ids.json" \ 112 | MODEL.ENSEMBLE.ALPHA 0.3 MODEL.ENSEMBLE.BETA 0.7 \ 113 | OUTPUT_DIR output/eval 114 | ``` 115 |
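In the command above, `MODEL.ENSEMBLE.TEST_CATEGORY_INFO` points to a json describing which category ids are base vs. novel, and `MODEL.ENSEMBLE.ALPHA`/`MODEL.ENSEMBLE.BETA` weight how the SAF head fuses its closed-branch and open-branch scores at test time. The sketch below shows a typical geometric-mean fusion of this kind purely as an illustration; the variable names are placeholders and the exact formula may differ from the implementation in `sas_det/modeling/ensemble_fast_rcnn.py`.

```python
import torch

def ensemble_scores(closed_scores, open_scores, base_ids, novel_ids, alpha=0.3, beta=0.7):
    """Illustrative geometric-mean fusion of closed-branch and open-branch scores, shape (N, C)."""
    fused = closed_scores.clone()
    # base categories: rely mostly on the closed branch trained with box supervision
    fused[:, base_ids] = closed_scores[:, base_ids] ** (1 - alpha) * open_scores[:, base_ids] ** alpha
    # novel categories: rely mostly on the open (text-embedding) branch
    fused[:, novel_ids] = closed_scores[:, novel_ids] ** (1 - beta) * open_scores[:, novel_ids] ** beta
    return fused
```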
116 | 117 | 118 | ### Results on LVIS-OVD 119 | 120 |
| Configs | APr | APc | APf | AP |
| --- | --- | --- | --- | --- |
| RN50-C4 as backbone | 20.1 | 27.1 | 32.9 | 28.1 |
| RN50x4-C4 as backbone | 29.0 | 32.3 | 36.8 | 33.5 |
145 | 146 | Evaluation with RN50-C4 as the backbone, 147 | 148 | 149 | ```bash 150 | python3 ./test_net.py \ 151 | --num-gpus 8 \ 152 | --eval-only \ 153 | --config-file ./sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml \ 154 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_lvis_r50.pth \ 155 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \ 156 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \ 157 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_866_base_cls_emb.pth \ 158 | MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/my_lvis_866_base_337_cls_emb.pth \ 159 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \ 160 | MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \ 161 | MODEL.ENSEMBLE.TEST_CATEGORY_INFO "./datasets/lvis_ovd_continue_cat_ids.json" \ 162 | MODEL.ENSEMBLE.ALPHA 0.33 MODEL.ENSEMBLE.BETA 0.67 \ 163 | OUTPUT_DIR output/eval 164 | ``` 165 |
166 | 167 |
168 | 169 | Evaluation with RN50x4-C4 as the backbone, 170 | 171 | 172 | ```bash 173 | python3 ./test_net.py \ 174 | --num-gpus 8 \ 175 | --eval-only \ 176 | --config-file ./sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml \ 177 | MODEL.WEIGHTS ./pretrained_ckpt/sas_det/sas_det_lvis_r50x4.pth \ 178 | MODEL.CLIP.OFFLINE_RPN_CONFIG ./sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \ 179 | MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \ 180 | MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_866_base_cls_emb_rn50x4.pth \ 181 | MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/my_lvis_866_base_337_cls_emb_rn50x4.pth \ 182 | MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \ 183 | MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \ 184 | MODEL.CLIP.TEXT_EMB_DIM 640 \ 185 | MODEL.RESNETS.DEPTH 200 \ 186 | MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \ 187 | MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION 18 \ 188 | MODEL.ENSEMBLE.TEST_CATEGORY_INFO "./datasets/lvis_ovd_continue_cat_ids.json" \ 189 | MODEL.ENSEMBLE.ALPHA 0.33 MODEL.ENSEMBLE.BETA 0.67 \ 190 | OUTPUT_DIR output/eval 191 | ``` 192 |
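Note that the RN50x4 run reuses the same config file as the RN50 run and only overrides a few keys: CLIP RN50x4 produces 640-d embeddings (RN50: 1024-d), and the larger backbone uses a different depth and RoI pooler resolution in this codebase. If you prefer setting these in Python rather than on the command line, a sketch is below; it assumes `add_sas_det_config` (exported by `sas_det/__init__.py`) registers the extra `MODEL.CLIP`/`MODEL.OVD`/`MODEL.ENSEMBLE` keys these configs rely on, and `./test_net.py` remains the actual entry point.

```python
# Sketch: the RN50x4 overrides applied via detectron2's config API instead of CLI flags.
from detectron2.config import get_cfg
from sas_det import add_sas_det_config  # assumed to register the MODEL.CLIP / MODEL.OVD / MODEL.ENSEMBLE keys

cfg = get_cfg()
add_sas_det_config(cfg)
cfg.merge_from_file("./sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml")
cfg.merge_from_list([
    "MODEL.CLIP.TEXT_EMB_DIM", 640,               # CLIP RN50x4 embeddings are 640-d (RN50: 1024-d)
    "MODEL.RESNETS.DEPTH", 200,                   # RN50x4 depth used by this codebase
    "MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION", 18,   # larger backbone -> larger RoI pooler resolution
    "MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION", 18,
])
```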
193 | 194 | 195 | 196 | ## Acknowledgement 197 | 198 | This repository was built on top of [Detectron2](https://github.com/facebookresearch/detectron2), [RegionCLIP](https://github.com/microsoft/RegionCLIP), and [VLDet](https://github.com/clin1223/VLDet). We thank the effort from our community. 199 | -------------------------------------------------------------------------------- /datasets/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaofeng94/SAS-Det/fd6ea7dc2ba07d3a836b5e65bdd0fd57bd60cb9c/datasets/.gitkeep -------------------------------------------------------------------------------- /pretrained_ckpt/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaofeng94/SAS-Det/fd6ea7dc2ba07d3a836b5e65bdd0fd57bd60cb9c/pretrained_ckpt/.gitkeep -------------------------------------------------------------------------------- /sas_det/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) NEC Laboratories America, Inc. 2 | from .modeling import ensemble_roi_heads as _ 3 | from .config import add_sas_det_config 4 | from .data import * 5 | 6 | -------------------------------------------------------------------------------- /sas_det/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # File: 4 | 5 | 6 | from . import catalog as _UNUSED # register the handler 7 | from .detection_checkpoint import DetectionCheckpointer 8 | from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer 9 | 10 | __all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"] 11 | -------------------------------------------------------------------------------- /sas_det/checkpoint/catalog.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | 4 | from detectron2.utils.file_io import PathHandler, PathManager 5 | 6 | 7 | class ModelCatalog(object): 8 | """ 9 | Store mappings from names to third-party models. 10 | """ 11 | 12 | S3_C2_DETECTRON_PREFIX = "https://dl.fbaipublicfiles.com/detectron" 13 | 14 | # MSRA models have STRIDE_IN_1X1=True. False otherwise. 15 | # NOTE: all BN models here have fused BN into an affine layer. 16 | # As a result, you should only load them to a model with "FrozenBN". 17 | # Loading them to a model with regular BN or SyncBN is wrong. 18 | # Even when loaded to FrozenBN, it is still different from affine by an epsilon, 19 | # which should be negligible for training. 20 | # NOTE: all models here uses PIXEL_STD=[1,1,1] 21 | # NOTE: Most of the BN models here are no longer used. We use the 22 | # re-converted pre-trained models under detectron2 model zoo instead. 
23 | C2_IMAGENET_MODELS = { 24 | "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl", 25 | "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl", 26 | "FAIR/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl", 27 | "FAIR/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl", 28 | "FAIR/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl", 29 | "FAIR/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl", 30 | "FAIR/X-152-32x8d-IN5k": "ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl", 31 | } 32 | 33 | C2_DETECTRON_PATH_FORMAT = ( 34 | "{prefix}/{url}/output/train/{dataset}/{type}/model_final.pkl" # noqa B950 35 | ) 36 | 37 | C2_DATASET_COCO = "coco_2014_train%3Acoco_2014_valminusminival" 38 | C2_DATASET_COCO_KEYPOINTS = "keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival" 39 | 40 | # format: {model_name} -> part of the url 41 | C2_DETECTRON_MODELS = { 42 | "35857197/e2e_faster_rcnn_R-50-C4_1x": "35857197/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml.01_33_49.iAX0mXvW", # noqa B950 43 | "35857345/e2e_faster_rcnn_R-50-FPN_1x": "35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I", # noqa B950 44 | "35857890/e2e_faster_rcnn_R-101-FPN_1x": "35857890/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml.01_38_50.sNxI7sX7", # noqa B950 45 | "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "36761737/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml.06_31_39.5MIHi1fZ", # noqa B950 46 | "35858791/e2e_mask_rcnn_R-50-C4_1x": "35858791/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml.01_45_57.ZgkA7hPB", # noqa B950 47 | "35858933/e2e_mask_rcnn_R-50-FPN_1x": "35858933/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml.01_48_14.DzEQe4wC", # noqa B950 48 | "35861795/e2e_mask_rcnn_R-101-FPN_1x": "35861795/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT", # noqa B950 49 | "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "36761843/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml.06_35_59.RZotkLKI", # noqa B950 50 | "48616381/e2e_mask_rcnn_R-50-FPN_2x_gn": "GN/48616381/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn_0416.13_23_38.bTlTI97Q", # noqa B950 51 | "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "37697547/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml.08_42_54.kdzV35ao", # noqa B950 52 | "35998355/rpn_R-50-C4_1x": "35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L", # noqa B950 53 | "35998814/rpn_R-50-FPN_1x": "35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179", # noqa B950 54 | "36225147/fast_R-50-FPN_1x": "36225147/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml.08_39_09.L3obSdQ2", # noqa B950 55 | } 56 | 57 | @staticmethod 58 | def get(name): 59 | if name.startswith("Caffe2Detectron/COCO"): 60 | return ModelCatalog._get_c2_detectron_baseline(name) 61 | if name.startswith("ImageNetPretrained/"): 62 | return ModelCatalog._get_c2_imagenet_pretrained(name) 63 | raise RuntimeError("model not present in the catalog: {}".format(name)) 64 | 65 | @staticmethod 66 | def _get_c2_imagenet_pretrained(name): 67 | prefix = ModelCatalog.S3_C2_DETECTRON_PREFIX 68 | name = name[len("ImageNetPretrained/") :] 69 | name = ModelCatalog.C2_IMAGENET_MODELS[name] 70 | url = "/".join([prefix, name]) 71 | return url 72 | 73 | @staticmethod 74 | def _get_c2_detectron_baseline(name): 75 | name = name[len("Caffe2Detectron/COCO/") :] 76 | url = ModelCatalog.C2_DETECTRON_MODELS[name] 77 | if "keypoint_rcnn" in name: 78 | dataset = ModelCatalog.C2_DATASET_COCO_KEYPOINTS 79 | else: 80 | dataset = ModelCatalog.C2_DATASET_COCO 81 | 
82 | if "35998355/rpn_R-50-C4_1x" in name: 83 | # this one model is somehow different from others .. 84 | type = "rpn" 85 | else: 86 | type = "generalized_rcnn" 87 | 88 | # Detectron C2 models are stored in the structure defined in `C2_DETECTRON_PATH_FORMAT`. 89 | url = ModelCatalog.C2_DETECTRON_PATH_FORMAT.format( 90 | prefix=ModelCatalog.S3_C2_DETECTRON_PREFIX, url=url, type=type, dataset=dataset 91 | ) 92 | return url 93 | 94 | 95 | class ModelCatalogHandler(PathHandler): 96 | """ 97 | Resolve URL like catalog://. 98 | """ 99 | 100 | PREFIX = "catalog://" 101 | 102 | def _get_supported_prefixes(self): 103 | return [self.PREFIX] 104 | 105 | def _get_local_path(self, path, **kwargs): 106 | logger = logging.getLogger(__name__) 107 | catalog_path = ModelCatalog.get(path[len(self.PREFIX) :]) 108 | logger.info("Catalog entry {} points to {}".format(path, catalog_path)) 109 | return PathManager.get_local_path(catalog_path, **kwargs) 110 | 111 | def _open(self, path, mode="r", **kwargs): 112 | return PathManager.open(self._get_local_path(path), mode, **kwargs) 113 | 114 | 115 | PathManager.register_handler(ModelCatalogHandler()) 116 | -------------------------------------------------------------------------------- /sas_det/checkpoint/detection_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | import os 4 | import pickle 5 | import torch 6 | from fvcore.common.checkpoint import Checkpointer 7 | from torch.nn.parallel import DistributedDataParallel 8 | 9 | import detectron2.utils.comm as comm 10 | from detectron2.utils.env import TORCH_VERSION 11 | from detectron2.utils.file_io import PathManager 12 | 13 | from .c2_model_loading import align_and_update_state_dicts 14 | from .clip_model_loading import align_and_update_state_dicts_for_CLIP 15 | 16 | class DetectionCheckpointer(Checkpointer): 17 | """ 18 | Same as :class:`Checkpointer`, but is able to: 19 | 1. handle models in detectron & detectron2 model zoo, and apply conversions for legacy models. 20 | 2. correctly load checkpoints that are only available on the master worker 21 | """ 22 | 23 | def __init__(self, model, save_dir="", *, save_to_disk=None, bb_rpn_weights=False, **checkpointables): 24 | is_main_process = comm.is_main_process() 25 | super().__init__( 26 | model, 27 | save_dir, 28 | save_to_disk=is_main_process if save_to_disk is None else save_to_disk, 29 | **checkpointables, 30 | ) 31 | self.path_manager = PathManager 32 | self.bb_rpn_weights = bb_rpn_weights 33 | 34 | def load(self, path, *args, **kwargs): 35 | need_sync = False 36 | 37 | if path and isinstance(self.model, DistributedDataParallel): 38 | logger = logging.getLogger(__name__) 39 | path = self.path_manager.get_local_path(path) 40 | has_file = os.path.isfile(path) 41 | all_has_file = comm.all_gather(has_file) 42 | if not all_has_file[0]: 43 | raise OSError(f"File {path} not found on main worker.") 44 | if not all(all_has_file): 45 | logger.warning( 46 | f"Not all workers can read checkpoint {path}. " 47 | "Training may fail to fully resume." 48 | ) 49 | # TODO: broadcast the checkpoint file contents from main 50 | # worker, and load from it instead. 
51 | need_sync = True 52 | if not has_file: 53 | path = None # don't load if not readable 54 | ret = super().load(path, *args, **kwargs) 55 | 56 | if need_sync: 57 | logger.info("Broadcasting model states from main worker ...") 58 | if TORCH_VERSION >= (1, 7): 59 | self.model._sync_params_and_buffers() 60 | return ret 61 | 62 | def _load_file(self, filename): 63 | if filename.endswith(".pkl"): 64 | with PathManager.open(filename, "rb") as f: 65 | data = pickle.load(f, encoding="latin1") 66 | if "model" in data and "__author__" in data: 67 | # file is in Detectron2 model zoo format 68 | self.logger.info("Reading a file from '{}'".format(data["__author__"])) 69 | return data 70 | else: 71 | # assume file is from Caffe2 / Detectron1 model zoo 72 | if "blobs" in data: 73 | # Detection models have "blobs", but ImageNet models don't 74 | data = data["blobs"] 75 | data = {k: v for k, v in data.items() if not k.endswith("_momentum")} 76 | return {"model": data, "__author__": "Caffe2", "matching_heuristics": True} 77 | elif filename.endswith(".pyth"): 78 | # assume file is from pycls; no one else seems to use the ".pyth" extension 79 | with PathManager.open(filename, "rb") as f: 80 | data = torch.load(f) 81 | assert ( 82 | "model_state" in data 83 | ), f"Cannot load .pyth file {filename}; pycls checkpoints must contain 'model_state'." 84 | model_state = { 85 | k: v 86 | for k, v in data["model_state"].items() 87 | if not k.endswith("num_batches_tracked") 88 | } 89 | return {"model": model_state, "__author__": "pycls", "matching_heuristics": True} 90 | elif "OAI_CLIP" in filename: 91 | # assume file is from OpenAI CLIP pre-trained model 92 | loaded = super()._load_file(filename) # load native pth checkpoint 93 | if "model" not in loaded: 94 | loaded = {"model": loaded} 95 | return {"model": loaded["model"], "__author__": "OAI_CLIP", "matching_heuristics": True} 96 | 97 | loaded = super()._load_file(filename) # load native pth checkpoint 98 | if "model" not in loaded: 99 | loaded = {"model": loaded} 100 | return loaded 101 | 102 | def _load_model(self, checkpoint): 103 | if checkpoint.get("matching_heuristics", False) or self.bb_rpn_weights: 104 | self._convert_ndarray_to_tensor(checkpoint["model"]) 105 | # convert weights by name-matching heuristics 106 | if checkpoint.get("__author__", "NA") == "OAI_CLIP" or self.bb_rpn_weights: # for OAI_CLIP or 2nd ckpt (offline modules) 107 | checkpoint["model"] = align_and_update_state_dicts_for_CLIP( 108 | self.model.state_dict(), 109 | checkpoint["model"], 110 | bb_rpn_weights=self.bb_rpn_weights, 111 | ) 112 | else: # default loading 113 | checkpoint["model"] = align_and_update_state_dicts( 114 | self.model.state_dict(), 115 | checkpoint["model"], 116 | c2_conversion=checkpoint.get("__author__", None) == "Caffe2", 117 | ) 118 | # for non-caffe2 models, use standard ways to load it 119 | incompatible = super()._load_model(checkpoint) 120 | del checkpoint # try saving memory 121 | 122 | model_buffers = dict(self.model.named_buffers(recurse=False)) 123 | for k in ["pixel_mean", "pixel_std"]: 124 | # Ignore missing key message about pixel_mean/std. 125 | # Though they may be missing in old checkpoints, they will be correctly 126 | # initialized from config anyway. 
127 | if k in model_buffers: 128 | try: 129 | incompatible.missing_keys.remove(k) 130 | except ValueError: 131 | pass 132 | return incompatible -------------------------------------------------------------------------------- /sas_det/configs/ovd_coco_R50_C4_ensemble.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./regionclip/Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" 8 | MASK_ON: False 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | # RPN: # not used 16 | # HEAD_NAME: StandardRPNHead 17 | # IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "EnsembleCLIPRes5ROIHeads" # 20 | IN_FEATURES: ["res4"] 21 | NUM_CLASSES: 48 # base categories only 22 | SCORE_THRESH_TEST: 0.001 23 | ROI_BOX_HEAD: 24 | NAME: "FastRCNNConvFCHead" # for text head 25 | NUM_FC: 2 26 | POOLER_RESOLUTION: 14 27 | CLS_AGNOSTIC_BBOX_REG: True 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 0 31 | POOLER_RESOLUTION: 14 32 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 33 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 34 | CLIP: 35 | CROP_REGION_TYPE: "RPN" 36 | USE_TEXT_EMB_CLASSIFIER: True 37 | CLSS_TEMP: 0.01 38 | NO_BOX_DELTA: False 39 | BG_CLS_LOSS_WEIGHT: 0.2 40 | FOCAL_SCALED_LOSS: 0.5 41 | INPUT: 42 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 43 | DATASETS: 44 | TRAIN: ("coco_2017_ovd_b_train",) 45 | TEST: ("coco_2017_ovd_all_test",) 46 | TEST: 47 | EVAL_PERIOD: 5000 48 | SOLVER: 49 | IMS_PER_BATCH: 16 50 | BASE_LR: 0.002 51 | STEPS: (60000, 80000) 52 | MAX_ITER: 90000 53 | WARMUP_ITERS: 5000 54 | CHECKPOINT_PERIOD: 10000 55 | INPUT: 56 | MIN_SIZE_TRAIN_SAMPLING: choice 57 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 58 | MAX_SIZE_TRAIN: 1333 59 | MIN_SIZE_TEST: 800 60 | MAX_SIZE_TEST: 1333 61 | FORMAT: "RGB" -------------------------------------------------------------------------------- /sas_det/configs/ovd_coco_R50_C4_ensemble_PLs.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./ovd_coco_R50_C4_ensemble.yaml" 2 | MODEL: 3 | ROI_BOX_HEAD: 4 | NAME: "CLIP_BOX_HEAD" # close-branch head 5 | OVD: 6 | WITH_PSEUDO_LABELS: True 7 | # 8 | USE_ADAPTIVE_THRES: True 9 | PL_THRESHOLD: 0.85 10 | PL_NMS_THRES: 0.5 11 | RPN_FUSION_METHOD: "avg_norm_scores" 12 | CATEGORY_INFO: None 13 | # periodic update 14 | USE_PERIODIC_UPDATE: True 15 | # box reg, cls loss 16 | BOX_CONFIDENCE_THRES: 1.0 17 | USE_CONFIDENCE_WEIGHT: True -------------------------------------------------------------------------------- /sas_det/configs/ovd_lvis_R50_C4_SAS_Det_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./ovd_lvis_R50_C4_ensemble_PLs.yaml" 2 | DATASETS: 3 | TRAIN: ('lvis_v1_train_SASDet_r50x4_PLs', 'lvis_v1_o365_SASDet_r50x4_PLs',) 4 | SOLVER: 5 | CHECKPOINT_PERIOD: 20000 6 | STEPS: (210000, 250000) 7 | MAX_ITER: 270000 8 | TEST: 9 | EVAL_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/ovd_lvis_R50_C4_ensemble_PLs.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NAME: "EnsembleCLIPRes5ROIHeads" # 5 | ROI_BOX_HEAD: 6 | NAME: "CLIP_BOX_HEAD" # 
close-branch head 7 | # NUM_FC: 2 8 | POOLER_RESOLUTION: 14 9 | CLS_AGNOSTIC_BBOX_REG: True 10 | OVD: 11 | WITH_PSEUDO_LABELS: True 12 | # 13 | USE_ADAPTIVE_THRES: True 14 | PL_NMS_THRES: 0.5 15 | PL_THRESHOLD: 0.925 16 | MIN_AVG_PLS: 2.0 17 | MAX_AVG_PLS: 4.0 18 | ADAPTIVE_THRES_DELTA: 0.005 19 | RPN_FUSION_METHOD: "avg_logits" 20 | CATEGORY_INFO: None # if None, assume novel cat ids >= len(base_categories) 21 | # periodic update 22 | USE_PERIODIC_UPDATE: True 23 | PERIODIC_STEPS: (120000, 160000) 24 | # box reg, cls loss 25 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 26 | USE_CONFIDENCE_WEIGHT: False # False for LVIS 27 | ENSEMBLE: 28 | ALPHA: 0.33 29 | BETA: 0.67 30 | # TEST_CATEGORY_INFO: "datasets/lvis_ovd_continue_cat_ids.json" 31 | SOLVER: 32 | CHECKPOINT_PERIOD: 20000 33 | TEST: 34 | EVAL_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Base-RCNN-C4.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | ROI_HEADS: 7 | NAME: "Res5ROIHeads" 8 | DATASETS: 9 | TRAIN: ("coco_2017_train",) 10 | TEST: ("coco_2017_val",) 11 | SOLVER: 12 | IMS_PER_BATCH: 16 13 | BASE_LR: 0.02 14 | STEPS: (60000, 80000) 15 | MAX_ITER: 90000 16 | INPUT: 17 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 18 | VERSION: 2 19 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Base-RCNN-DilatedC5.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RESNETS: 4 | OUT_FEATURES: ["res5"] 5 | RES5_DILATION: 2 6 | RPN: 7 | IN_FEATURES: ["res5"] 8 | PRE_NMS_TOPK_TEST: 6000 9 | POST_NMS_TOPK_TEST: 1000 10 | ROI_HEADS: 11 | NAME: "StandardROIHeads" 12 | IN_FEATURES: ["res5"] 13 | ROI_BOX_HEAD: 14 | NAME: "FastRCNNConvFCHead" 15 | NUM_FC: 2 16 | POOLER_RESOLUTION: 7 17 | ROI_MASK_HEAD: 18 | NAME: "MaskRCNNConvUpsampleHead" 19 | NUM_CONV: 4 20 | POOLER_RESOLUTION: 14 21 | DATASETS: 22 | TRAIN: ("coco_2017_train",) 23 | TEST: ("coco_2017_val",) 24 | SOLVER: 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.02 27 | STEPS: (60000, 80000) 28 | MAX_ITER: 90000 29 | INPUT: 30 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 31 | VERSION: 2 32 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Base-RCNN-FPN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | BACKBONE: 4 | NAME: "build_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 7 | FPN: 8 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 9 | ANCHOR_GENERATOR: 10 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 11 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 12 | RPN: 13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 14 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 15 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 16 | # Detectron1 uses 2000 proposals per-batch, 17 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 18 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 
19 | POST_NMS_TOPK_TRAIN: 1000 20 | POST_NMS_TOPK_TEST: 1000 21 | ROI_HEADS: 22 | NAME: "StandardROIHeads" 23 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 24 | ROI_BOX_HEAD: 25 | NAME: "FastRCNNConvFCHead" 26 | NUM_FC: 2 27 | POOLER_RESOLUTION: 7 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 4 31 | POOLER_RESOLUTION: 14 32 | DATASETS: 33 | TRAIN: ("coco_2017_train",) 34 | TEST: ("coco_2017_val",) 35 | SOLVER: 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 38 | STEPS: (60000, 80000) 39 | MAX_ITER: 90000 40 | INPUT: 41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 42 | VERSION: 2 43 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Base-RetinaNet.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "RetinaNet" 3 | BACKBONE: 4 | NAME: "build_retinanet_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res3", "res4", "res5"] 7 | ANCHOR_GENERATOR: 8 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"] 9 | FPN: 10 | IN_FEATURES: ["res3", "res4", "res5"] 11 | RETINANET: 12 | IOU_THRESHOLDS: [0.4, 0.5] 13 | IOU_LABELS: [0, -1, 1] 14 | SMOOTH_L1_LOSS_BETA: 0.0 15 | DATASETS: 16 | TRAIN: ("coco_2017_train",) 17 | TEST: ("coco_2017_val",) 18 | SOLVER: 19 | IMS_PER_BATCH: 16 20 | BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate 21 | STEPS: (60000, 80000) 22 | MAX_ITER: 90000 23 | INPUT: 24 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 25 | VERSION: 2 26 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | LOAD_PROPOSALS: True 6 | RESNETS: 7 | DEPTH: 50 8 | PROPOSAL_GENERATOR: 9 | NAME: "PrecomputedProposals" 10 | DATASETS: 11 | TRAIN: ("coco_2017_train",) 12 | PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", ) 13 | TEST: ("coco_2017_val",) 14 | PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", ) 15 | DATALOADER: 16 | # proposals are part of the dataset_dicts, and take a lot of RAM 17 | NUM_WORKERS: 2 18 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- 
/sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 5 | PIXEL_STD: [57.375, 57.120, 58.395] 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | SOLVER: 12 | STEPS: (210000, 250000) 13 | MAX_ITER: 270000 14 | 
-------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/retinanet_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RetinaNet.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | SOLVER: 7 | STEPS: (210000, 250000) 8 | MAX_ITER: 270000 9 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/retinanet_R_50_FPN_1x.py: -------------------------------------------------------------------------------- 1 | from ..common.optim import SGD as optimizer 2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 3 | from ..common.data.coco import dataloader 4 | from ..common.models.retinanet import model 5 | from ..common.train import train 6 | 7 | dataloader.train.mapper.use_instance_mask = False 8 | model.backbone.bottom_up.freeze_at = 2 9 | optimizer.lr = 0.01 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/retinanet_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RetinaNet.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/retinanet_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RetinaNet.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | SOLVER: 7 | STEPS: (210000, 250000) 8 | MAX_ITER: 270000 9 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/rpn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "ProposalNetwork" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | MASK_ON: False 6 | RESNETS: 7 | DEPTH: 50 8 | RPN: 9 | PRE_NMS_TOPK_TEST: 12000 10 | POST_NMS_TOPK_TEST: 2000 11 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-Detection/rpn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "ProposalNetwork" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | MASK_ON: False 6 | RESNETS: 7 | DEPTH: 50 8 | RPN: 9 | POST_NMS_TOPK_TEST: 2000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" 8 | MASK_ON: False 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | RPN: 16 | HEAD_NAME: StandardRPNHead 17 | IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "CLIPRes5ROIHeads" 20 | IN_FEATURES: ["res4"] 21 | NUM_CLASSES: 48 # base 
categories 22 | SCORE_THRESH_TEST: 0.001 23 | ROI_BOX_HEAD: 24 | NAME: "" 25 | NUM_FC: 0 26 | POOLER_RESOLUTION: 14 27 | CLS_AGNOSTIC_BBOX_REG: True 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 0 31 | POOLER_RESOLUTION: 14 32 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 33 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 34 | CLIP: 35 | CROP_REGION_TYPE: "RPN" 36 | USE_TEXT_EMB_CLASSIFIER: True 37 | CLSS_TEMP: 0.01 38 | NO_BOX_DELTA: False 39 | BG_CLS_LOSS_WEIGHT: 0.2 40 | FOCAL_SCALED_LOSS: 0.5 41 | INPUT: 42 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 43 | DATASETS: 44 | TRAIN: ("coco_2017_ovd_b_train",) 45 | TEST: ("coco_2017_ovd_all_test",) 46 | TEST: 47 | EVAL_PERIOD: 25000 48 | SOLVER: 49 | IMS_PER_BATCH: 16 50 | BASE_LR: 0.002 51 | STEPS: (60000, 80000) 52 | MAX_ITER: 90000 53 | WARMUP_ITERS: 5000 54 | CHECKPOINT_PERIOD: 10000 55 | INPUT: 56 | MIN_SIZE_TRAIN_SAMPLING: choice 57 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 58 | MAX_SIZE_TRAIN: 1333 59 | MIN_SIZE_TEST: 800 60 | MAX_SIZE_TEST: 1333 61 | FORMAT: "RGB" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_coco80.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 80 5 | DATASETS: 6 | TRAIN: ("coco_2017_train",) 7 | TEST: ("coco_2017_val",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_testb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | DATASETS: 3 | TEST: ("coco_2017_ovd_b_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_testt.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | DATASETS: 3 | TEST: ("coco_2017_ovd_t_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 65 5 | NMS_THRESH_TEST: 0.5 6 | CLIP: 7 | NO_BOX_DELTA: True # no box refinement 8 | OFFLINE_RPN_NMS_THRESH: 0.7 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf_clipWeights.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_clip_resnet_backbone_from_pretrain" 5 | ROI_HEADS: 6 | NUM_CLASSES: 65 7 | NMS_THRESH_TEST: 0.5 8 | CLIP: 9 | NO_BOX_DELTA: True # no box refinement 10 | OFFLINE_RPN_NMS_THRESH: 0.9 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | 
NUM_CLASSES: 65 # base + novel categories 5 | OVD: 6 | WITH_PSEUDO_LABELS: True 7 | USE_ADAPTIVE_THRES: True 8 | PL_THRESHOLD: 0.8 # init pl threshold 9 | PL_NMS_THRES: 0.5 10 | RPN_FUSION_METHOD: "avg_norm_scores" 11 | USE_PERIODIC_UPDATE: True 12 | BOX_CONFIDENCE_THRES: 1.0 # only use pseudo boxes with confidence > BOX_CONFIDENCE_THRES. 1.0 means no pseudo boxes 13 | USE_CONFIDENCE_WEIGHT: True 14 | DATASETS: 15 | TRAIN: ("coco_2017_ovd_b_train_65cats",) 16 | TEST: ("coco_2017_ovd_all_test",) 17 | DATALOADER: 18 | FILTER_EMPTY_ANNOTATIONS: False # empty images may contain novel categories 19 | SOLVER: 20 | CHECKPOINT_PERIOD: 10000 21 | TEST: 22 | EVAL_PERIOD: 5000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_PLs_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml" 2 | SOLVER: 3 | STEPS: (210000, 250000) 4 | MAX_ITER: 270000 5 | CHECKPOINT_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/CLIP_fast_rcnn_R_50_C4_ovd_vldet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | CLIP: 4 | CROP_REGION_TYPE: "RPN" 5 | # OFFLINE_RPN_NMS_THRESH: 0.3 # will change offline_cfg.MODEL.RPN.NMS_THRESH, will affect the eval performance 6 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST 7 | PRETRAIN_SAMPLE_REGIONS: 32 # num_regions_per_img, topk in box selection 8 | # for inference 9 | NO_BOX_DELTA: False # check 10 | USE_TEXT_EMB_CLASSIFIER: True 11 | MULTIPLY_RPN_SCORE: False # check 12 | WEAK_LOSS: 13 | WEAK_LOSS_WEIGHT: 0.01 14 | BOX_SELECT_THRES: 0.97 # threshold in box selection 15 | NEG_CONCEPT_NUM: 10 16 | DATASETS: 17 | TRAIN: ("coco_2017_ovd_b_train", "coco_caption_nouns_train_4764tags",) # coco_2017_ovd_b_train with 48 cats 18 | TEST: ("coco_generalized_del_val",) 19 | INPUT: 20 | CUSTOM_AUG: ResizeShortestEdge 21 | MIN_SIZE_TRAIN_SAMPLING: range 22 | MIN_SIZE_TRAIN: (800, 800) 23 | DATALOADER: 24 | SAMPLER_TRAIN: "MultiDatasetSampler" 25 | DATASET_RATIO: [1, 4] 26 | USE_DIFF_BS_SIZE: True 27 | DATASET_BS: [2, 8] 28 | USE_RFS: [False, False] 29 | DATASET_MIN_SIZES: [[800, 800], [400, 400]] 30 | DATASET_MAX_SIZES: [1333, 667] 31 | FILTER_EMPTY_ANNOTATIONS: False 32 | MULTI_DATASET_GROUPING: True 33 | DATASET_ANN: ['box', 'caption'] 34 | NUM_WORKERS: 8 35 | TEST: 36 | EVAL_PERIOD: 10000 37 | FIND_UNUSED_PARAM: True 38 | WITH_IMAGE_LABELS: True 39 | OUTPUT_DIR: output/test -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_2x_PLs_per4k_clsBoxConf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4_ovd_PLs.yaml" 2 | MODEL: 3 | OVD: 4 | WITH_PSEUDO_LABELS: True 5 | USE_ADAPTIVE_THRES: True 6 | PL_THRESHOLD: 0.9 7 | MIN_AVG_PLS: 1.0 8 | MAX_AVG_PLS: 3.0 9 | PL_NMS_THRES: 0.5 10 | RPN_FUSION_METHOD: "avg_norm_scores" 11 | CATEGORY_INFO: "datasets/coco_ovd_continue_cat_ids.json" 12 | # periodic update 13 | USE_PERIODIC_UPDATE: True 14 | PERIODIC_STEPS: (40000, 80000, 120000, 160000) 15 | # box reg 16 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 17 | SOLVER: 18 | STEPS: 
(120000, 160000) 19 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 20 | CHECKPOINT_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_fCLIP_PLs_clsBoxConf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "MyCLIPFastRCNN" 4 | ROI_HEADS: 5 | NUM_CLASSES: 65 # base + novel categories 6 | OVD: 7 | WITH_PSEUDO_LABELS: True 8 | USE_ADAPTIVE_THRES: True 9 | PL_THRESHOLD: 0.9 10 | MIN_AVG_PLS: 1.0 11 | MAX_AVG_PLS: 3.0 12 | PL_NMS_THRES: 0.5 13 | RPN_FUSION_METHOD: "avg_norm_scores" 14 | CATEGORY_INFO: "datasets/coco_ovd_continue_cat_ids.json" 15 | # periodic update 16 | USE_PERIODIC_UPDATE: True 17 | PERIODIC_STEPS: (40000, 60000, 80000) 18 | # box reg 19 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 20 | DATASETS: 21 | TRAIN: ("coco_2017_ovd_b_train_65cats",) 22 | TEST: ("coco_2017_ovd_all_test",) 23 | SOLVER: 24 | CHECKPOINT_PERIOD: 10000 25 | TEST: 26 | EVAL_PERIOD: 5000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_fCLIP_offline_PLs.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "MyCLIPFastRCNN" 4 | ROI_HEADS: 5 | NUM_CLASSES: 65 # base + novel categories 6 | OVD: 7 | WITH_PSEUDO_LABELS: False # no online PLs 8 | # box reg 9 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 10 | DATASETS: 11 | TRAIN: ("",) 12 | TEST: ("coco_2017_ovd_all_test",) 13 | SOLVER: 14 | CHECKPOINT_PERIOD: 10000 15 | TEST: 16 | EVAL_PERIOD: 5000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/customized/ovd_coco_frozen_CLIP_RPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4_ovd.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "MyCLIPFastRCNN" 4 | # IGNORE_CLS_LOSS: True 5 | CLIP: 6 | FREEZE_BACKBONE: True 7 | SOLVER: 8 | IMS_PER_BATCH: 16 9 | BASE_LR: 0.002 10 | STEPS: (60000, 80000) 11 | MAX_ITER: 90000 12 | WARMUP_ITERS: 5000 13 | CHECKPOINT_PERIOD: 10000 14 | TEST: 15 | EVAL_PERIOD: 5000 16 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_CLIP_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "GeneralizedRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" #"build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | RPN: 16 | HEAD_NAME: StandardRPNHead 17 | IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "CLIPRes5ROIHeads" # "Res5ROIHeads" # "StandardROIHeads" 20 | IN_FEATURES: ["res4"] 21 | ROI_BOX_HEAD: 22 | NAME: "" 23 | NUM_FC: 0 24 | POOLER_RESOLUTION: 14 25 | ROI_MASK_HEAD: 26 | NAME: "MaskRCNNConvUpsampleHead" 27 | NUM_CONV: 0 28 | POOLER_RESOLUTION: 14 29 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 
116.280, 123.675] # 30 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] # 31 | INPUT: 32 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 33 | TEST: 34 | EVAL_PERIOD: 50000 35 | SOLVER: 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 38 | STEPS: (60000, 80000) 39 | MAX_ITER: 90000 40 | INPUT: 41 | MIN_SIZE_TRAIN_SAMPLING: choice 42 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 43 | MAX_SIZE_TRAIN: 1333 44 | MIN_SIZE_TEST: 800 45 | MAX_SIZE_TEST: 1333 46 | FORMAT: "RGB" # "BGR" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_CLIP_R_50_C4_1x_ovd_FSD.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "GeneralizedRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" #"build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | RPN: 16 | HEAD_NAME: StandardRPNHead 17 | IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "CLIPRes5ROIHeads" # "Res5ROIHeads" # "StandardROIHeads" 20 | IN_FEATURES: ["res4"] 21 | NUM_CLASSES: 48 22 | ROI_BOX_HEAD: 23 | NAME: "" 24 | NUM_FC: 0 25 | POOLER_RESOLUTION: 14 26 | ROI_MASK_HEAD: 27 | NAME: "MaskRCNNConvUpsampleHead" 28 | NUM_CONV: 0 29 | POOLER_RESOLUTION: 14 30 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 116.280, 123.675] # 31 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] # 32 | INPUT: 33 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 34 | DATASETS: 35 | TRAIN: ("coco_2017_ovd_b_train",) 36 | TEST: ("coco_2017_ovd_b_test",) 37 | TEST: 38 | EVAL_PERIOD: 50000 39 | SOLVER: 40 | IMS_PER_BATCH: 16 41 | BASE_LR: 0.02 42 | STEPS: (60000, 80000) 43 | MAX_ITER: 90000 44 | INPUT: 45 | MIN_SIZE_TRAIN_SAMPLING: choice 46 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 47 | MAX_SIZE_TRAIN: 1333 48 | MIN_SIZE_TEST: 800 49 | MAX_SIZE_TEST: 1333 50 | FORMAT: "RGB" # "BGR" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 
101 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.py: -------------------------------------------------------------------------------- 1 | from ..common.train import train 2 | from ..common.optim import SGD as optimizer 3 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 4 | from ..common.data.coco import dataloader 5 | from ..common.models.mask_rcnn_c4 import model 6 | 7 | model.backbone.freeze_at = 2 8 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 48 9 | DATASETS: 10 | TRAIN: ("coco_2017_ovd_b_train",) 11 | TEST: ("coco_2017_ovd_all_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_coco65.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 65 9 | DATASETS: 10 | TRAIN: ("coco_2017_ovd_all_train",) 11 | TEST: ("coco_2017_ovd_all_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-DilatedC5.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- 
/sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.py: -------------------------------------------------------------------------------- 1 | from ..common.optim import SGD as optimizer 2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 3 | from ..common.data.coco import dataloader 4 | from ..common.models.mask_rcnn_fpn import model 5 | from ..common.train import train 6 | 7 | model.backbone.bottom_up.freeze_at = 2 8 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x_giou.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | RPN: 8 | BBOX_REG_LOSS_TYPE: "giou" 9 | BBOX_REG_LOSS_WEIGHT: 2.0 10 | ROI_BOX_HEAD: 11 | BBOX_REG_LOSS_TYPE: "giou" 12 | BBOX_REG_LOSS_WEIGHT: 10.0 13 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x_ovd_FSD.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 48 9 | DATASETS: 10 | TRAIN: ("coco_2017_ovd_b_train",) 11 | TEST: ("coco_2017_ovd_b_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x_ovd_coco65.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 65 9 | DATASETS: 10 | TRAIN: ("coco_2017_ovd_all_train",) 11 | TEST: ("coco_2017_ovd_all_test",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | MASK_ON: True 4 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 5 | PIXEL_STD: [57.375, 57.120, 58.395] 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | SOLVER: 12 | STEPS: (210000, 250000) 13 | 
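# [Annotation added for this document; not part of the original file.]
# This is the standard "3x" schedule: MAX_ITER below is 270000 = 3 x 90000
# iterations, where 90000 iterations at 16 images per batch is the "1x"
# schedule (~12 COCO epochs; see common/coco_schedule.py further below).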
MAX_ITER: 270000 14 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_regnetx_4gf_dds_fpn_1x.py: -------------------------------------------------------------------------------- 1 | from ..common.optim import SGD as optimizer 2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 3 | from ..common.data.coco import dataloader 4 | from ..common.models.mask_rcnn_fpn import model 5 | from ..common.train import train 6 | 7 | from detectron2.config import LazyCall as L 8 | from detectron2.modeling.backbone import RegNet 9 | from detectron2.modeling.backbone.regnet import SimpleStem, ResBottleneckBlock 10 | 11 | 12 | # Replace default ResNet with RegNetX-4GF from the DDS paper. Config source: 13 | # https://github.com/facebookresearch/pycls/blob/2c152a6e5d913e898cca4f0a758f41e6b976714d/configs/dds_baselines/regnetx/RegNetX-4.0GF_dds_8gpu.yaml#L4-L9 # noqa 14 | model.backbone.bottom_up = L(RegNet)( 15 | stem_class=SimpleStem, 16 | stem_width=32, 17 | block_class=ResBottleneckBlock, 18 | depth=23, 19 | w_a=38.65, 20 | w_0=96, 21 | w_m=2.43, 22 | group_width=40, 23 | freeze_at=2, 24 | norm="FrozenBN", 25 | out_features=["s1", "s2", "s3", "s4"], 26 | ) 27 | model.pixel_std = [57.375, 57.120, 58.395] 28 | 29 | optimizer.weight_decay = 5e-5 30 | train.init_checkpoint = ( 31 | "https://dl.fbaipublicfiles.com/pycls/dds_baselines/160906383/RegNetX-4.0GF_dds_8gpu.pyth" 32 | ) 33 | # RegNets benefit from enabling cudnn benchmark mode 34 | train.cudnn_benchmark = True 35 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/COCO-InstanceSegmentation/mask_rcnn_regnety_4gf_dds_fpn_1x.py: -------------------------------------------------------------------------------- 1 | from ..common.optim import SGD as optimizer 2 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 3 | from ..common.data.coco import dataloader 4 | from ..common.models.mask_rcnn_fpn import model 5 | from ..common.train import train 6 | 7 | from detectron2.config import LazyCall as L 8 | from detectron2.modeling.backbone import RegNet 9 | from detectron2.modeling.backbone.regnet import SimpleStem, ResBottleneckBlock 10 | 11 | 12 | # Replace default ResNet with RegNetY-4GF from the DDS paper. 
Config source: 13 | # https://github.com/facebookresearch/pycls/blob/2c152a6e5d913e898cca4f0a758f41e6b976714d/configs/dds_baselines/regnety/RegNetY-4.0GF_dds_8gpu.yaml#L4-L10 # noqa 14 | model.backbone.bottom_up = L(RegNet)( 15 | stem_class=SimpleStem, 16 | stem_width=32, 17 | block_class=ResBottleneckBlock, 18 | depth=22, 19 | w_a=31.41, 20 | w_0=96, 21 | w_m=2.24, 22 | group_width=64, 23 | se_ratio=0.25, 24 | freeze_at=2, 25 | norm="FrozenBN", 26 | out_features=["s1", "s2", "s3", "s4"], 27 | ) 28 | model.pixel_std = [57.375, 57.120, 58.395] 29 | 30 | optimizer.weight_decay = 5e-5 31 | train.init_checkpoint = ( 32 | "https://dl.fbaipublicfiles.com/pycls/dds_baselines/160906838/RegNetY-4.0GF_dds_8gpu.pyth" 33 | ) 34 | # RegNets benefit from enabling cudnn benchmark mode 35 | train.cudnn_benchmark = True 36 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv0.5-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 1230 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v0.5_train",) 14 | TEST: ("lvis_v0.5_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | DATALOADER: 18 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 19 | REPEAT_THRESHOLD: 0.001 20 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv0.5-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 1230 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v0.5_train",) 14 | TEST: ("lvis_v0.5_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | DATALOADER: 18 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 19 | REPEAT_THRESHOLD: 0.001 20 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 4 | PIXEL_STD: [57.375, 57.120, 58.395] 5 | MASK_ON: True 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | ROI_HEADS: 12 | NUM_CLASSES: 1230 13 | SCORE_THRESH_TEST: 0.0001 14 | INPUT: 15 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 16 | DATASETS: 17 | TRAIN: ("lvis_v0.5_train",) 18 | TEST: ("lvis_v0.5_val",) 19 | TEST: 20 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 21 | DATALOADER: 22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 23 | REPEAT_THRESHOLD: 0.001 24 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4.yaml: -------------------------------------------------------------------------------- 1 | 
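# [Annotation added for this document; not part of the original file.]
# RegionCLIP-style open-vocabulary Fast R-CNN baseline on LVIS v1: a CLIP
# ResNet-50 backbone, CLIPRes5ROIHeads with CLIP text embeddings as the
# classifier (USE_TEXT_EMB_CLASSIFIER: True, temperature CLSS_TEMP: 0.01),
# and 866 base categories out of the full 1203 LVIS classes. The customized
# LVIS configs further below inherit from this file via _BASE_.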
_BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | RPN: 16 | HEAD_NAME: StandardRPNHead 17 | IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "CLIPRes5ROIHeads" 20 | IN_FEATURES: ["res4"] 21 | NUM_CLASSES: 866 # 1203 22 | SCORE_THRESH_TEST: 0.02 23 | ROI_BOX_HEAD: 24 | NAME: "" 25 | NUM_FC: 0 26 | POOLER_RESOLUTION: 14 27 | CLS_AGNOSTIC_BBOX_REG: True 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 0 31 | POOLER_RESOLUTION: 14 32 | CLS_AGNOSTIC_MASK: True 33 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 34 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 35 | CLIP: 36 | CROP_REGION_TYPE: "RPN" 37 | USE_TEXT_EMB_CLASSIFIER: True 38 | CLSS_TEMP: 0.01 39 | NO_BOX_DELTA: False 40 | BG_CLS_LOSS_WEIGHT: 0.8 41 | MULTIPLY_RPN_SCORE: True 42 | INPUT: 43 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 44 | DATASETS: 45 | TRAIN: ("lvis_v1_train",) 46 | TEST: ("lvis_v1_val",) 47 | TEST: 48 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 49 | EVAL_PERIOD: 25000 50 | SOLVER: 51 | IMS_PER_BATCH: 16 52 | BASE_LR: 0.002 53 | STEPS: (120000, 160000) 54 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 55 | WARMUP_ITERS: 5000 56 | DATALOADER: 57 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 58 | REPEAT_THRESHOLD: 0.001 59 | INPUT: 60 | MIN_SIZE_TRAIN_SAMPLING: choice 61 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 62 | MAX_SIZE_TRAIN: 1333 63 | MIN_SIZE_TEST: 800 64 | MAX_SIZE_TEST: 1333 65 | FORMAT: "RGB" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_custom_img.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | ROI_HEADS: 5 | NUM_CLASSES: 1203 6 | NMS_THRESH_TEST: 0.3 7 | CLIP: 8 | NO_BOX_DELTA: True 9 | OFFLINE_RPN_NMS_THRESH: 0.9 10 | VIS: True # Note: visualize the scores before multiplying RPN scores, if any 11 | DATASETS: 12 | TRAIN: ("lvis_v1_train_custom_img",) 13 | TEST: ("lvis_v1_val_custom_img",) -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | ROI_HEADS: 5 | NUM_CLASSES: 1203 6 | NMS_THRESH_TEST: 0.5 7 | CLIP: 8 | NO_BOX_DELTA: True 9 | OFFLINE_RPN_NMS_THRESH: 0.9 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf_clipWeights.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone_from_pretrain" 6 | ROI_HEADS: 7 | NUM_CLASSES: 1203 8 | NMS_THRESH_TEST: 0.5 9 | CLIP: 10 | NO_BOX_DELTA: True 11 | OFFLINE_RPN_NMS_THRESH: 0.9 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/customized/ovd_lvis_box_PLs_periodic_boxConf.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 1203 # base + novel categories 5 | OVD: 6 | WITH_PSEUDO_LABELS: True 7 | # 8 | USE_ADAPTIVE_THRES: True 9 | PL_NMS_THRES: 0.5 10 | PL_THRESHOLD: 0.925 11 | MIN_AVG_PLS: 2.0 12 | MAX_AVG_PLS: 3.0 13 | ADAPTIVE_THRES_DELTA: 0.005 14 | RPN_FUSION_METHOD: "avg_logits" 15 | # CATEGORY_INFO: "datasets/lvis_ovd_continue_cat_ids.json" 16 | # periodic update 17 | USE_PERIODIC_UPDATE: True 18 | PERIODIC_STEPS: (120000, 160000) 19 | # box reg 20 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 21 | DATASETS: 22 | TRAIN: ("lvis_v1_train_base_1203cats",) 23 | TEST: ("lvis_v1_val",) 24 | SOLVER: 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.002 27 | STEPS: (120000, 160000) 28 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 29 | WARMUP_ITERS: 5000 30 | CHECKPOINT_PERIOD: 20000 31 | TEST: 32 | EVAL_PERIOD: 20000 33 | OUTPUT_DIR: output/ovd_lvis_ft_PLs_per4kUpdate_boxConf 34 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/customized/ovd_lvis_fCLIP_PLs_clsBoxConf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "MyCLIPFastRCNN" 4 | ROI_HEADS: 5 | NUM_CLASSES: 1203 # base + novel categories 6 | OVD: 7 | WITH_PSEUDO_LABELS: True 8 | # 9 | USE_ADAPTIVE_THRES: True 10 | PL_NMS_THRES: 0.5 11 | PL_THRESHOLD: 0.925 12 | MIN_AVG_PLS: 1.0 13 | MAX_AVG_PLS: 3.0 14 | ADAPTIVE_THRES_DELTA: 0.005 15 | RPN_FUSION_METHOD: "avg_logits" 16 | CATEGORY_INFO: "datasets/lvis_ovd_continue_cat_ids.json" 17 | # periodic update 18 | USE_PERIODIC_UPDATE: True 19 | PERIODIC_STEPS: (40000, 80000, 120000, 160000) 20 | # box reg 21 | BOX_CONFIDENCE_THRES: 1.0 # no box reg for PL boxes 22 | DATASETS: 23 | TRAIN: ("lvis_v1_train_base_1203cats",) 24 | TEST: ("lvis_v1_val",) 25 | SOLVER: 26 | IMS_PER_BATCH: 16 27 | BASE_LR: 0.002 28 | STEPS: (120000, 160000) 29 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 30 | WARMUP_ITERS: 5000 31 | CHECKPOINT_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/customized/ovd_lvis_frozen_CLIP_RPN.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../CLIP_fast_rcnn_R_50_C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "MyCLIPFastRCNN" 4 | # IGNORE_CLS_LOSS: True 5 | CLIP: 6 | FREEZE_BACKBONE: True 7 | SOLVER: 8 | IMS_PER_BATCH: 16 9 | BASE_LR: 0.002 10 | STEPS: (60000, 80000) 11 | MAX_ITER: 90000 12 | WARMUP_ITERS: 5000 13 | CHECKPOINT_PERIOD: 10000 14 | TEST: 15 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 16 | EVAL_PERIOD: 20000 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_CLIP_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "GeneralizedRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" #"build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 
64 14 | RES2_OUT_CHANNELS: 256 15 | RPN: 16 | HEAD_NAME: StandardRPNHead 17 | IN_FEATURES: ["res4"] 18 | ROI_HEADS: 19 | NAME: "CLIPRes5ROIHeads" # "Res5ROIHeads" # "StandardROIHeads" 20 | IN_FEATURES: ["res4"] 21 | NUM_CLASSES: 1203 22 | SCORE_THRESH_TEST: 0.0001 23 | ROI_BOX_HEAD: 24 | NAME: "" 25 | NUM_FC: 0 26 | POOLER_RESOLUTION: 14 27 | ROI_MASK_HEAD: 28 | NAME: "MaskRCNNConvUpsampleHead" 29 | NUM_CONV: 0 30 | POOLER_RESOLUTION: 14 31 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 116.280, 123.675] # 32 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] # 33 | INPUT: 34 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 35 | DATASETS: 36 | TRAIN: ("lvis_v1_train",) 37 | TEST: ("lvis_v1_val",) 38 | TEST: 39 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 40 | EVAL_PERIOD: 25000 41 | SOLVER: 42 | IMS_PER_BATCH: 16 43 | BASE_LR: 0.02 44 | STEPS: (120000, 160000) # (140000,) # 45 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 46 | DATALOADER: 47 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 48 | REPEAT_THRESHOLD: 0.001 49 | INPUT: 50 | MIN_SIZE_TRAIN_SAMPLING: choice 51 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 52 | MAX_SIZE_TRAIN: 1333 53 | MIN_SIZE_TEST: 800 54 | MAX_SIZE_TEST: 1333 55 | FORMAT: "RGB" # "BGR" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_CLIP_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "GeneralizedRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_fpn_backbone" # "build_resnet_fpn_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" # "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | FPN: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | OUT_CHANNELS: 256 18 | FUSE_TYPE: sum 19 | RPN: 20 | HEAD_NAME: StandardRPNHead 21 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 22 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 23 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 24 | # Detectron1 uses 2000 proposals per-batch, 25 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 26 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 
27 | POST_NMS_TOPK_TRAIN: 1000 28 | POST_NMS_TOPK_TEST: 1000 29 | ROI_HEADS: 30 | NAME: "StandardROIHeads" 31 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 32 | NUM_CLASSES: 1203 33 | SCORE_THRESH_TEST: 0.0001 34 | ROI_BOX_HEAD: 35 | NAME: "FastRCNNConvFCHead" 36 | NUM_FC: 2 37 | POOLER_RESOLUTION: 7 38 | ROI_MASK_HEAD: 39 | NAME: "MaskRCNNConvUpsampleHead" 40 | NUM_CONV: 4 41 | POOLER_RESOLUTION: 14 42 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] # [103.530, 116.280, 123.675] # 43 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] # [1.0, 1.0, 1.0] # 44 | INPUT: 45 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 46 | DATASETS: 47 | TRAIN: ("lvis_v1_train",) 48 | TEST: ("lvis_v1_val",) 49 | TEST: 50 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 51 | EVAL_PERIOD: 50000 52 | SOLVER: 53 | IMS_PER_BATCH: 16 54 | BASE_LR: 0.02 55 | STEPS: (120000, 160000) # (140000,) # 56 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 57 | DATALOADER: 58 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 59 | REPEAT_THRESHOLD: 0.001 60 | INPUT: 61 | MIN_SIZE_TRAIN_SAMPLING: choice 62 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 63 | MAX_SIZE_TRAIN: 1333 64 | MIN_SIZE_TEST: 800 65 | MAX_SIZE_TEST: 1333 66 | FORMAT: "RGB" # "BGR" -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 1203 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v1_train",) 14 | TEST: ("lvis_v1_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | SOLVER: 18 | STEPS: (120000, 160000) 19 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 20 | DATALOADER: 21 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 22 | REPEAT_THRESHOLD: 0.001 23 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 1203 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v1_train",) 14 | TEST: ("lvis_v1_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | EVAL_PERIOD: 50000 18 | SOLVER: 19 | STEPS: (120000, 160000) 20 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 21 | DATALOADER: 22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 23 | REPEAT_THRESHOLD: 0.001 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 1203 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | 
TRAIN: ("lvis_v1_train",) 14 | TEST: ("lvis_v1_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | EVAL_PERIOD: 50000 18 | SOLVER: 19 | STEPS: (120000, 160000) 20 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 21 | DATALOADER: 22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 23 | REPEAT_THRESHOLD: 0.001 24 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_2x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 1203 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | TRAIN: ("lvis_v1_train",) 14 | TEST: ("lvis_v1_val",) 15 | TEST: 16 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 17 | EVAL_PERIOD: 50000 18 | SOLVER: 19 | STEPS: (240000, 320000) #(120000, 160000) 20 | MAX_ITER: 360000 # 180000 * 16 / 100000 ~ 28.8 epochs 21 | DATALOADER: 22 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 23 | REPEAT_THRESHOLD: 0.001 24 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/LVISv1-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 4 | PIXEL_STD: [57.375, 57.120, 58.395] 5 | MASK_ON: True 6 | RESNETS: 7 | STRIDE_IN_1X1: False # this is a C2 model 8 | NUM_GROUPS: 32 9 | WIDTH_PER_GROUP: 8 10 | DEPTH: 101 11 | ROI_HEADS: 12 | NUM_CLASSES: 1203 13 | SCORE_THRESH_TEST: 0.0001 14 | INPUT: 15 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 16 | DATASETS: 17 | TRAIN: ("lvis_v1_train",) 18 | TEST: ("lvis_v1_val",) 19 | SOLVER: 20 | STEPS: (120000, 160000) 21 | MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs 22 | TEST: 23 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 24 | DATALOADER: 25 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 26 | REPEAT_THRESHOLD: 0.001 27 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NAME: CascadeROIHeads 9 | ROI_BOX_HEAD: 10 | CLS_AGNOSTIC_BBOX_REG: True 11 | RPN: 12 | POST_NMS_TOPK_TRAIN: 2000 13 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NAME: CascadeROIHeads 9 | ROI_BOX_HEAD: 10 | CLS_AGNOSTIC_BBOX_REG: True 11 | RPN: 12 | POST_NMS_TOPK_TRAIN: 2000 13 | SOLVER: 14 | STEPS: (210000, 250000) 15 | MAX_ITER: 270000 16 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | MASK_ON: True 4 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k" 5 | RESNETS: 6 | STRIDE_IN_1X1: False # this is a C2 model 7 | NUM_GROUPS: 32 8 | WIDTH_PER_GROUP: 8 9 | DEPTH: 152 10 | DEFORM_ON_PER_STAGE: [False, True, True, True] 11 | ROI_HEADS: 12 | NAME: "CascadeROIHeads" 13 | ROI_BOX_HEAD: 14 | NAME: "FastRCNNConvFCHead" 15 | NUM_CONV: 4 16 | NUM_FC: 1 17 | NORM: "GN" 18 | CLS_AGNOSTIC_BBOX_REG: True 19 | ROI_MASK_HEAD: 20 | NUM_CONV: 8 21 | NORM: "GN" 22 | RPN: 23 | POST_NMS_TOPK_TRAIN: 2000 24 | SOLVER: 25 | IMS_PER_BATCH: 128 26 | STEPS: (35000, 45000) 27 | MAX_ITER: 50000 28 | BASE_LR: 0.16 29 | INPUT: 30 | MIN_SIZE_TRAIN: (640, 864) 31 | MIN_SIZE_TRAIN_SAMPLING: "range" 32 | MAX_SIZE_TRAIN: 1440 33 | CROP: 34 | ENABLED: True 35 | TEST: 36 | EVAL_PERIOD: 2500 37 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_BOX_HEAD: 8 | CLS_AGNOSTIC_BBOX_REG: True 9 | ROI_MASK_HEAD: 10 | CLS_AGNOSTIC_MASK: True 11 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5 8 | DEFORM_MODULATED: False 9 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5 8 | DEFORM_MODULATED: False 9 | SOLVER: 10 | STEPS: (210000, 250000) 11 | MAX_ITER: 270000 12 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-50-GN" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | NORM: "GN" 8 | STRIDE_IN_1X1: False 9 | FPN: 10 | NORM: "GN" 11 | ROI_BOX_HEAD: 12 | NAME: "FastRCNNConvFCHead" 13 | NUM_CONV: 4 14 | NUM_FC: 1 15 | NORM: "GN" 16 | ROI_MASK_HEAD: 17 | NORM: "GN" 18 | SOLVER: 19 | # 3x schedule 20 | STEPS: (210000, 250000) 21 | MAX_ITER: 270000 22 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | NORM: "SyncBN" 8 | STRIDE_IN_1X1: 
True 9 | FPN: 10 | NORM: "SyncBN" 11 | ROI_BOX_HEAD: 12 | NAME: "FastRCNNConvFCHead" 13 | NUM_CONV: 4 14 | NUM_FC: 1 15 | NORM: "SyncBN" 16 | ROI_MASK_HEAD: 17 | NORM: "SyncBN" 18 | SOLVER: 19 | # 3x schedule 20 | STEPS: (210000, 250000) 21 | MAX_ITER: 270000 22 | TEST: 23 | PRECISE_BN: 24 | ENABLED: True 25 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/mmdet_mask_rcnn_R_50_FPN_1x.py: -------------------------------------------------------------------------------- 1 | # An example config to train a mmdetection model using detectron2. 2 | 3 | from ..common.data.coco import dataloader 4 | from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier 5 | from ..common.optim import SGD as optimizer 6 | from ..common.train import train 7 | 8 | from detectron2.modeling.mmdet_wrapper import MMDetDetector 9 | from detectron2.config import LazyCall as L 10 | 11 | model = L(MMDetDetector)( 12 | detector=dict( 13 | type="MaskRCNN", 14 | pretrained="torchvision://resnet50", 15 | backbone=dict( 16 | type="ResNet", 17 | depth=50, 18 | num_stages=4, 19 | out_indices=(0, 1, 2, 3), 20 | frozen_stages=1, 21 | norm_cfg=dict(type="BN", requires_grad=True), 22 | norm_eval=True, 23 | style="pytorch", 24 | ), 25 | neck=dict(type="FPN", in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), 26 | rpn_head=dict( 27 | type="RPNHead", 28 | in_channels=256, 29 | feat_channels=256, 30 | anchor_generator=dict( 31 | type="AnchorGenerator", 32 | scales=[8], 33 | ratios=[0.5, 1.0, 2.0], 34 | strides=[4, 8, 16, 32, 64], 35 | ), 36 | bbox_coder=dict( 37 | type="DeltaXYWHBBoxCoder", 38 | target_means=[0.0, 0.0, 0.0, 0.0], 39 | target_stds=[1.0, 1.0, 1.0, 1.0], 40 | ), 41 | loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=1.0), 42 | loss_bbox=dict(type="L1Loss", loss_weight=1.0), 43 | ), 44 | roi_head=dict( 45 | type="StandardRoIHead", 46 | bbox_roi_extractor=dict( 47 | type="SingleRoIExtractor", 48 | roi_layer=dict(type="RoIAlign", output_size=7, sampling_ratio=0), 49 | out_channels=256, 50 | featmap_strides=[4, 8, 16, 32], 51 | ), 52 | bbox_head=dict( 53 | type="Shared2FCBBoxHead", 54 | in_channels=256, 55 | fc_out_channels=1024, 56 | roi_feat_size=7, 57 | num_classes=80, 58 | bbox_coder=dict( 59 | type="DeltaXYWHBBoxCoder", 60 | target_means=[0.0, 0.0, 0.0, 0.0], 61 | target_stds=[0.1, 0.1, 0.2, 0.2], 62 | ), 63 | reg_class_agnostic=False, 64 | loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=False, loss_weight=1.0), 65 | loss_bbox=dict(type="L1Loss", loss_weight=1.0), 66 | ), 67 | mask_roi_extractor=dict( 68 | type="SingleRoIExtractor", 69 | roi_layer=dict(type="RoIAlign", output_size=14, sampling_ratio=0), 70 | out_channels=256, 71 | featmap_strides=[4, 8, 16, 32], 72 | ), 73 | mask_head=dict( 74 | type="FCNMaskHead", 75 | num_convs=4, 76 | in_channels=256, 77 | conv_out_channels=256, 78 | num_classes=80, 79 | loss_mask=dict(type="CrossEntropyLoss", use_mask=True, loss_weight=1.0), 80 | ), 81 | ), 82 | # model training and testing settings 83 | train_cfg=dict( 84 | rpn=dict( 85 | assigner=dict( 86 | type="MaxIoUAssigner", 87 | pos_iou_thr=0.7, 88 | neg_iou_thr=0.3, 89 | min_pos_iou=0.3, 90 | match_low_quality=True, 91 | ignore_iof_thr=-1, 92 | ), 93 | sampler=dict( 94 | type="RandomSampler", 95 | num=256, 96 | pos_fraction=0.5, 97 | neg_pos_ub=-1, 98 | add_gt_as_proposals=False, 99 | ), 100 | allowed_border=-1, 101 | pos_weight=-1, 102 | debug=False, 103 | ), 104 | rpn_proposal=dict( 105 | nms_pre=2000, 106 | 
max_per_img=1000, 107 | nms=dict(type="nms", iou_threshold=0.7), 108 | min_bbox_size=0, 109 | ), 110 | rcnn=dict( 111 | assigner=dict( 112 | type="MaxIoUAssigner", 113 | pos_iou_thr=0.5, 114 | neg_iou_thr=0.5, 115 | min_pos_iou=0.5, 116 | match_low_quality=True, 117 | ignore_iof_thr=-1, 118 | ), 119 | sampler=dict( 120 | type="RandomSampler", 121 | num=512, 122 | pos_fraction=0.25, 123 | neg_pos_ub=-1, 124 | add_gt_as_proposals=True, 125 | ), 126 | mask_size=28, 127 | pos_weight=-1, 128 | debug=False, 129 | ), 130 | ), 131 | test_cfg=dict( 132 | rpn=dict( 133 | nms_pre=1000, 134 | max_per_img=1000, 135 | nms=dict(type="nms", iou_threshold=0.7), 136 | min_bbox_size=0, 137 | ), 138 | rcnn=dict( 139 | score_thr=0.05, 140 | nms=dict(type="nms", iou_threshold=0.5), 141 | max_per_img=100, 142 | mask_thr_binary=0.5, 143 | ), 144 | ), 145 | ), 146 | pixel_mean=[123.675, 116.280, 103.530], 147 | pixel_std=[58.395, 57.120, 57.375], 148 | ) 149 | 150 | dataloader.train.mapper.image_format = "RGB" # torchvision pretrained model 151 | train.init_checkpoint = None # pretrained model is loaded inside backbone 152 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml: -------------------------------------------------------------------------------- 1 | # A large PanopticFPN for demo purposes. 2 | # Use GN on backbone to support semantic seg. 3 | # Use Cascade + Deform Conv to improve localization. 4 | _BASE_: "../COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml" 5 | MODEL: 6 | WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-101-GN" 7 | RESNETS: 8 | DEPTH: 101 9 | NORM: "GN" 10 | DEFORM_ON_PER_STAGE: [False, True, True, True] 11 | STRIDE_IN_1X1: False 12 | FPN: 13 | NORM: "GN" 14 | ROI_HEADS: 15 | NAME: CascadeROIHeads 16 | ROI_BOX_HEAD: 17 | CLS_AGNOSTIC_BBOX_REG: True 18 | ROI_MASK_HEAD: 19 | NORM: "GN" 20 | RPN: 21 | POST_NMS_TOPK_TRAIN: 2000 22 | SOLVER: 23 | STEPS: (105000, 125000) 24 | MAX_ITER: 135000 25 | IMS_PER_BATCH: 32 26 | BASE_LR: 0.04 27 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml" 2 | MODEL: 3 | # Train from random initialization. 4 | WEIGHTS: "" 5 | # It makes sense to divide by STD when training from scratch 6 | # But it seems to make no difference on the results and C2's models didn't do this. 7 | # So we keep things consistent with C2. 8 | # PIXEL_STD: [57.375, 57.12, 58.395] 9 | MASK_ON: True 10 | BACKBONE: 11 | FREEZE_AT: 0 12 | # NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883 13 | # to learn what you need for training from scratch. 
14 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml" 2 | MODEL: 3 | PIXEL_STD: [57.375, 57.12, 58.395] 4 | WEIGHTS: "" 5 | MASK_ON: True 6 | RESNETS: 7 | STRIDE_IN_1X1: False 8 | BACKBONE: 9 | FREEZE_AT: 0 10 | SOLVER: 11 | # 9x schedule 12 | IMS_PER_BATCH: 64 # 4x the standard 13 | STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k 14 | MAX_ITER: 202500 # 90k * 9 / 4 15 | BASE_LR: 0.08 16 | TEST: 17 | EVAL_PERIOD: 2500 18 | # NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883 19 | # to learn what you need for training from scratch. 20 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "mask_rcnn_R_50_FPN_3x_syncbn.yaml" 2 | MODEL: 3 | PIXEL_STD: [57.375, 57.12, 58.395] 4 | WEIGHTS: "" 5 | MASK_ON: True 6 | RESNETS: 7 | STRIDE_IN_1X1: False 8 | BACKBONE: 9 | FREEZE_AT: 0 10 | SOLVER: 11 | # 9x schedule 12 | IMS_PER_BATCH: 64 # 4x the standard 13 | STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k 14 | MAX_ITER: 202500 # 90k * 9 / 4 15 | BASE_LR: 0.08 16 | TEST: 17 | EVAL_PERIOD: 2500 18 | # NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883 19 | # to learn what you need for training from scratch. 20 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/semantic_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | RESNETS: 6 | DEPTH: 50 7 | DATASETS: 8 | TRAIN: ("coco_2017_train_panoptic_stuffonly",) 9 | TEST: ("coco_2017_val_panoptic_stuffonly",) 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/Misc/torchvision_imagenet_R_50.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example config file to train a ImageNet classifier with detectron2. 3 | Model and dataloader both come from torchvision. 4 | This shows how to use detectron2 as a general engine for any new models and tasks. 
5 | To run, use the following command: 6 | 7 | python tools/lazyconfig_train_net.py --config-file configs/Misc/torchvision_imagenet_R_50.py \ 8 | --num-gpus 8 dataloader.train.dataset.root=/path/to/imagenet/ 9 | """ 10 | 11 | 12 | import torch 13 | from torch import nn 14 | from torch.nn import functional as F 15 | from omegaconf import OmegaConf 16 | import torchvision 17 | from torchvision.transforms import transforms as T 18 | from torchvision.models.resnet import ResNet, Bottleneck 19 | from fvcore.common.param_scheduler import MultiStepParamScheduler 20 | 21 | from detectron2.solver import WarmupParamScheduler 22 | from detectron2.solver.build import get_default_optimizer_params 23 | from detectron2.config import LazyCall as L 24 | from detectron2.model_zoo import get_config 25 | from detectron2.data.samplers import TrainingSampler, InferenceSampler 26 | from detectron2.evaluation import DatasetEvaluator 27 | from detectron2.utils import comm 28 | 29 | 30 | def build_data_loader(dataset, batch_size, num_workers, training=True): 31 | return torch.utils.data.DataLoader( 32 | dataset, 33 | sampler=(TrainingSampler if training else InferenceSampler)(len(dataset)), 34 | batch_size=batch_size, 35 | num_workers=num_workers, 36 | pin_memory=True, 37 | ) 38 | 39 | 40 | class ClassificationNet(nn.Module): 41 | def __init__(self, model: nn.Module): 42 | super().__init__() 43 | self.model = model 44 | 45 | @property 46 | def device(self): 47 | return list(self.model.parameters())[0].device 48 | 49 | def forward(self, inputs): 50 | image, label = inputs 51 | pred = self.model(image.to(self.device)) 52 | if self.training: 53 | label = label.to(self.device) 54 | return F.cross_entropy(pred, label) 55 | else: 56 | return pred 57 | 58 | 59 | class ClassificationAcc(DatasetEvaluator): 60 | def reset(self): 61 | self.corr = self.total = 0 62 | 63 | def process(self, inputs, outputs): 64 | image, label = inputs 65 | self.corr += (outputs.argmax(dim=1).cpu() == label.cpu()).sum().item() 66 | self.total += len(label) 67 | 68 | def evaluate(self): 69 | all_corr_total = comm.all_gather([self.corr, self.total]) 70 | corr = sum(x[0] for x in all_corr_total) 71 | total = sum(x[1] for x in all_corr_total) 72 | return {"accuracy": corr / total} 73 | 74 | 75 | dataloader = OmegaConf.create() 76 | dataloader.train = L(build_data_loader)( 77 | dataset=L(torchvision.datasets.ImageNet)( 78 | root="/path/to/imagenet", 79 | split="train", 80 | transform=L(T.Compose)( 81 | transforms=[ 82 | L(T.RandomResizedCrop)(size=224), 83 | L(T.RandomHorizontalFlip)(), 84 | T.ToTensor(), 85 | L(T.Normalize)(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), 86 | ] 87 | ), 88 | ), 89 | batch_size=256 // 8, 90 | num_workers=4, 91 | training=True, 92 | ) 93 | 94 | dataloader.test = L(build_data_loader)( 95 | dataset=L(torchvision.datasets.ImageNet)( 96 | root="${...train.dataset.root}", 97 | split="val", 98 | transform=L(T.Compose)( 99 | transforms=[ 100 | L(T.Resize)(size=256), 101 | L(T.CenterCrop)(size=224), 102 | T.ToTensor(), 103 | L(T.Normalize)(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), 104 | ] 105 | ), 106 | ), 107 | batch_size=256 // 8, 108 | num_workers=4, 109 | training=False, 110 | ) 111 | 112 | dataloader.evaluator = L(ClassificationAcc)() 113 | 114 | model = L(ClassificationNet)( 115 | model=(ResNet)(block=Bottleneck, layers=[3, 4, 6, 3], zero_init_residual=True) 116 | ) 117 | 118 | 119 | optimizer = L(torch.optim.SGD)( 120 | params=L(get_default_optimizer_params)(), 121 | lr=0.1, 122 | momentum=0.9, 123 
| weight_decay=1e-4, 124 | ) 125 | 126 | lr_multiplier = L(WarmupParamScheduler)( 127 | scheduler=L(MultiStepParamScheduler)( 128 | values=[1.0, 0.1, 0.01, 0.001], milestones=[30, 60, 90, 100] 129 | ), 130 | warmup_length=1 / 100, 131 | warmup_factor=0.1, 132 | ) 133 | 134 | 135 | train = get_config("common/train.py").train 136 | train.init_checkpoint = None 137 | train.max_iter = 100 * 1281167 // 256 138 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/README.md: -------------------------------------------------------------------------------- 1 | This directory provides definitions for a few common models, dataloaders, scheduler, 2 | and optimizers that are often used in training. 3 | The definition of these objects are provided in the form of lazy instantiation: 4 | their arguments can be edited by users before constructing the objects. 5 | 6 | They can be imported, or loaded by `model_zoo.get_config` API in users' own configs. 7 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/coco_schedule.py: -------------------------------------------------------------------------------- 1 | from fvcore.common.param_scheduler import MultiStepParamScheduler 2 | 3 | from detectron2.config import LazyCall as L 4 | from detectron2.solver import WarmupParamScheduler 5 | 6 | 7 | def default_X_scheduler(num_X): 8 | """ 9 | Returns the config for a default multi-step LR scheduler such as "1x", "3x", 10 | commonly referred to in papers, where every 1x has the total length of 1440k 11 | training images (~12 COCO epochs). LR is decayed twice at the end of training 12 | following the strategy defined in "Rethinking ImageNet Pretraining", Sec 4. 13 | 14 | Args: 15 | num_X: a positive real number 16 | 17 | Returns: 18 | DictConfig: configs that define the multiplier for LR during training 19 | """ 20 | # total number of iterations assuming 16 batch size, using 1440000/16=90000 21 | total_steps_16bs = num_X * 90000 22 | 23 | if num_X <= 2: 24 | scheduler = L(MultiStepParamScheduler)( 25 | values=[1.0, 0.1, 0.01], 26 | # note that scheduler is scale-invariant. 
This is equivalent to 27 | # milestones=[6, 8, 9] 28 | milestones=[60000, 80000, 90000], 29 | ) 30 | else: 31 | scheduler = L(MultiStepParamScheduler)( 32 | values=[1.0, 0.1, 0.01], 33 | milestones=[total_steps_16bs - 60000, total_steps_16bs - 20000, total_steps_16bs], 34 | ) 35 | return L(WarmupParamScheduler)( 36 | scheduler=scheduler, 37 | warmup_length=1000 / total_steps_16bs, 38 | warmup_method="linear", 39 | warmup_factor=0.001, 40 | ) 41 | 42 | 43 | lr_multiplier_1x = default_X_scheduler(1) 44 | lr_multiplier_2x = default_X_scheduler(2) 45 | lr_multiplier_3x = default_X_scheduler(3) 46 | lr_multiplier_6x = default_X_scheduler(6) 47 | lr_multiplier_9x = default_X_scheduler(9) 48 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/data/coco.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | 3 | import detectron2.data.transforms as T 4 | from detectron2.config import LazyCall as L 5 | from detectron2.data import ( 6 | DatasetMapper, 7 | build_detection_test_loader, 8 | build_detection_train_loader, 9 | get_detection_dataset_dicts, 10 | ) 11 | from detectron2.evaluation import COCOEvaluator 12 | 13 | dataloader = OmegaConf.create() 14 | 15 | dataloader.train = L(build_detection_train_loader)( 16 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"), 17 | mapper=L(DatasetMapper)( 18 | is_train=True, 19 | augmentations=[ 20 | L(T.ResizeShortestEdge)( 21 | short_edge_length=(640, 672, 704, 736, 768, 800), 22 | sample_style="choice", 23 | max_size=1333, 24 | ), 25 | L(T.RandomFlip)(horizontal=True), 26 | ], 27 | image_format="BGR", 28 | use_instance_mask=True, 29 | ), 30 | total_batch_size=16, 31 | num_workers=4, 32 | ) 33 | 34 | dataloader.test = L(build_detection_test_loader)( 35 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False), 36 | mapper=L(DatasetMapper)( 37 | is_train=False, 38 | augmentations=[ 39 | L(T.ResizeShortestEdge)(short_edge_length=800, max_size=1333), 40 | ], 41 | image_format="${...train.mapper.image_format}", 42 | ), 43 | num_workers=4, 44 | ) 45 | 46 | dataloader.evaluator = L(COCOEvaluator)( 47 | dataset_name="${..test.dataset.names}", 48 | ) 49 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/data/coco_keypoint.py: -------------------------------------------------------------------------------- 1 | from detectron2.data.detection_utils import create_keypoint_hflip_indices 2 | 3 | from .coco import dataloader 4 | 5 | dataloader.train.dataset.min_keypoints = 1 6 | dataloader.train.dataset.names = "keypoints_coco_2017_train" 7 | dataloader.test.dataset.names = "keypoints_coco_2017_val" 8 | 9 | dataloader.train.mapper.update( 10 | use_instance_mask=False, 11 | use_keypoint=True, 12 | keypoint_hflip_indices=create_keypoint_hflip_indices(dataloader.train.dataset.names), 13 | ) 14 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/data/coco_panoptic_separated.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.evaluation import ( 3 | COCOEvaluator, 4 | COCOPanopticEvaluator, 5 | DatasetEvaluators, 6 | SemSegEvaluator, 7 | ) 8 | 9 | from .coco import dataloader 10 | 11 | dataloader.train.dataset.names = "coco_2017_train_panoptic_separated" 12 | 
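# [Comment added for this document; not part of the original file.] The
# assignments around this point only override fields of the lazy dataloader
# imported from .coco above; nothing is instantiated until the config is
# built, which is the lazy-instantiation pattern described in common/README.md.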
dataloader.train.dataset.filter_empty = False 13 | dataloader.test.dataset.names = "coco_2017_val_panoptic_separated" 14 | 15 | 16 | dataloader.evaluator = [ 17 | L(COCOEvaluator)( 18 | dataset_name="${...test.dataset.names}", 19 | ), 20 | L(SemSegEvaluator)( 21 | dataset_name="${...test.dataset.names}", 22 | ), 23 | L(COCOPanopticEvaluator)( 24 | dataset_name="${...test.dataset.names}", 25 | ), 26 | ] 27 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/cascade_rcnn.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.box_regression import Box2BoxTransform 4 | from detectron2.modeling.matcher import Matcher 5 | from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads 6 | 7 | from .mask_rcnn_fpn import model 8 | 9 | # arguments that don't exist for Cascade R-CNN 10 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] 11 | 12 | model.roi_heads.update( 13 | _target_=CascadeROIHeads, 14 | box_heads=[ 15 | L(FastRCNNConvFCHead)( 16 | input_shape=ShapeSpec(channels=256, height=7, width=7), 17 | conv_dims=[], 18 | fc_dims=[1024, 1024], 19 | ) 20 | for k in range(3) 21 | ], 22 | box_predictors=[ 23 | L(FastRCNNOutputLayers)( 24 | input_shape=ShapeSpec(channels=1024), 25 | test_score_thresh=0.05, 26 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 27 | cls_agnostic_bbox_reg=True, 28 | num_classes="${...num_classes}", 29 | ) 30 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 31 | ], 32 | proposal_matchers=[ 33 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) 34 | for th in [0.5, 0.6, 0.7] 35 | ], 36 | ) 37 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/keypoint_rcnn_fpn.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.poolers import ROIPooler 4 | from detectron2.modeling.roi_heads import KRCNNConvDeconvUpsampleHead 5 | 6 | from .mask_rcnn_fpn import model 7 | 8 | [model.roi_heads.pop(x) for x in ["mask_in_features", "mask_pooler", "mask_head"]] 9 | 10 | model.roi_heads.update( 11 | num_classes=1, 12 | keypoint_in_features=["p2", "p3", "p4", "p5"], 13 | keypoint_pooler=L(ROIPooler)( 14 | output_size=14, 15 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), 16 | sampling_ratio=0, 17 | pooler_type="ROIAlignV2", 18 | ), 19 | keypoint_head=L(KRCNNConvDeconvUpsampleHead)( 20 | input_shape=ShapeSpec(channels=256, width=14, height=14), 21 | num_keypoints=17, 22 | conv_dims=[512] * 8, 23 | loss_normalizer="visible", 24 | ), 25 | ) 26 | 27 | # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2. 28 | # 1000 proposals per-image is found to hurt box AP. 29 | # Therefore we increase it to 1500 per-image. 
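# [Comment added for this document; not part of the original file.] The tuple
# below follows RPN's (train, test) convention for post_nms_topk: 1500
# proposals per image during training and 1000 during inference, compared
# with (1000, 1000) in the base FPN model.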
30 | model.proposal_generator.post_nms_topk = (1500, 1000) 31 | 32 | # Keypoint AP degrades (though box AP improves) when using plain L1 loss 33 | model.roi_heads.box_predictor.smooth_l1_beta = 0.5 34 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/mask_rcnn_c4.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.meta_arch import GeneralizedRCNN 4 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator 5 | from detectron2.modeling.backbone import BasicStem, BottleneckBlock, ResNet 6 | from detectron2.modeling.box_regression import Box2BoxTransform 7 | from detectron2.modeling.matcher import Matcher 8 | from detectron2.modeling.poolers import ROIPooler 9 | from detectron2.modeling.proposal_generator import RPN, StandardRPNHead 10 | from detectron2.modeling.roi_heads import ( 11 | FastRCNNOutputLayers, 12 | MaskRCNNConvUpsampleHead, 13 | Res5ROIHeads, 14 | ) 15 | 16 | model = L(GeneralizedRCNN)( 17 | backbone=L(ResNet)( 18 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), 19 | stages=L(ResNet.make_default_stages)( 20 | depth=50, 21 | stride_in_1x1=True, 22 | norm="FrozenBN", 23 | ), 24 | out_features=["res4"], 25 | ), 26 | proposal_generator=L(RPN)( 27 | in_features=["res4"], 28 | head=L(StandardRPNHead)(in_channels=1024, num_anchors=15), 29 | anchor_generator=L(DefaultAnchorGenerator)( 30 | sizes=[[32, 64, 128, 256, 512]], 31 | aspect_ratios=[0.5, 1.0, 2.0], 32 | strides=[16], 33 | offset=0.0, 34 | ), 35 | anchor_matcher=L(Matcher)( 36 | thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True 37 | ), 38 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]), 39 | batch_size_per_image=256, 40 | positive_fraction=0.5, 41 | pre_nms_topk=(12000, 6000), 42 | post_nms_topk=(2000, 1000), 43 | nms_thresh=0.7, 44 | ), 45 | roi_heads=L(Res5ROIHeads)( 46 | num_classes=80, 47 | batch_size_per_image=512, 48 | positive_fraction=0.25, 49 | proposal_matcher=L(Matcher)( 50 | thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False 51 | ), 52 | in_features=["res4"], 53 | pooler=L(ROIPooler)( 54 | output_size=14, 55 | scales=(1.0 / 16,), 56 | sampling_ratio=0, 57 | pooler_type="ROIAlignV2", 58 | ), 59 | res5=L(ResNet.make_stage)( 60 | block_class=BottleneckBlock, 61 | num_blocks=3, 62 | stride_per_block=[2, 1, 1], 63 | in_channels=1024, 64 | bottleneck_channels=512, 65 | out_channels=2048, 66 | norm="FrozenBN", 67 | stride_in_1x1=True, 68 | ), 69 | box_predictor=L(FastRCNNOutputLayers)( 70 | input_shape=L(ShapeSpec)(channels="${...res5.out_channels}", height=1, width=1), 71 | test_score_thresh=0.05, 72 | box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)), 73 | num_classes="${..num_classes}", 74 | ), 75 | mask_head=L(MaskRCNNConvUpsampleHead)( 76 | input_shape=L(ShapeSpec)( 77 | channels="${...res5.out_channels}", 78 | width="${...pooler.output_size}", 79 | height="${...pooler.output_size}", 80 | ), 81 | num_classes="${..num_classes}", 82 | conv_dims=[256], 83 | ), 84 | ), 85 | pixel_mean=[103.530, 116.280, 123.675], 86 | pixel_std=[1.0, 1.0, 1.0], 87 | input_format="BGR", 88 | ) 89 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/mask_rcnn_fpn.py: 
-------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.meta_arch import GeneralizedRCNN 4 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator 5 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 6 | from detectron2.modeling.backbone import BasicStem, FPN, ResNet 7 | from detectron2.modeling.box_regression import Box2BoxTransform 8 | from detectron2.modeling.matcher import Matcher 9 | from detectron2.modeling.poolers import ROIPooler 10 | from detectron2.modeling.proposal_generator import RPN, StandardRPNHead 11 | from detectron2.modeling.roi_heads import ( 12 | StandardROIHeads, 13 | FastRCNNOutputLayers, 14 | MaskRCNNConvUpsampleHead, 15 | FastRCNNConvFCHead, 16 | ) 17 | 18 | model = L(GeneralizedRCNN)( 19 | backbone=L(FPN)( 20 | bottom_up=L(ResNet)( 21 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), 22 | stages=L(ResNet.make_default_stages)( 23 | depth=50, 24 | stride_in_1x1=True, 25 | norm="FrozenBN", 26 | ), 27 | out_features=["res2", "res3", "res4", "res5"], 28 | ), 29 | in_features="${.bottom_up.out_features}", 30 | out_channels=256, 31 | top_block=L(LastLevelMaxPool)(), 32 | ), 33 | proposal_generator=L(RPN)( 34 | in_features=["p2", "p3", "p4", "p5", "p6"], 35 | head=L(StandardRPNHead)(in_channels=256, num_anchors=3), 36 | anchor_generator=L(DefaultAnchorGenerator)( 37 | sizes=[[32], [64], [128], [256], [512]], 38 | aspect_ratios=[0.5, 1.0, 2.0], 39 | strides=[4, 8, 16, 32, 64], 40 | offset=0.0, 41 | ), 42 | anchor_matcher=L(Matcher)( 43 | thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True 44 | ), 45 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]), 46 | batch_size_per_image=256, 47 | positive_fraction=0.5, 48 | pre_nms_topk=(2000, 1000), 49 | post_nms_topk=(1000, 1000), 50 | nms_thresh=0.7, 51 | ), 52 | roi_heads=L(StandardROIHeads)( 53 | num_classes=80, 54 | batch_size_per_image=512, 55 | positive_fraction=0.25, 56 | proposal_matcher=L(Matcher)( 57 | thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False 58 | ), 59 | box_in_features=["p2", "p3", "p4", "p5"], 60 | box_pooler=L(ROIPooler)( 61 | output_size=7, 62 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), 63 | sampling_ratio=0, 64 | pooler_type="ROIAlignV2", 65 | ), 66 | box_head=L(FastRCNNConvFCHead)( 67 | input_shape=ShapeSpec(channels=256, height=7, width=7), 68 | conv_dims=[], 69 | fc_dims=[1024, 1024], 70 | ), 71 | box_predictor=L(FastRCNNOutputLayers)( 72 | input_shape=ShapeSpec(channels=1024), 73 | test_score_thresh=0.05, 74 | box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)), 75 | num_classes="${..num_classes}", 76 | ), 77 | mask_in_features=["p2", "p3", "p4", "p5"], 78 | mask_pooler=L(ROIPooler)( 79 | output_size=14, 80 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), 81 | sampling_ratio=0, 82 | pooler_type="ROIAlignV2", 83 | ), 84 | mask_head=L(MaskRCNNConvUpsampleHead)( 85 | input_shape=ShapeSpec(channels=256, width=14, height=14), 86 | num_classes="${..num_classes}", 87 | conv_dims=[256, 256, 256, 256, 256], 88 | ), 89 | ), 90 | pixel_mean=[103.530, 116.280, 123.675], 91 | pixel_std=[1.0, 1.0, 1.0], 92 | input_format="BGR", 93 | ) 94 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/panoptic_fpn.py: -------------------------------------------------------------------------------- 
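# Derives PanopticFPN from the Mask R-CNN FPN lazy config: the meta-architecture is
# swapped to PanopticFPN and a SemSegFPNHead is attached to the p2-p5 FPN features.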
1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling import PanopticFPN 4 | from detectron2.modeling.meta_arch.semantic_seg import SemSegFPNHead 5 | 6 | from .mask_rcnn_fpn import model 7 | 8 | model._target_ = PanopticFPN 9 | model.sem_seg_head = L(SemSegFPNHead)( 10 | input_shape={ 11 | f: L(ShapeSpec)(stride=s, channels="${....backbone.out_channels}") 12 | for f, s in zip(["p2", "p3", "p4", "p5"], [4, 8, 16, 32]) 13 | }, 14 | ignore_value=255, 15 | num_classes=54, # COCO stuff + 1 16 | conv_dims=128, 17 | common_stride=4, 18 | loss_weight=0.5, 19 | norm="GN", 20 | ) 21 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/models/retinanet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from detectron2.config import LazyCall as L 4 | from detectron2.layers import ShapeSpec 5 | from detectron2.modeling.meta_arch import RetinaNet 6 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator 7 | from detectron2.modeling.backbone.fpn import LastLevelP6P7 8 | from detectron2.modeling.backbone import BasicStem, FPN, ResNet 9 | from detectron2.modeling.box_regression import Box2BoxTransform 10 | from detectron2.modeling.matcher import Matcher 11 | from detectron2.modeling.meta_arch.retinanet import RetinaNetHead 12 | 13 | model = L(RetinaNet)( 14 | backbone=L(FPN)( 15 | bottom_up=L(ResNet)( 16 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), 17 | stages=L(ResNet.make_default_stages)( 18 | depth=50, 19 | stride_in_1x1=True, 20 | norm="FrozenBN", 21 | ), 22 | out_features=["res3", "res4", "res5"], 23 | ), 24 | in_features=["res3", "res4", "res5"], 25 | out_channels=256, 26 | top_block=L(LastLevelP6P7)(in_channels=2048, out_channels="${..out_channels}"), 27 | ), 28 | head=L(RetinaNetHead)( 29 | input_shape=[ShapeSpec(channels=256)], 30 | num_classes="${..num_classes}", 31 | conv_dims=[256, 256, 256, 256], 32 | prior_prob=0.01, 33 | num_anchors=9, 34 | ), 35 | anchor_generator=L(DefaultAnchorGenerator)( 36 | sizes=[[x, x * 2 ** (1.0 / 3), x * 2 ** (2.0 / 3)] for x in [32, 64, 128, 256, 512]], 37 | aspect_ratios=[0.5, 1.0, 2.0], 38 | strides=[8, 16, 32, 64, 128], 39 | offset=0.0, 40 | ), 41 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]), 42 | anchor_matcher=L(Matcher)( 43 | thresholds=[0.4, 0.5], labels=[0, -1, 1], allow_low_quality_matches=True 44 | ), 45 | num_classes=80, 46 | head_in_features=["p3", "p4", "p5", "p6", "p7"], 47 | focal_loss_alpha=0.25, 48 | focal_loss_gamma=2.0, 49 | pixel_mean=[103.530, 116.280, 123.675], 50 | pixel_std=[1.0, 1.0, 1.0], 51 | input_format="BGR", 52 | ) 53 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/optim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from detectron2.config import LazyCall as L 4 | from detectron2.solver.build import get_default_optimizer_params 5 | 6 | SGD = L(torch.optim.SGD)( 7 | params=L(get_default_optimizer_params)( 8 | # params.model is meant to be set to the model object, before instantiating 9 | # the optimizer. 
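        # weight_decay_norm=0.0 disables weight decay for normalization-layer parameters.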
10 | weight_decay_norm=0.0 11 | ), 12 | lr=0.02, 13 | momentum=0.9, 14 | weight_decay=1e-4, 15 | ) 16 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/common/train.py: -------------------------------------------------------------------------------- 1 | # Common training-related configs that are designed for "tools/lazyconfig_train_net.py" 2 | # You can use your own instead, together with your own train_net.py 3 | train = dict( 4 | output_dir="./output", 5 | init_checkpoint="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 6 | bb_rpn_checkpoint="", 7 | max_iter=90000, 8 | amp=dict(enabled=False), # options for Automatic Mixed Precision 9 | ddp=dict( # options for DistributedDataParallel 10 | broadcast_buffers=False, 11 | find_unused_parameters=False, 12 | fp16_compression=False, 13 | ), 14 | checkpointer=dict(period=5000, max_to_keep=100), # options for PeriodicCheckpointer 15 | eval_period=5000, 16 | log_period=20, 17 | device="cuda" 18 | # ... 19 | ) 20 | -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "PretrainFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" 8 | MASK_ON: False 9 | RESNETS: 10 | DEPTH: 50 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | ROI_HEADS: 16 | NAME: "PretrainRes5ROIHeads" 17 | IN_FEATURES: ["res4"] 18 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 19 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 20 | CLIP: 21 | CLSS_TEMP: 0.01 22 | CROP_REGION_TYPE: "RPN" 23 | OFFLINE_RPN_NMS_THRESH: 0.5 24 | GATHER_GPUS: True 25 | CONCEPT_THRES: 0.1 26 | PRETRAIN_RPN_REGIONS: 300 27 | PRETRAIN_SAMPLE_REGIONS: 100 28 | PRETRAIN_IMG_TXT_LEVEL: True 29 | PRETRAIN_ONLY_EOT: True 30 | TEACHER_RESNETS_DEPTH: 50 31 | TEACHER_POOLER_RESOLUTION: 14 32 | DATASETS: 33 | TRAIN: ("imgtxtpairs",) 34 | FACTORY_TRAIN: ("CLIPImgTxtPairTSVDataset",) 35 | PATH_TRAIN: ("./datasets/coco/val2017",) # ("/tmp/datasets/CC3M",) 36 | TEST: () 37 | DATALOADER: 38 | ASPECT_RATIO_GROUPING: False 39 | NUM_WORKERS: 4 40 | TEST: 41 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 42 | EVAL_PERIOD: 2500000 43 | SOLVER: 44 | IMS_PER_BATCH: 96 # 32 gpus 45 | BASE_LR: 0.002 46 | WEIGHT_DECAY: 0.0001 47 | STEPS: (300000, 525000) 48 | MAX_ITER: 600000 49 | CLIP_GRADIENTS: 50 | ENABLED: True 51 | CLIP_TYPE: "norm" 52 | CLIP_VALUE: 5.0 53 | INPUT: 54 | MIN_SIZE_TRAIN_SAMPLING: choice 55 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 56 | MAX_SIZE_TRAIN: 1333 57 | MIN_SIZE_TEST: 800 58 | MAX_SIZE_TEST: 1333 59 | FORMAT: "RGB" 60 | AUG: # Data Augmentation from MSR-CLIP 61 | TRAIN: 62 | IMAGE_SIZE: [800,] 63 | MAX_SIZE: 1333 64 | TEST: 65 | IMAGE_SIZE: [800,] 66 | MAX_SIZE: 1333 67 | INTERPOLATION: 3 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./RegionCLIP_RN50.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "WeakPretrainFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone_from_pretrain" 6 | FREEZE_AT: 2 7 | CLIP: 8 | CROP_REGION_TYPE: "RPN" 9 | OFFLINE_RPN_NMS_THRESH: 0.3 # will affect the eval performance 10 | 
# GATHER_GPUS: True 11 | PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST 12 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection 13 | # for ZS inference 14 | NO_BOX_DELTA: True # no box refinement 15 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth" 16 | USE_TEXT_EMB_CLASSIFIER: True 17 | MULTIPLY_RPN_SCORE: True 18 | WEAK_LOSS: 19 | WEAK_LOSS_WEIGHT: 0.01 20 | IMAGE_LOSS_WEIGHT: 0.1 21 | BOX_SELECT_THRES: 0.97 # threshold in box selection 22 | # for ZS inference 23 | ROI_HEADS: 24 | NAME: "CLIPRes5ROIHeads" # pretrain roi head 25 | IN_FEATURES: ["res4"] 26 | NUM_CLASSES: 1 # box only roi head, used in pretraining to setup self.cls_score 27 | # for ZS inference 28 | NMS_THRESH_TEST: 0.5 29 | # for ZS inference 30 | ROI_BOX_HEAD: 31 | NAME: "" 32 | NUM_FC: 0 33 | # POOLER_RESOLUTION: 14 34 | CLS_AGNOSTIC_BBOX_REG: True 35 | DATASETS: 36 | # TRAIN: ("coco_zeroshot_train_del", "coco_caption_nouns_train_4764tags",) 37 | TRAIN: ("coco_caption_nouns_train_4764tags",) 38 | TEST: ("coco_generalized_del_val",) 39 | INPUT: 40 | CUSTOM_AUG: ResizeShortestEdge 41 | MIN_SIZE_TRAIN_SAMPLING: range 42 | MIN_SIZE_TRAIN: (400, 400) 43 | MAX_SIZE_TRAIN: 667 44 | DATALOADER: 45 | # SAMPLER_TRAIN: "MultiDatasetSampler" 46 | # DATASET_RATIO: [1, 4] 47 | # USE_DIFF_BS_SIZE: True # if use build_custom_augmentation 48 | # DATASET_BS: [2, 8] 49 | # USE_RFS: [False, False] 50 | # DATASET_MIN_SIZES: [[800, 800], [400, 400]] 51 | # DATASET_MAX_SIZES: [1333, 667] 52 | # DATASET_MIN_SIZES: [[800, 800], [400, 400]] 53 | # DATASET_MAX_SIZES: [1333, 667] 54 | FILTER_EMPTY_ANNOTATIONS: False 55 | DATASET_ANN: ['caption',] 56 | # MULTI_DATASET_GROUPING: True 57 | # DATASET_ANN: ['box', 'caption'] 58 | # NUM_WORKERS: 8 59 | TEST: 60 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300 61 | EVAL_PERIOD: 10000 62 | SOLVER: 63 | IMS_PER_BATCH: 96 # 32 gpus 64 | BASE_LR: 0.002 65 | WEIGHT_DECAY: 0.0001 66 | STEPS: (60000, 80000) 67 | MAX_ITER: 90000 68 | CHECKPOINT_PERIOD: 20000 69 | CLIP_GRADIENTS: 70 | ENABLED: True 71 | CLIP_TYPE: "norm" 72 | CLIP_VALUE: 5.0 73 | FIND_UNUSED_PARAM: True 74 | WITH_IMAGE_LABELS: True # load image tags 75 | OUTPUT_DIR: output/r50_onlinePL_pre -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL_box_weak.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./RegionCLIP_RN50.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "WeakPretrainFastRCNN" 4 | IGNORE_CLS_LOSS: True # disable weak loss 5 | BACKBONE: 6 | NAME: "build_clip_resnet_backbone_from_pretrain" 7 | FREEZE_AT: 2 8 | ROI_HEADS: 9 | NAME: "CLIPRes5ROIHeads" # pretrain roi head 10 | IN_FEATURES: ["res4"] 11 | NUM_CLASSES: 1 # box only roi head, used in pretraining to setup self.cls_score 12 | # for ZS inference 13 | NMS_THRESH_TEST: 0.5 14 | # for ZS inference 15 | ROI_BOX_HEAD: 16 | NAME: "" 17 | NUM_FC: 0 18 | # POOLER_RESOLUTION: 14 19 | CLS_AGNOSTIC_BBOX_REG: True 20 | CLIP: 21 | CROP_REGION_TYPE: "RPN" 22 | OFFLINE_RPN_NMS_THRESH: 0.7 # will affect the eval performance 23 | # GATHER_GPUS: True 24 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST 25 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection 26 | # for ZS inference 27 | NO_BOX_DELTA: False # pretrain roi head 28 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth" 29 | 
USE_TEXT_EMB_CLASSIFIER: True 30 | MULTIPLY_RPN_SCORE: True 31 | WEAK_LOSS: 32 | WEAK_LOSS_WEIGHT: 0.01 33 | IMAGE_LOSS_WEIGHT: 0.1 34 | BOX_SELECT_THRES: 0.97 # threshold in box selection 35 | DATASETS: 36 | TRAIN: ("lvis_v1_train_base_box_only", "coco_caption_nouns_train_4764tags",) 37 | TEST: ("coco_generalized_del_val",) 38 | INPUT: 39 | CUSTOM_AUG: ResizeShortestEdge 40 | MIN_SIZE_TRAIN_SAMPLING: range 41 | MIN_SIZE_TRAIN: (800, 800) 42 | DATALOADER: 43 | SAMPLER_TRAIN: "MultiDatasetSampler" 44 | DATASET_RATIO: [1, 8] 45 | USE_DIFF_BS_SIZE: True 46 | DATASET_BS: [2, 16] 47 | USE_RFS: [False, False] 48 | DATASET_MIN_SIZES: [[800, 800], [400, 400]] 49 | DATASET_MAX_SIZES: [1333, 667] 50 | FILTER_EMPTY_ANNOTATIONS: False 51 | MULTI_DATASET_GROUPING: True 52 | DATASET_ANN: ['box', 'caption'] 53 | NUM_WORKERS: 8 54 | TEST: 55 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300 56 | EVAL_PERIOD: 10000 57 | SOLVER: 58 | IMS_PER_BATCH: 96 # 32 gpus 59 | BASE_LR: 0.002 60 | WEIGHT_DECAY: 0.0001 61 | STEPS: (60000, 80000) 62 | MAX_ITER: 90000 63 | CHECKPOINT_PERIOD: 20000 64 | CLIP_GRADIENTS: 65 | ENABLED: True 66 | CLIP_TYPE: "norm" 67 | CLIP_VALUE: 5.0 68 | FIND_UNUSED_PARAM: True 69 | WITH_IMAGE_LABELS: True # load image tags 70 | OUTPUT_DIR: output/r50_pre_onlinePL_box_weak -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL_box_weak_cc3m.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./RegionCLIP_RN50.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "WeakPretrainFastRCNN" 4 | IGNORE_CLS_LOSS: True # disable weak loss 5 | BACKBONE: 6 | NAME: "build_clip_resnet_backbone_from_pretrain" 7 | FREEZE_AT: 2 8 | ROI_HEADS: 9 | NAME: "CLIPRes5ROIHeads" # pretrain roi head 10 | IN_FEATURES: ["res4"] 11 | NUM_CLASSES: 1 # box only roi head, used in pretraining to setup self.cls_score 12 | # for ZS inference 13 | NMS_THRESH_TEST: 0.5 14 | # for ZS inference 15 | ROI_BOX_HEAD: 16 | NAME: "" 17 | NUM_FC: 0 18 | # POOLER_RESOLUTION: 14 19 | CLS_AGNOSTIC_BBOX_REG: True 20 | CLIP: 21 | CROP_REGION_TYPE: "RPN" 22 | OFFLINE_RPN_NMS_THRESH: 0.7 # will affect the eval performance 23 | # GATHER_GPUS: True 24 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST 25 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection 26 | # for ZS inference 27 | NO_BOX_DELTA: False # pretrain roi head 28 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth" 29 | USE_TEXT_EMB_CLASSIFIER: True 30 | MULTIPLY_RPN_SCORE: True 31 | WEAK_LOSS: 32 | WEAK_LOSS_WEIGHT: 0.01 33 | IMAGE_LOSS_WEIGHT: 0.1 34 | BOX_SELECT_THRES: 0.97 # threshold in box selection 35 | DATASETS: 36 | TRAIN: ("lvis_v1_train_base_box_only", "cc3m_v1_nouns_train_4764tags",) 37 | TEST: ("coco_generalized_del_val",) 38 | INPUT: 39 | CUSTOM_AUG: ResizeShortestEdge 40 | MIN_SIZE_TRAIN_SAMPLING: range 41 | MIN_SIZE_TRAIN: (800, 800) 42 | DATALOADER: 43 | SAMPLER_TRAIN: "MultiDatasetSampler" 44 | DATASET_RATIO: [1, 8] 45 | USE_DIFF_BS_SIZE: True 46 | DATASET_BS: [2, 16] 47 | USE_RFS: [False, False] 48 | DATASET_MIN_SIZES: [[800, 800], [400, 400]] 49 | DATASET_MAX_SIZES: [1333, 667] 50 | FILTER_EMPTY_ANNOTATIONS: False 51 | MULTI_DATASET_GROUPING: True 52 | DATASET_ANN: ['box', 'caption'] 53 | NUM_WORKERS: 8 54 | TEST: 55 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300 56 | EVAL_PERIOD: 10000 57 | SOLVER: 58 | IMS_PER_BATCH: 96 # 32 gpus 59 | BASE_LR: 0.002 
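  # BASE_LR above is paired with IMS_PER_BATCH of 96 (32 GPUs); if the batch size
  # changes, scaling the learning rate linearly is a common heuristic to start from.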
60 | WEIGHT_DECAY: 0.0001 61 | STEPS: (240000, 320000) 62 | MAX_ITER: 360000 63 | CHECKPOINT_PERIOD: 40000 64 | CLIP_GRADIENTS: 65 | ENABLED: True 66 | CLIP_TYPE: "norm" 67 | CLIP_VALUE: 5.0 68 | FIND_UNUSED_PARAM: True 69 | WITH_IMAGE_LABELS: True # load image tags 70 | OUTPUT_DIR: output/r50_pre_onlinePL_box_emaWeak_cc3m -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50_onlinePL_box_weak_locNarr.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./RegionCLIP_RN50.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "WeakPretrainFastRCNN" 4 | IGNORE_CLS_LOSS: True # disable weak loss 5 | BACKBONE: 6 | NAME: "build_clip_resnet_backbone_from_pretrain" 7 | FREEZE_AT: 2 8 | ROI_HEADS: 9 | NAME: "CLIPRes5ROIHeads" # pretrain roi head 10 | IN_FEATURES: ["res4"] 11 | NUM_CLASSES: 1 # box only roi head, used in pretraining to setup self.cls_score 12 | # for ZS inference 13 | NMS_THRESH_TEST: 0.5 14 | # for ZS inference 15 | ROI_BOX_HEAD: 16 | NAME: "" 17 | NUM_FC: 0 18 | # POOLER_RESOLUTION: 14 19 | CLS_AGNOSTIC_BBOX_REG: True 20 | CLIP: 21 | CROP_REGION_TYPE: "RPN" 22 | OFFLINE_RPN_NMS_THRESH: 0.7 # will affect the eval performance 23 | # GATHER_GPUS: True 24 | # PRETRAIN_RPN_REGIONS: 300 # will change offline_cfg.MODEL.RPN.POST_NMS_TOPK_TEST 25 | PRETRAIN_SAMPLE_REGIONS: 64 # num_regions_per_img, topk in box selection 26 | # for ZS inference 27 | NO_BOX_DELTA: False # pretrain roi head 28 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth" 29 | USE_TEXT_EMB_CLASSIFIER: True 30 | MULTIPLY_RPN_SCORE: True 31 | WEAK_LOSS: 32 | WEAK_LOSS_WEIGHT: 0.01 33 | IMAGE_LOSS_WEIGHT: 0.1 34 | BOX_SELECT_THRES: 0.97 # threshold in box selection 35 | DATASETS: 36 | TRAIN: ("lvis_v1_train_base_box_only", "loc_narr_nouns_train_4764tags",) 37 | TEST: ("coco_generalized_del_val",) 38 | INPUT: 39 | CUSTOM_AUG: ResizeShortestEdge 40 | MIN_SIZE_TRAIN_SAMPLING: range 41 | MIN_SIZE_TRAIN: (800, 800) 42 | DATALOADER: 43 | SAMPLER_TRAIN: "MultiDatasetSampler" 44 | DATASET_RATIO: [1, 8] 45 | USE_DIFF_BS_SIZE: True 46 | DATASET_BS: [2, 16] 47 | USE_RFS: [False, False] 48 | DATASET_MIN_SIZES: [[800, 800], [400, 400]] 49 | DATASET_MAX_SIZES: [1333, 667] 50 | FILTER_EMPTY_ANNOTATIONS: False 51 | MULTI_DATASET_GROUPING: True 52 | DATASET_ANN: ['box', 'caption'] 53 | NUM_WORKERS: 8 54 | TEST: 55 | DETECTIONS_PER_IMAGE: 100 # LVIS allows up to 300 56 | EVAL_PERIOD: 10000 57 | SOLVER: 58 | IMS_PER_BATCH: 96 # 32 gpus 59 | BASE_LR: 0.002 60 | WEIGHT_DECAY: 0.0001 61 | STEPS: (240000, 320000) 62 | MAX_ITER: 360000 63 | CHECKPOINT_PERIOD: 40000 64 | CLIP_GRADIENTS: 65 | ENABLED: True 66 | CLIP_TYPE: "norm" 67 | CLIP_VALUE: 5.0 68 | FIND_UNUSED_PARAM: True 69 | WITH_IMAGE_LABELS: True # load image tags 70 | OUTPUT_DIR: output/r50_pre_onlinePL_box_emaWeak_cc3m -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50x4.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-C4.yaml" 2 | MODEL: 3 | META_ARCHITECTURE: "PretrainFastRCNN" 4 | BACKBONE: 5 | NAME: "build_clip_resnet_backbone" 6 | FREEZE_AT: 2 7 | WEIGHTS: "" 8 | MASK_ON: False 9 | RESNETS: 10 | DEPTH: 200 11 | OUT_FEATURES: ["res4"] 12 | NORM: FrozenBN 13 | STEM_OUT_CHANNELS: 64 14 | RES2_OUT_CHANNELS: 256 15 | ROI_HEADS: 16 | NAME: "PretrainRes5ROIHeads" 17 | IN_FEATURES: ["res4"] 18 | 
ROI_BOX_HEAD: 19 | POOLER_RESOLUTION: 18 20 | PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073] 21 | PIXEL_STD: [0.26862954, 0.26130258, 0.27577711] 22 | CLIP: 23 | CLSS_TEMP: 0.01 24 | CROP_REGION_TYPE: "RPN" 25 | OFFLINE_RPN_NMS_THRESH: 0.5 26 | GATHER_GPUS: True 27 | CONCEPT_THRES: 0.1 28 | PRETRAIN_RPN_REGIONS: 300 29 | PRETRAIN_SAMPLE_REGIONS: 100 30 | PRETRAIN_IMG_TXT_LEVEL: True 31 | PRETRAIN_ONLY_EOT: True 32 | TEACHER_RESNETS_DEPTH: 200 33 | TEACHER_POOLER_RESOLUTION: 18 34 | # INPUT: 35 | # MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 36 | DATASETS: 37 | TRAIN: ("imgtxtpairs",) 38 | FACTORY_TRAIN: ("CLIPImgTxtPairTSVDataset",) 39 | PATH_TRAIN: ("/home/v-yiwuzhong/projects/azureblobs/vlpdatasets/coco-caption/val2017",) # ("/tmp/datasets/CC3M",) 40 | TEST: () 41 | DATALOADER: 42 | ASPECT_RATIO_GROUPING: False 43 | NUM_WORKERS: 4 44 | TEST: 45 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 46 | EVAL_PERIOD: 2500000 47 | SOLVER: 48 | IMS_PER_BATCH: 96 # 32 gpus 49 | BASE_LR: 0.002 50 | WEIGHT_DECAY: 0.0001 51 | STEPS: (300000, 525000) 52 | MAX_ITER: 600000 53 | CLIP_GRADIENTS: 54 | ENABLED: True 55 | CLIP_TYPE: "norm" 56 | CLIP_VALUE: 5.0 57 | INPUT: 58 | MIN_SIZE_TRAIN_SAMPLING: choice 59 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 60 | MAX_SIZE_TRAIN: 1333 61 | MIN_SIZE_TEST: 800 62 | MAX_SIZE_TEST: 1333 63 | FORMAT: "RGB" 64 | AUG: # Data Augmentation from MSR-CLIP 65 | TRAIN: 66 | IMAGE_SIZE: [800,] 67 | MAX_SIZE: 1333 68 | TEST: 69 | IMAGE_SIZE: [800,] 70 | MAX_SIZE: 1333 71 | INTERPOLATION: 3 -------------------------------------------------------------------------------- /sas_det/configs/regionclip/pretrain/RegionCLIP_RN50x4_onlinePL_boxWeak.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./RegionCLIP_RN50_onlinePL_box_weak.yaml" 2 | MODEL: 3 | RESNETS: 4 | DEPTH: 200 5 | ROI_BOX_HEAD: 6 | POOLER_RESOLUTION: 18 7 | CLIP: 8 | TEACHER_RESNETS_DEPTH: 200 9 | TEACHER_POOLER_RESOLUTION: 18 10 | TEXT_EMB_DIM: 640 11 | # TEXT_EMB_PATH: None # for classifer, not used in pretraining if MODEL.IGNORE_CLS_LOSS True 12 | OPENSET_TEST_TEXT_EMB_PATH: "./pretrained_ckpt/concept_emb/coco_65_cls_emb_rn50x4.pth" # use emb from r50x4 13 | OUTPUT_DIR: output/r50x4_pre_onlinePL_boxWeak 14 | 15 | -------------------------------------------------------------------------------- /sas_det/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . 
import ovd_register as _ovd_register # ensure the builtin datasets are registered 3 | 4 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 5 | -------------------------------------------------------------------------------- /sas_det/data/coco_zeroshot_categories.py: -------------------------------------------------------------------------------- 1 | # COCO categories for zero-shot setting 2 | # 65 categories in total, 48 base categories for training, 17 unseen categories are only used in testing 3 | # from http://ankan.umiacs.io/files/mscoco_seen_classes.json, http://ankan.umiacs.io/files/mscoco_unseen_classes.json 4 | 5 | # 17 class names in order, obtained from load_coco_json() function 6 | COCO_UNSEEN_CLS = ['airplane', 'bus', 'cat', 'dog', 'cow', 'elephant', 'umbrella', \ 7 | 'tie', 'snowboard', 'skateboard', 'cup', 'knife', 'cake', 'couch', 'keyboard', \ 8 | 'sink', 'scissors'] 9 | 10 | # 48 class names in order, obtained from load_coco_json() function 11 | COCO_SEEN_CLS = ['person', 'bicycle', 'car', 'motorcycle', 'train', 'truck', \ 12 | 'boat', 'bench', 'bird', 'horse', 'sheep', 'bear', 'zebra', 'giraffe', \ 13 | 'backpack', 'handbag', 'suitcase', 'frisbee', 'skis', 'kite', 'surfboard', \ 14 | 'bottle', 'fork', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', \ 15 | 'broccoli', 'carrot', 'pizza', 'donut', 'chair', 'bed', 'toilet', 'tv', \ 16 | 'laptop', 'mouse', 'remote', 'microwave', 'oven', 'toaster', \ 17 | 'refrigerator', 'book', 'clock', 'vase', 'toothbrush'] 18 | 19 | # 65 class names in order, obtained from load_coco_json() function 20 | COCO_OVD_ALL_CLS = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', \ 21 | 'bus', 'train', 'truck', 'boat', 'bench', 'bird', 'cat', 'dog', 'horse', \ 22 | 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', \ 23 | 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'kite', 'skateboard', \ 24 | 'surfboard', 'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', \ 25 | 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza', 'donut', 'cake', \ 26 | 'chair', 'couch', 'bed', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', \ 27 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', \ 28 | 'scissors', 'toothbrush'] 29 | 30 | # 80 class names 31 | COCO_80_ALL_CLS = {1: 'person', 32 | 2: 'bicycle', 33 | 3: 'car', 34 | 4: 'motorcycle', 35 | 5: 'airplane', 36 | 6: 'bus', 37 | 7: 'train', 38 | 8: 'truck', 39 | 9: 'boat', 40 | 10: 'traffic light', 41 | 11: 'fire hydrant', 42 | 12: 'stop sign', 43 | 13: 'parking meter', 44 | 14: 'bench', 45 | 15: 'bird', 46 | 16: 'cat', 47 | 17: 'dog', 48 | 18: 'horse', 49 | 19: 'sheep', 50 | 20: 'cow', 51 | 21: 'elephant', 52 | 22: 'bear', 53 | 23: 'zebra', 54 | 24: 'giraffe', 55 | 25: 'backpack', 56 | 26: 'umbrella', 57 | 27: 'handbag', 58 | 28: 'tie', 59 | 29: 'suitcase', 60 | 30: 'frisbee', 61 | 31: 'skis', 62 | 32: 'snowboard', 63 | 33: 'sports ball', 64 | 34: 'kite', 65 | 35: 'baseball bat', 66 | 36: 'baseball glove', 67 | 37: 'skateboard', 68 | 38: 'surfboard', 69 | 39: 'tennis racket', 70 | 40: 'bottle', 71 | 41: 'wine glass', 72 | 42: 'cup', 73 | 43: 'fork', 74 | 44: 'knife', 75 | 45: 'spoon', 76 | 46: 'bowl', 77 | 47: 'banana', 78 | 48: 'apple', 79 | 49: 'sandwich', 80 | 50: 'orange', 81 | 51: 'broccoli', 82 | 52: 'carrot', 83 | 53: 'hot dog', 84 | 54: 'pizza', 85 | 55: 'donut', 86 | 56: 'cake', 87 | 57: 'chair', 88 | 58: 'couch', 89 | 59: 'potted plant', 90 | 60: 'bed', 91 | 61: 'dining table', 
92 | 62: 'toilet', 93 | 63: 'tv', 94 | 64: 'laptop', 95 | 65: 'mouse', 96 | 66: 'remote', 97 | 67: 'keyboard', 98 | 68: 'cell phone', 99 | 69: 'microwave', 100 | 70: 'oven', 101 | 71: 'toaster', 102 | 72: 'sink', 103 | 73: 'refrigerator', 104 | 74: 'book', 105 | 75: 'clock', 106 | 76: 'vase', 107 | 77: 'scissors', 108 | 78: 'teddy bear', 109 | 79: 'hair drier', 110 | 80: 'toothbrush'} 111 | 112 | if __name__ == "__main__": 113 | # from https://github.com/alirezazareian/ovr-cnn/blob/master/ipynb/001.ipynb 114 | # Create zero-shot setting data split in COCO 115 | import json 116 | import ipdb 117 | 118 | with open('./datasets/coco/annotations/instances_train2017.json', 'r') as fin: 119 | coco_train_anno_all = json.load(fin) 120 | 121 | with open('./datasets/coco/annotations/instances_train2017.json', 'r') as fin: 122 | coco_train_anno_seen = json.load(fin) 123 | 124 | with open('./datasets/coco/annotations/instances_train2017.json', 'r') as fin: 125 | coco_train_anno_unseen = json.load(fin) 126 | 127 | with open('./datasets/coco/annotations/instances_val2017.json', 'r') as fin: 128 | coco_val_anno_all = json.load(fin) 129 | 130 | with open('./datasets/coco/annotations/instances_val2017.json', 'r') as fin: 131 | coco_val_anno_seen = json.load(fin) 132 | 133 | with open('./datasets/coco/annotations/instances_val2017.json', 'r') as fin: 134 | coco_val_anno_unseen = json.load(fin) 135 | 136 | labels_seen = COCO_SEEN_CLS 137 | labels_unseen = COCO_UNSEEN_CLS 138 | labels_all = [item['name'] for item in coco_val_anno_all['categories']] # 80 class names 139 | # len(labels_seen), len(labels_unseen) 140 | # set(labels_seen) - set(labels_all) 141 | # set(labels_unseen) - set(labels_all) 142 | 143 | class_id_to_split = {} # {1: 'seen', 2: 'seen', 3: 'seen', 4: 'seen', 5: 'unseen',...} 144 | class_name_to_split = {} # {'person': 'seen', 'bicycle': 'seen', 'car': 'seen', 'motorcycle': 'seen', 'airplane': 'unseen',...} 145 | for item in coco_val_anno_all['categories']: 146 | if item['name'] in labels_seen: 147 | class_id_to_split[item['id']] = 'seen' 148 | class_name_to_split[item['name']] = 'seen' 149 | elif item['name'] in labels_unseen: 150 | class_id_to_split[item['id']] = 'unseen' 151 | class_name_to_split[item['name']] = 'unseen' 152 | 153 | # class_name_to_emb = {} 154 | # with open('../datasets/coco/zero-shot/glove.6B.300d.txt', 'r') as fin: 155 | # for row in fin: 156 | # row_tk = row.split() 157 | # if row_tk[0] in class_name_to_split: 158 | # class_name_to_emb[row_tk[0]] = [float(num) for num in row_tk[1:]] 159 | # len(class_name_to_emb), len(class_name_to_split) 160 | 161 | def filter_annotation(anno_dict, split_name_list): 162 | """ 163 | COCO annotations have fields: dict_keys(['info', 'licenses', 'images', 'annotations', 'categories']) 164 | This function (1) filters the category metadata (list) in 'categories'; 165 | (2) filter instance annotation in 'annotations'; (3) filter image metadata (list) in 'images 166 | """ 167 | filtered_categories = [] 168 | for item in anno_dict['categories']: 169 | if class_id_to_split.get(item['id']) in split_name_list: 170 | #item['embedding'] = class_name_to_emb[item['name']] 171 | item['split'] = class_id_to_split.get(item['id']) 172 | filtered_categories.append(item) 173 | anno_dict['categories'] = filtered_categories 174 | 175 | filtered_images = [] 176 | filtered_annotations = [] 177 | useful_image_ids = set() 178 | for item in anno_dict['annotations']: 179 | if class_id_to_split.get(item['category_id']) in split_name_list: 180 | 
filtered_annotations.append(item) 181 | useful_image_ids.add(item['image_id']) 182 | for item in anno_dict['images']: 183 | if item['id'] in useful_image_ids: 184 | filtered_images.append(item) 185 | anno_dict['annotations'] = filtered_annotations 186 | anno_dict['images'] = filtered_images 187 | 188 | filter_annotation(coco_train_anno_seen, ['seen']) 189 | filter_annotation(coco_train_anno_unseen, ['unseen']) 190 | filter_annotation(coco_train_anno_all, ['seen', 'unseen']) 191 | filter_annotation(coco_val_anno_seen, ['seen']) 192 | filter_annotation(coco_val_anno_unseen, ['unseen']) 193 | filter_annotation(coco_val_anno_all, ['seen', 'unseen']) 194 | 195 | with open('./datasets/coco/annotations/ovd_ins_train2017_b.json', 'w') as fout: 196 | json.dump(coco_train_anno_seen, fout) 197 | with open('./datasets/coco/annotations/ovd_ins_train2017_t.json', 'w') as fout: 198 | json.dump(coco_train_anno_unseen, fout) 199 | with open('./datasets/coco/annotations/ovd_ins_train2017_all.json', 'w') as fout: 200 | json.dump(coco_train_anno_all, fout) 201 | with open('./datasets/coco/annotations/ovd_ins_val2017_b.json', 'w') as fout: 202 | json.dump(coco_val_anno_seen, fout) 203 | with open('./datasets/coco/annotations/ovd_ins_val2017_t.json', 'w') as fout: 204 | json.dump(coco_val_anno_unseen, fout) 205 | with open('./datasets/coco/annotations/ovd_ins_val2017_all.json', 'w') as fout: 206 | json.dump(coco_val_anno_all, fout) -------------------------------------------------------------------------------- /sas_det/data/ovd_register.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | 5 | """ 6 | This file registers pre-defined datasets at hard-coded paths, and their metadata. 7 | 8 | We hard-code metadata for common datasets. This will enable: 9 | 1. Consistency check when loading the datasets 10 | 2. Use models on these standard datasets directly and run demos, 11 | without having to download the dataset annotations 12 | 13 | We hard-code some paths to the dataset that's assumed to 14 | exist in "./datasets/". 15 | 16 | Users SHOULD NOT use this file to create new dataset / metadata for new dataset. 17 | To add new dataset, refer to the tutorial "docs/DATASETS.md". 
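Importing this module (as done in sas_det/data/__init__.py) runs the register_all_* calls
at the bottom of this file, so the datasets defined here become available in the DatasetCatalog.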
18 | """ 19 | 20 | import os 21 | 22 | from detectron2.data import DatasetCatalog, MetadataCatalog 23 | 24 | from detectron2.data.datasets.builtin_meta import ADE20K_SEM_SEG_CATEGORIES, _get_builtin_metadata 25 | # from .cityscapes import load_cityscapes_instances, load_cityscapes_semantic 26 | # from .cityscapes_panoptic import register_all_cityscapes_panoptic 27 | from detectron2.data.datasets.coco import load_sem_seg, register_coco_instances 28 | # from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated 29 | # from detectron2.data.datasets.lvis import get_lvis_instances_meta, register_lvis_instances 30 | # from .pascal_voc import register_pascal_voc 31 | 32 | from .lvis import get_lvis_instances_meta, register_lvis_instances_w_PLs, register_lvis_instances 33 | 34 | # ==== Predefined datasets and splits for COCO ========== 35 | 36 | _PREDEFINED_SPLITS_COCO = {} 37 | # _PREDEFINED_SPLITS_COCO["coco"] = { 38 | # "coco_2014_train": ("coco/train2014", "coco/annotations/instances_train2014.json"), 39 | # "coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"), 40 | # "coco_2014_minival": ("coco/val2014", "coco/annotations/instances_minival2014.json"), 41 | # "coco_2014_minival_100": ("coco/val2014", "coco/annotations/instances_minival2014_100.json"), 42 | # "coco_2014_valminusminival": ( 43 | # "coco/val2014", 44 | # "coco/annotations/instances_valminusminival2014.json", 45 | # ), 46 | # "coco_2017_train": ("coco/train2017", "coco/annotations/instances_train2017.json"), 47 | # "coco_2017_val": ("coco/val2017", "coco/annotations/instances_val2017.json"), 48 | # "coco_2017_test": ("coco/test2017", "coco/annotations/image_info_test2017.json"), 49 | # "coco_2017_test-dev": ("coco/test2017", "coco/annotations/image_info_test-dev2017.json"), 50 | # "coco_2017_val_100": ("coco/val2017", "coco/annotations/instances_val2017_100.json"), 51 | # } 52 | _PREDEFINED_SPLITS_COCO["coco_ovd"] = { 53 | "coco_2017_ovd_all_train": ("coco/train2017", "coco/annotations/ovd_ins_train2017_all.json"), 54 | "coco_2017_ovd_b_train": ("coco/train2017", "coco/annotations/ovd_ins_train2017_b.json"), 55 | "coco_2017_ovd_b_train_65cats": ("coco/train2017", "coco/annotations/ovd_ins_train2017_b_65cats.json"), 56 | "coco_2017_ovd_b_train_65cats_all_images": ("coco/train2017", "coco/annotations/ovd_ins_train2017_b_65cats_all_images.json"), 57 | "coco_2017_ovd_t_train": ("coco/train2017", "coco/annotations/ovd_ins_train2017_t.json"), 58 | # 59 | "coco_2017_ovd_all_test": ("coco/val2017", "coco/annotations/ovd_ins_val2017_all.json"), 60 | "coco_2017_ovd_b_test": ("coco/val2017", "coco/annotations/ovd_ins_val2017_b.json"), 61 | "coco_2017_ovd_t_test": ("coco/val2017", "coco/annotations/ovd_ins_val2017_t.json"), 62 | # 63 | "coco_2017_ovd_retain_val": ("coco/val2017", "coco/annotations/ovd_ins_val2017_retain_15.json"), 64 | } 65 | 66 | 67 | def register_all_coco(root): 68 | for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO.items(): 69 | if dataset_name == 'coco_ovd': # for zero-shot split 70 | for key, (image_root, json_file) in splits_per_dataset.items(): 71 | # Assume pre-defined datasets live in `./datasets`. 
72 | register_coco_instances( 73 | key, 74 | {}, # empty metadata, it will be overwritten in load_coco_json() function 75 | os.path.join(root, json_file) if "://" not in json_file else json_file, 76 | os.path.join(root, image_root), 77 | ) 78 | else: # default splits 79 | for key, (image_root, json_file) in splits_per_dataset.items(): 80 | # Assume pre-defined datasets live in `./datasets`. 81 | register_coco_instances( 82 | key, 83 | _get_builtin_metadata(dataset_name), 84 | os.path.join(root, json_file) if "://" not in json_file else json_file, 85 | os.path.join(root, image_root), 86 | ) 87 | 88 | 89 | # ==== Predefined datasets and splits for LVIS ========== 90 | 91 | _PREDEFINED_SPLITS_LVIS = { 92 | # # openset setting 93 | # "lvis_v1": { 94 | # "lvis_v1_train": ("coco/", "lvis/lvis_v1_train.json"), 95 | # "lvis_v1_train_p0": ("coco/", "lvis/lvis_v1_train_p0.json"), 96 | # "lvis_v1_train_p1": ("coco/", "lvis/lvis_v1_train_p1.json"), 97 | # "lvis_v1_train_p2": ("coco/", "lvis/lvis_v1_train_p2.json"), 98 | # "lvis_v1_train_p3": ("coco/", "lvis/lvis_v1_train_p3.json"), 99 | # # 100 | # "lvis_v1_val": ("coco/", "lvis/lvis_v1_val.json"), 101 | # "lvis_v1_test_dev": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"), 102 | # "lvis_v1_test_challenge": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"), 103 | # }, 104 | # custom image setting 105 | "lvis_v1_custom_img": { 106 | "lvis_v1_train_custom_img": ("coco/", "lvis/lvis_v1_train.json"), 107 | "lvis_v1_val_custom_img": ("coco/", "lvis/lvis_v1_val.json"), 108 | "lvis_v1_test_dev_custom_img": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"), 109 | "lvis_v1_test_challenge_custom_img": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"), 110 | }, 111 | # regular fully supervised setting 112 | "lvis_v1_fullysup": { 113 | "lvis_v1_train_fullysup": ("coco/", "lvis/lvis_v1_train.json"), 114 | "lvis_v1_val_fullysup": ("coco/", "lvis/lvis_v1_val.json"), 115 | "lvis_v1_test_dev_fullysup": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"), 116 | "lvis_v1_test_challenge_fullysup": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"), 117 | # 118 | "lvis_v1_train_base_1203cats": ("coco/", "lvis/lvis_v1_train_baseOnly.json"), 119 | "lvis_v1_val_1@10": ("coco/", "lvis/lvis_v1_val_1@10.json"), 120 | }, 121 | # PLs for ensemble by zsy 122 | "lvis_v1_PLs": { 123 | "lvis_v1_train_base_PLs_r50x4": ("coco/", "lvis/regionclip_PLs/inst_train_defRegCLIPr50x4_PLs_93.json"), 124 | "lvis_v1_train_SASDet_r50x4_PLs": ("coco/", "lvis/regionclip_PLs/lvis_v1_train_SASDet_r50x4_PLs_t62.json"), 125 | "lvis_v1_o365_SASDet_r50x4_PLs": ("Objects365/train", "Objects365/regionclip_PLs/zsy_objv1_train_SASDet_r50x4_PLs_t83.json"), 126 | } 127 | } 128 | 129 | 130 | def register_all_lvis(root): 131 | for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items(): 132 | if dataset_name == "lvis_v1_PLs": 133 | for key, (image_root, json_file) in splits_per_dataset.items(): 134 | register_lvis_instances_w_PLs( 135 | key, 136 | get_lvis_instances_meta(dataset_name), # TODO: meta for PLs, category order is rearranged 137 | os.path.join(root, json_file) if "://" not in json_file else json_file, 138 | os.path.join(root, image_root), 139 | ) 140 | else: 141 | for key, (image_root, json_file) in splits_per_dataset.items(): 142 | if dataset_name == "lvis_v1": 143 | args = {'filter_open_cls': True, 'run_custom_img': False} 144 | elif dataset_name == 'lvis_v1_custom_img': 145 | args = {'filter_open_cls': False, 'run_custom_img': True} 146 | elif dataset_name == 
'lvis_v1_fullysup': 147 | args = {'filter_open_cls': False, 'run_custom_img': False} 148 | register_lvis_instances( 149 | key, 150 | get_lvis_instances_meta(dataset_name), 151 | os.path.join(root, json_file) if "://" not in json_file else json_file, 152 | os.path.join(root, image_root), 153 | args, 154 | ) 155 | 156 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 157 | register_all_coco(_root) 158 | register_all_lvis(_root) 159 | 160 | # # True for open source; 161 | # # Internally at fb, we register them elsewhere 162 | # if __name__.endswith(".builtin"): 163 | # # Assume pre-defined datasets live in `./datasets`. 164 | # _root = os.getenv("DETECTRON2_DATASETS", "datasets") 165 | # register_all_coco(_root) 166 | # register_all_lvis(_root) -------------------------------------------------------------------------------- /sas_det/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .cityscapes_evaluation import CityscapesInstanceEvaluator, CityscapesSemSegEvaluator 3 | from .coco_evaluation import COCOEvaluator 4 | from .rotated_coco_evaluation import RotatedCOCOEvaluator 5 | from .evaluator import DatasetEvaluator, DatasetEvaluators, inference_context, inference_on_dataset 6 | from .lvis_evaluation import LVISEvaluator 7 | from .panoptic_evaluation import COCOPanopticEvaluator 8 | from .pascal_voc_evaluation import PascalVOCDetectionEvaluator 9 | from .sem_seg_evaluation import SemSegEvaluator 10 | from .testing import print_csv_format, verify_results 11 | 12 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 13 | -------------------------------------------------------------------------------- /sas_det/evaluation/cityscapes_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import glob 3 | import logging 4 | import numpy as np 5 | import os 6 | import tempfile 7 | from collections import OrderedDict 8 | import torch 9 | from PIL import Image 10 | 11 | from detectron2.data import MetadataCatalog 12 | from detectron2.utils import comm 13 | from detectron2.utils.file_io import PathManager 14 | 15 | from .evaluator import DatasetEvaluator 16 | 17 | 18 | class CityscapesEvaluator(DatasetEvaluator): 19 | """ 20 | Base class for evaluation using cityscapes API. 21 | """ 22 | 23 | def __init__(self, dataset_name): 24 | """ 25 | Args: 26 | dataset_name (str): the name of the dataset. 27 | It must have the following metadata associated with it: 28 | "thing_classes", "gt_dir". 29 | """ 30 | self._metadata = MetadataCatalog.get(dataset_name) 31 | self._cpu_device = torch.device("cpu") 32 | self._logger = logging.getLogger(__name__) 33 | 34 | def reset(self): 35 | self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_") 36 | self._temp_dir = self._working_dir.name 37 | # All workers will write to the same results directory 38 | # TODO this does not work in distributed training 39 | self._temp_dir = comm.all_gather(self._temp_dir)[0] 40 | if self._temp_dir != self._working_dir.name: 41 | self._working_dir.cleanup() 42 | self._logger.info( 43 | "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir) 44 | ) 45 | 46 | 47 | class CityscapesInstanceEvaluator(CityscapesEvaluator): 48 | """ 49 | Evaluate instance segmentation results on cityscapes dataset using cityscapes API. 
50 | 51 | Note: 52 | * It does not work in multi-machine distributed training. 53 | * It contains a synchronization, therefore has to be used on all ranks. 54 | * Only the main process runs evaluation. 55 | """ 56 | 57 | def process(self, inputs, outputs): 58 | from cityscapesscripts.helpers.labels import name2label 59 | 60 | for input, output in zip(inputs, outputs): 61 | file_name = input["file_name"] 62 | basename = os.path.splitext(os.path.basename(file_name))[0] 63 | pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt") 64 | 65 | if "instances" in output: 66 | output = output["instances"].to(self._cpu_device) 67 | num_instances = len(output) 68 | with open(pred_txt, "w") as fout: 69 | for i in range(num_instances): 70 | pred_class = output.pred_classes[i] 71 | classes = self._metadata.thing_classes[pred_class] 72 | class_id = name2label[classes].id 73 | score = output.scores[i] 74 | mask = output.pred_masks[i].numpy().astype("uint8") 75 | png_filename = os.path.join( 76 | self._temp_dir, basename + "_{}_{}.png".format(i, classes) 77 | ) 78 | 79 | Image.fromarray(mask * 255).save(png_filename) 80 | fout.write( 81 | "{} {} {}\n".format(os.path.basename(png_filename), class_id, score) 82 | ) 83 | else: 84 | # Cityscapes requires a prediction file for every ground truth image. 85 | with open(pred_txt, "w") as fout: 86 | pass 87 | 88 | def evaluate(self): 89 | """ 90 | Returns: 91 | dict: has a key "segm", whose value is a dict of "AP" and "AP50". 92 | """ 93 | comm.synchronize() 94 | if comm.get_rank() > 0: 95 | return 96 | import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval 97 | 98 | self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) 99 | 100 | # set some global states in cityscapes evaluation API, before evaluating 101 | cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) 102 | cityscapes_eval.args.predictionWalk = None 103 | cityscapes_eval.args.JSONOutput = False 104 | cityscapes_eval.args.colorized = False 105 | cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json") 106 | 107 | # These lines are adopted from 108 | # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa 109 | gt_dir = PathManager.get_local_path(self._metadata.gt_dir) 110 | groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png")) 111 | assert len( 112 | groundTruthImgList 113 | ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( 114 | cityscapes_eval.args.groundTruthSearch 115 | ) 116 | predictionImgList = [] 117 | for gt in groundTruthImgList: 118 | predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args)) 119 | results = cityscapes_eval.evaluateImgLists( 120 | predictionImgList, groundTruthImgList, cityscapes_eval.args 121 | )["averages"] 122 | 123 | ret = OrderedDict() 124 | ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100} 125 | self._working_dir.cleanup() 126 | return ret 127 | 128 | 129 | class CityscapesSemSegEvaluator(CityscapesEvaluator): 130 | """ 131 | Evaluate semantic segmentation results on cityscapes dataset using cityscapes API. 132 | 133 | Note: 134 | * It does not work in multi-machine distributed training. 135 | * It contains a synchronization, therefore has to be used on all ranks. 136 | * Only the main process runs evaluation. 
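    * It requires the cityscapesscripts package, and CITYSCAPES_DATASET must be set
      before :meth:`evaluate` imports the evaluation script (see the comment there).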
137 | """ 138 | 139 | def process(self, inputs, outputs): 140 | from cityscapesscripts.helpers.labels import trainId2label 141 | 142 | for input, output in zip(inputs, outputs): 143 | file_name = input["file_name"] 144 | basename = os.path.splitext(os.path.basename(file_name))[0] 145 | pred_filename = os.path.join(self._temp_dir, basename + "_pred.png") 146 | 147 | output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy() 148 | pred = 255 * np.ones(output.shape, dtype=np.uint8) 149 | for train_id, label in trainId2label.items(): 150 | if label.ignoreInEval: 151 | continue 152 | pred[output == train_id] = label.id 153 | Image.fromarray(pred).save(pred_filename) 154 | 155 | def evaluate(self): 156 | comm.synchronize() 157 | if comm.get_rank() > 0: 158 | return 159 | # Load the Cityscapes eval script *after* setting the required env var, 160 | # since the script reads CITYSCAPES_DATASET into global variables at load time. 161 | import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval 162 | 163 | self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) 164 | 165 | # set some global states in cityscapes evaluation API, before evaluating 166 | cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) 167 | cityscapes_eval.args.predictionWalk = None 168 | cityscapes_eval.args.JSONOutput = False 169 | cityscapes_eval.args.colorized = False 170 | 171 | # These lines are adopted from 172 | # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa 173 | gt_dir = PathManager.get_local_path(self._metadata.gt_dir) 174 | groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png")) 175 | assert len( 176 | groundTruthImgList 177 | ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( 178 | cityscapes_eval.args.groundTruthSearch 179 | ) 180 | predictionImgList = [] 181 | for gt in groundTruthImgList: 182 | predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt)) 183 | results = cityscapes_eval.evaluateImgLists( 184 | predictionImgList, groundTruthImgList, cityscapes_eval.args 185 | ) 186 | ret = OrderedDict() 187 | ret["sem_seg"] = { 188 | "IoU": 100.0 * results["averageScoreClasses"], 189 | "iIoU": 100.0 * results["averageScoreInstClasses"], 190 | "IoU_sup": 100.0 * results["averageScoreCategories"], 191 | "iIoU_sup": 100.0 * results["averageScoreInstCategories"], 192 | } 193 | self._working_dir.cleanup() 194 | return ret 195 | -------------------------------------------------------------------------------- /sas_det/evaluation/evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import datetime 3 | import logging 4 | import time 5 | from collections import OrderedDict, abc 6 | from contextlib import ExitStack, contextmanager 7 | from typing import List, Union 8 | import torch 9 | from torch import nn 10 | 11 | from detectron2.utils.comm import get_world_size, is_main_process 12 | from detectron2.utils.logger import log_every_n_seconds 13 | 14 | 15 | class DatasetEvaluator: 16 | """ 17 | Base class for a dataset evaluator. 18 | 19 | The function :func:`inference_on_dataset` runs the model over 20 | all samples in the dataset, and have a DatasetEvaluator to process the inputs/outputs. 
21 | 22 | This class will accumulate information of the inputs/outputs (by :meth:`process`), 23 | and produce evaluation results in the end (by :meth:`evaluate`). 24 | """ 25 | 26 | def reset(self): 27 | """ 28 | Preparation for a new round of evaluation. 29 | Should be called before starting a round of evaluation. 30 | """ 31 | pass 32 | 33 | def process(self, inputs, outputs): 34 | """ 35 | Process the pair of inputs and outputs. 36 | If they contain batches, the pairs can be consumed one-by-one using `zip`: 37 | 38 | .. code-block:: python 39 | 40 | for input_, output in zip(inputs, outputs): 41 | # do evaluation on single input/output pair 42 | ... 43 | 44 | Args: 45 | inputs (list): the inputs that's used to call the model. 46 | outputs (list): the return value of `model(inputs)` 47 | """ 48 | pass 49 | 50 | def evaluate(self): 51 | """ 52 | Evaluate/summarize the performance, after processing all input/output pairs. 53 | 54 | Returns: 55 | dict: 56 | A new evaluator class can return a dict of arbitrary format 57 | as long as the user can process the results. 58 | In our train_net.py, we expect the following format: 59 | 60 | * key: the name of the task (e.g., bbox) 61 | * value: a dict of {metric name: score}, e.g.: {"AP50": 80} 62 | """ 63 | pass 64 | 65 | 66 | class DatasetEvaluators(DatasetEvaluator): 67 | """ 68 | Wrapper class to combine multiple :class:`DatasetEvaluator` instances. 69 | 70 | This class dispatches every evaluation call to 71 | all of its :class:`DatasetEvaluator`. 72 | """ 73 | 74 | def __init__(self, evaluators): 75 | """ 76 | Args: 77 | evaluators (list): the evaluators to combine. 78 | """ 79 | super().__init__() 80 | self._evaluators = evaluators 81 | 82 | def reset(self): 83 | for evaluator in self._evaluators: 84 | evaluator.reset() 85 | 86 | def process(self, inputs, outputs): 87 | for evaluator in self._evaluators: 88 | evaluator.process(inputs, outputs) 89 | 90 | def evaluate(self): 91 | results = OrderedDict() 92 | for evaluator in self._evaluators: 93 | result = evaluator.evaluate() 94 | if is_main_process() and result is not None: 95 | for k, v in result.items(): 96 | assert ( 97 | k not in results 98 | ), "Different evaluators produce results with the same key {}".format(k) 99 | results[k] = v 100 | return results 101 | 102 | 103 | def inference_on_dataset( 104 | model, data_loader, evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None] 105 | ): 106 | """ 107 | Run model on the data_loader and evaluate the metrics with evaluator. 108 | Also benchmark the inference speed of `model.__call__` accurately. 109 | The model will be used in eval mode. 110 | 111 | Args: 112 | model (callable): a callable which takes an object from 113 | `data_loader` and returns some outputs. 114 | 115 | If it's an nn.Module, it will be temporarily set to `eval` mode. 116 | If you wish to evaluate a model in `training` mode instead, you can 117 | wrap the given model and override its behavior of `.eval()` and `.train()`. 118 | data_loader: an iterable object with a length. 119 | The elements it generates will be the inputs to the model. 120 | evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark, 121 | but don't want to do any evaluation. 
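            A minimal usage sketch (the dataset name, loader and model are placeholders):

            .. code-block:: python

                evaluator = COCOEvaluator("coco_2017_ovd_all_test")
                results = inference_on_dataset(model, val_loader, evaluator)
                # e.g. results["bbox"]["AP50"] gives box AP at IoU 0.5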
122 | 123 | Returns: 124 | The return value of `evaluator.evaluate()` 125 | """ 126 | num_devices = get_world_size() 127 | logger = logging.getLogger(__name__) 128 | logger.info("Start inference on {} batches".format(len(data_loader))) 129 | 130 | total = len(data_loader) # inference data loader must have a fixed length 131 | if evaluator is None: 132 | # create a no-op evaluator 133 | evaluator = DatasetEvaluators([]) 134 | if isinstance(evaluator, abc.MutableSequence): 135 | evaluator = DatasetEvaluators(evaluator) 136 | evaluator.reset() 137 | 138 | num_warmup = min(5, total - 1) 139 | start_time = time.perf_counter() 140 | total_data_time = 0 141 | total_compute_time = 0 142 | total_eval_time = 0 143 | with ExitStack() as stack: 144 | if isinstance(model, nn.Module): 145 | stack.enter_context(inference_context(model)) 146 | stack.enter_context(torch.no_grad()) 147 | 148 | start_data_time = time.perf_counter() 149 | for idx, inputs in enumerate(data_loader): 150 | total_data_time += time.perf_counter() - start_data_time 151 | if idx == num_warmup: 152 | start_time = time.perf_counter() 153 | total_data_time = 0 154 | total_compute_time = 0 155 | total_eval_time = 0 156 | 157 | start_compute_time = time.perf_counter() 158 | outputs = model(inputs) 159 | if torch.cuda.is_available(): 160 | torch.cuda.synchronize() 161 | total_compute_time += time.perf_counter() - start_compute_time 162 | 163 | start_eval_time = time.perf_counter() 164 | evaluator.process(inputs, outputs) 165 | total_eval_time += time.perf_counter() - start_eval_time 166 | 167 | iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) 168 | data_seconds_per_iter = total_data_time / iters_after_start 169 | compute_seconds_per_iter = total_compute_time / iters_after_start 170 | eval_seconds_per_iter = total_eval_time / iters_after_start 171 | total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start 172 | if idx >= num_warmup * 2 or compute_seconds_per_iter > 5: 173 | eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1))) 174 | log_every_n_seconds( 175 | logging.INFO, 176 | ( 177 | f"Inference done {idx + 1}/{total}. " 178 | f"Dataloading: {data_seconds_per_iter:.4f} s / iter. " 179 | f"Inference: {compute_seconds_per_iter:.4f} s / iter. " 180 | f"Eval: {eval_seconds_per_iter:.4f} s / iter. " 181 | f"Total: {total_seconds_per_iter:.4f} s / iter. " 182 | f"ETA={eta}" 183 | ), 184 | n=5, 185 | ) 186 | start_data_time = time.perf_counter() 187 | 188 | # Measure the time only for this worker (before the synchronization barrier) 189 | total_time = time.perf_counter() - start_time 190 | total_time_str = str(datetime.timedelta(seconds=total_time)) 191 | # NOTE this format is parsed by grep 192 | logger.info( 193 | "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format( 194 | total_time_str, total_time / (total - num_warmup), num_devices 195 | ) 196 | ) 197 | total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time))) 198 | logger.info( 199 | "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format( 200 | total_compute_time_str, total_compute_time / (total - num_warmup), num_devices 201 | ) 202 | ) 203 | 204 | results = evaluator.evaluate() 205 | # An evaluator may return None when not in main process. 
206 | # Replace it by an empty dict instead to make it easier for downstream code to handle 207 | if results is None: 208 | results = {} 209 | return results 210 | 211 | 212 | @contextmanager 213 | def inference_context(model): 214 | """ 215 | A context where the model is temporarily changed to eval mode, 216 | and restored to previous mode afterwards. 217 | 218 | Args: 219 | model: a torch Module 220 | """ 221 | training_mode = model.training 222 | model.eval() 223 | yield 224 | model.train(training_mode) 225 | -------------------------------------------------------------------------------- /sas_det/evaluation/fast_eval_api.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | import numpy as np 5 | import time 6 | from pycocotools.cocoeval import COCOeval 7 | 8 | from detectron2 import _C 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class COCOeval_opt(COCOeval): 14 | """ 15 | This is a slightly modified version of the original COCO API, where the functions evaluateImg() 16 | and accumulate() are implemented in C++ to speedup evaluation 17 | """ 18 | 19 | def evaluate(self): 20 | """ 21 | Run per image evaluation on given images and store results in self.evalImgs_cpp, a 22 | datastructure that isn't readable from Python but is used by a c++ implementation of 23 | accumulate(). Unlike the original COCO PythonAPI, we don't populate the datastructure 24 | self.evalImgs because this datastructure is a computational bottleneck. 25 | :return: None 26 | """ 27 | tic = time.time() 28 | 29 | p = self.params 30 | # add backward compatibility if useSegm is specified in params 31 | if p.useSegm is not None: 32 | p.iouType = "segm" if p.useSegm == 1 else "bbox" 33 | logger.info("Evaluate annotation type *{}*".format(p.iouType)) 34 | p.imgIds = list(np.unique(p.imgIds)) 35 | if p.useCats: 36 | p.catIds = list(np.unique(p.catIds)) 37 | p.maxDets = sorted(p.maxDets) 38 | self.params = p 39 | 40 | self._prepare() # bottleneck 41 | 42 | # loop through images, area range, max detection number 43 | catIds = p.catIds if p.useCats else [-1] 44 | 45 | if p.iouType == "segm" or p.iouType == "bbox": 46 | computeIoU = self.computeIoU 47 | elif p.iouType == "keypoints": 48 | computeIoU = self.computeOks 49 | self.ious = { 50 | (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds 51 | } # bottleneck 52 | 53 | maxDet = p.maxDets[-1] 54 | 55 | # <<<< Beginning of code differences with original COCO API 56 | def convert_instances_to_cpp(instances, is_det=False): 57 | # Convert annotations for a list of instances in an image to a format that's fast 58 | # to access in C++ 59 | instances_cpp = [] 60 | for instance in instances: 61 | instance_cpp = _C.InstanceAnnotation( 62 | int(instance["id"]), 63 | instance["score"] if is_det else instance.get("score", 0.0), 64 | instance["area"], 65 | bool(instance.get("iscrowd", 0)), 66 | bool(instance.get("ignore", 0)), 67 | ) 68 | instances_cpp.append(instance_cpp) 69 | return instances_cpp 70 | 71 | # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++ 72 | ground_truth_instances = [ 73 | [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds] 74 | for imgId in p.imgIds 75 | ] 76 | detected_instances = [ 77 | [convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) for catId in p.catIds] 78 | for imgId in p.imgIds 79 | ] 80 | ious = [[self.ious[imgId, 
catId] for catId in catIds] for imgId in p.imgIds] 81 | 82 | if not p.useCats: 83 | # For each image, flatten per-category lists into a single list 84 | ground_truth_instances = [[[o for c in i for o in c]] for i in ground_truth_instances] 85 | detected_instances = [[[o for c in i for o in c]] for i in detected_instances] 86 | 87 | # Call C++ implementation of self.evaluateImgs() 88 | self._evalImgs_cpp = _C.COCOevalEvaluateImages( 89 | p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances 90 | ) 91 | self._evalImgs = None 92 | 93 | self._paramsEval = copy.deepcopy(self.params) 94 | toc = time.time() 95 | logger.info("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic)) 96 | # >>>> End of code differences with original COCO API 97 | 98 | def accumulate(self): 99 | """ 100 | Accumulate per image evaluation results and store the result in self.eval. Does not 101 | support changing parameter settings from those used by self.evaluate() 102 | """ 103 | logger.info("Accumulating evaluation results...") 104 | tic = time.time() 105 | assert hasattr( 106 | self, "_evalImgs_cpp" 107 | ), "evaluate() must be called before accumulate() is called." 108 | 109 | self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp) 110 | 111 | # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections 112 | self.eval["recall"] = np.array(self.eval["recall"]).reshape( 113 | self.eval["counts"][:1] + self.eval["counts"][2:] 114 | ) 115 | 116 | # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X 117 | # num_area_ranges X num_max_detections 118 | self.eval["precision"] = np.array(self.eval["precision"]).reshape(self.eval["counts"]) 119 | self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"]) 120 | toc = time.time() 121 | logger.info("COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic)) 122 | -------------------------------------------------------------------------------- /sas_det/evaluation/panoptic_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import io 4 | import itertools 5 | import json 6 | import logging 7 | import numpy as np 8 | import os 9 | import tempfile 10 | from collections import OrderedDict 11 | from typing import Optional 12 | from PIL import Image 13 | from tabulate import tabulate 14 | 15 | from detectron2.data import MetadataCatalog 16 | from detectron2.utils import comm 17 | from detectron2.utils.file_io import PathManager 18 | 19 | from .evaluator import DatasetEvaluator 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class COCOPanopticEvaluator(DatasetEvaluator): 25 | """ 26 | Evaluate Panoptic Quality metrics on COCO using PanopticAPI. 27 | It saves panoptic segmentation predictions in `output_dir` 28 | 29 | It contains a synchronize call and has to be called from all workers. 30 | """ 31 | 32 | def __init__(self, dataset_name: str, output_dir: Optional[str] = None): 33 | """ 34 | Args: 35 | dataset_name: name of the dataset 36 | output_dir: output directory to save results for evaluation. 
37 | """ 38 | self._metadata = MetadataCatalog.get(dataset_name) 39 | self._thing_contiguous_id_to_dataset_id = { 40 | v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() 41 | } 42 | self._stuff_contiguous_id_to_dataset_id = { 43 | v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items() 44 | } 45 | 46 | self._output_dir = output_dir 47 | if self._output_dir is not None: 48 | PathManager.mkdirs(self._output_dir) 49 | 50 | def reset(self): 51 | self._predictions = [] 52 | 53 | def _convert_category_id(self, segment_info): 54 | isthing = segment_info.pop("isthing", None) 55 | if isthing is None: 56 | # the model produces panoptic category id directly. No more conversion needed 57 | return segment_info 58 | if isthing is True: 59 | segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[ 60 | segment_info["category_id"] 61 | ] 62 | else: 63 | segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[ 64 | segment_info["category_id"] 65 | ] 66 | return segment_info 67 | 68 | def process(self, inputs, outputs): 69 | from panopticapi.utils import id2rgb 70 | 71 | for input, output in zip(inputs, outputs): 72 | panoptic_img, segments_info = output["panoptic_seg"] 73 | panoptic_img = panoptic_img.cpu().numpy() 74 | if segments_info is None: 75 | # If "segments_info" is None, we assume "panoptic_img" is a 76 | # H*W int32 image storing the panoptic_id in the format of 77 | # category_id * label_divisor + instance_id. We reserve -1 for 78 | # VOID label, and add 1 to panoptic_img since the official 79 | # evaluation script uses 0 for VOID label. 80 | label_divisor = self._metadata.label_divisor 81 | segments_info = [] 82 | for panoptic_label in np.unique(panoptic_img): 83 | if panoptic_label == -1: 84 | # VOID region. 85 | continue 86 | pred_class = panoptic_label // label_divisor 87 | isthing = ( 88 | pred_class in self._metadata.thing_dataset_id_to_contiguous_id.values() 89 | ) 90 | segments_info.append( 91 | { 92 | "id": int(panoptic_label) + 1, 93 | "category_id": int(pred_class), 94 | "isthing": bool(isthing), 95 | } 96 | ) 97 | # Official evaluation script uses 0 for VOID label. 
98 | panoptic_img += 1 99 | 100 | file_name = os.path.basename(input["file_name"]) 101 | file_name_png = os.path.splitext(file_name)[0] + ".png" 102 | with io.BytesIO() as out: 103 | Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG") 104 | segments_info = [self._convert_category_id(x) for x in segments_info] 105 | self._predictions.append( 106 | { 107 | "image_id": input["image_id"], 108 | "file_name": file_name_png, 109 | "png_string": out.getvalue(), 110 | "segments_info": segments_info, 111 | } 112 | ) 113 | 114 | def evaluate(self): 115 | comm.synchronize() 116 | 117 | self._predictions = comm.gather(self._predictions) 118 | self._predictions = list(itertools.chain(*self._predictions)) 119 | if not comm.is_main_process(): 120 | return 121 | 122 | # PanopticApi requires local files 123 | gt_json = PathManager.get_local_path(self._metadata.panoptic_json) 124 | gt_folder = PathManager.get_local_path(self._metadata.panoptic_root) 125 | 126 | with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir: 127 | logger.info("Writing all panoptic predictions to {} ...".format(pred_dir)) 128 | for p in self._predictions: 129 | with open(os.path.join(pred_dir, p["file_name"]), "wb") as f: 130 | f.write(p.pop("png_string")) 131 | 132 | with open(gt_json, "r") as f: 133 | json_data = json.load(f) 134 | json_data["annotations"] = self._predictions 135 | 136 | output_dir = self._output_dir or pred_dir 137 | predictions_json = os.path.join(output_dir, "predictions.json") 138 | with PathManager.open(predictions_json, "w") as f: 139 | f.write(json.dumps(json_data)) 140 | 141 | from panopticapi.evaluation import pq_compute 142 | 143 | with contextlib.redirect_stdout(io.StringIO()): 144 | pq_res = pq_compute( 145 | gt_json, 146 | PathManager.get_local_path(predictions_json), 147 | gt_folder=gt_folder, 148 | pred_folder=pred_dir, 149 | ) 150 | 151 | res = {} 152 | res["PQ"] = 100 * pq_res["All"]["pq"] 153 | res["SQ"] = 100 * pq_res["All"]["sq"] 154 | res["RQ"] = 100 * pq_res["All"]["rq"] 155 | res["PQ_th"] = 100 * pq_res["Things"]["pq"] 156 | res["SQ_th"] = 100 * pq_res["Things"]["sq"] 157 | res["RQ_th"] = 100 * pq_res["Things"]["rq"] 158 | res["PQ_st"] = 100 * pq_res["Stuff"]["pq"] 159 | res["SQ_st"] = 100 * pq_res["Stuff"]["sq"] 160 | res["RQ_st"] = 100 * pq_res["Stuff"]["rq"] 161 | 162 | results = OrderedDict({"panoptic_seg": res}) 163 | _print_panoptic_results(pq_res) 164 | 165 | return results 166 | 167 | 168 | def _print_panoptic_results(pq_res): 169 | headers = ["", "PQ", "SQ", "RQ", "#categories"] 170 | data = [] 171 | for name in ["All", "Things", "Stuff"]: 172 | row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]] 173 | data.append(row) 174 | table = tabulate( 175 | data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center" 176 | ) 177 | logger.info("Panoptic Evaluation Results:\n" + table) 178 | 179 | 180 | if __name__ == "__main__": 181 | from detectron2.utils.logger import setup_logger 182 | 183 | logger = setup_logger() 184 | import argparse 185 | 186 | parser = argparse.ArgumentParser() 187 | parser.add_argument("--gt-json") 188 | parser.add_argument("--gt-dir") 189 | parser.add_argument("--pred-json") 190 | parser.add_argument("--pred-dir") 191 | args = parser.parse_args() 192 | 193 | from panopticapi.evaluation import pq_compute 194 | 195 | with contextlib.redirect_stdout(io.StringIO()): 196 | pq_res = pq_compute( 197 | args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir 198 
| ) 199 | _print_panoptic_results(pq_res) 200 | -------------------------------------------------------------------------------- /sas_det/evaluation/rotated_coco_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import itertools 3 | import json 4 | import numpy as np 5 | import os 6 | import torch 7 | from pycocotools.cocoeval import COCOeval, maskUtils 8 | 9 | from detectron2.structures import BoxMode, RotatedBoxes, pairwise_iou_rotated 10 | from detectron2.utils.file_io import PathManager 11 | 12 | from .coco_evaluation import COCOEvaluator 13 | 14 | 15 | class RotatedCOCOeval(COCOeval): 16 | @staticmethod 17 | def is_rotated(box_list): 18 | if type(box_list) == np.ndarray: 19 | return box_list.shape[1] == 5 20 | elif type(box_list) == list: 21 | if box_list == []: # cannot decide the box_dim 22 | return False 23 | return np.all( 24 | np.array( 25 | [ 26 | (len(obj) == 5) and ((type(obj) == list) or (type(obj) == np.ndarray)) 27 | for obj in box_list 28 | ] 29 | ) 30 | ) 31 | return False 32 | 33 | @staticmethod 34 | def boxlist_to_tensor(boxlist, output_box_dim): 35 | if type(boxlist) == np.ndarray: 36 | box_tensor = torch.from_numpy(boxlist) 37 | elif type(boxlist) == list: 38 | if boxlist == []: 39 | return torch.zeros((0, output_box_dim), dtype=torch.float32) 40 | else: 41 | box_tensor = torch.FloatTensor(boxlist) 42 | else: 43 | raise Exception("Unrecognized boxlist type") 44 | 45 | input_box_dim = box_tensor.shape[1] 46 | if input_box_dim != output_box_dim: 47 | if input_box_dim == 4 and output_box_dim == 5: 48 | box_tensor = BoxMode.convert(box_tensor, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS) 49 | else: 50 | raise Exception( 51 | "Unable to convert from {}-dim box to {}-dim box".format( 52 | input_box_dim, output_box_dim 53 | ) 54 | ) 55 | return box_tensor 56 | 57 | def compute_iou_dt_gt(self, dt, gt, is_crowd): 58 | if self.is_rotated(dt) or self.is_rotated(gt): 59 | # TODO: take is_crowd into consideration 60 | assert all(c == 0 for c in is_crowd) 61 | dt = RotatedBoxes(self.boxlist_to_tensor(dt, output_box_dim=5)) 62 | gt = RotatedBoxes(self.boxlist_to_tensor(gt, output_box_dim=5)) 63 | return pairwise_iou_rotated(dt, gt) 64 | else: 65 | # This is the same as the classical COCO evaluation 66 | return maskUtils.iou(dt, gt, is_crowd) 67 | 68 | def computeIoU(self, imgId, catId): 69 | p = self.params 70 | if p.useCats: 71 | gt = self._gts[imgId, catId] 72 | dt = self._dts[imgId, catId] 73 | else: 74 | gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] 75 | dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] 76 | if len(gt) == 0 and len(dt) == 0: 77 | return [] 78 | inds = np.argsort([-d["score"] for d in dt], kind="mergesort") 79 | dt = [dt[i] for i in inds] 80 | if len(dt) > p.maxDets[-1]: 81 | dt = dt[0 : p.maxDets[-1]] 82 | 83 | assert p.iouType == "bbox", "unsupported iouType for iou computation" 84 | 85 | g = [g["bbox"] for g in gt] 86 | d = [d["bbox"] for d in dt] 87 | 88 | # compute iou between each dt and gt region 89 | iscrowd = [int(o["iscrowd"]) for o in gt] 90 | 91 | # Note: this function is copied from cocoeval.py in cocoapi 92 | # and the major difference is here. 93 | ious = self.compute_iou_dt_gt(d, g, iscrowd) 94 | return ious 95 | 96 | 97 | class RotatedCOCOEvaluator(COCOEvaluator): 98 | """ 99 | Evaluate object proposal/instance detection outputs using COCO-like metrics and APIs, 100 | with rotated boxes support. 
101 | Note: this uses IOU only and does not consider angle differences. 102 | """ 103 | 104 | def process(self, inputs, outputs): 105 | """ 106 | Args: 107 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). 108 | It is a list of dict. Each dict corresponds to an image and 109 | contains keys like "height", "width", "file_name", "image_id". 110 | outputs: the outputs of a COCO model. It is a list of dicts with key 111 | "instances" that contains :class:`Instances`. 112 | """ 113 | for input, output in zip(inputs, outputs): 114 | prediction = {"image_id": input["image_id"]} 115 | 116 | if "instances" in output: 117 | instances = output["instances"].to(self._cpu_device) 118 | 119 | prediction["instances"] = self.instances_to_json(instances, input["image_id"]) 120 | if "proposals" in output: 121 | prediction["proposals"] = output["proposals"].to(self._cpu_device) 122 | self._predictions.append(prediction) 123 | 124 | def instances_to_json(self, instances, img_id): 125 | num_instance = len(instances) 126 | if num_instance == 0: 127 | return [] 128 | 129 | boxes = instances.pred_boxes.tensor.numpy() 130 | if boxes.shape[1] == 4: 131 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) 132 | boxes = boxes.tolist() 133 | scores = instances.scores.tolist() 134 | classes = instances.pred_classes.tolist() 135 | 136 | results = [] 137 | for k in range(num_instance): 138 | result = { 139 | "image_id": img_id, 140 | "category_id": classes[k], 141 | "bbox": boxes[k], 142 | "score": scores[k], 143 | } 144 | 145 | results.append(result) 146 | return results 147 | 148 | def _eval_predictions(self, predictions, img_ids=None): # img_ids: unused 149 | """ 150 | Evaluate predictions on the given tasks. 151 | Fill self._results with the metrics of the tasks. 152 | """ 153 | self._logger.info("Preparing results for COCO format ...") 154 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 155 | 156 | # unmap the category ids for COCO 157 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 158 | reverse_id_mapping = { 159 | v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() 160 | } 161 | for result in coco_results: 162 | result["category_id"] = reverse_id_mapping[result["category_id"]] 163 | 164 | if self._output_dir: 165 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 166 | self._logger.info("Saving results to {}".format(file_path)) 167 | with PathManager.open(file_path, "w") as f: 168 | f.write(json.dumps(coco_results)) 169 | f.flush() 170 | 171 | if not self._do_evaluation: 172 | self._logger.info("Annotations are not available for evaluation.") 173 | return 174 | 175 | self._logger.info("Evaluating predictions ...") 176 | 177 | assert self._tasks is None or set(self._tasks) == { 178 | "bbox" 179 | }, "[RotatedCOCOEvaluator] Only bbox evaluation is supported" 180 | coco_eval = ( 181 | self._evaluate_predictions_on_coco(self._coco_api, coco_results) 182 | if len(coco_results) > 0 183 | else None # cocoapi does not handle empty results very well 184 | ) 185 | 186 | task = "bbox" 187 | res = self._derive_coco_results( 188 | coco_eval, task, class_names=self._metadata.get("thing_classes") 189 | ) 190 | self._results[task] = res 191 | 192 | def _evaluate_predictions_on_coco(self, coco_gt, coco_results): 193 | """ 194 | Evaluate the coco results using COCOEval API. 
195 | """ 196 | assert len(coco_results) > 0 197 | 198 | coco_dt = coco_gt.loadRes(coco_results) 199 | 200 | # Only bbox is supported for now 201 | coco_eval = RotatedCOCOeval(coco_gt, coco_dt, iouType="bbox") 202 | 203 | coco_eval.evaluate() 204 | coco_eval.accumulate() 205 | coco_eval.summarize() 206 | 207 | return coco_eval 208 | -------------------------------------------------------------------------------- /sas_det/evaluation/sem_seg_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import itertools 3 | import json 4 | import logging 5 | import numpy as np 6 | import os 7 | from collections import OrderedDict 8 | import PIL.Image as Image 9 | import pycocotools.mask as mask_util 10 | import torch 11 | 12 | from detectron2.data import DatasetCatalog, MetadataCatalog 13 | from detectron2.utils.comm import all_gather, is_main_process, synchronize 14 | from detectron2.utils.file_io import PathManager 15 | 16 | from .evaluator import DatasetEvaluator 17 | 18 | 19 | class SemSegEvaluator(DatasetEvaluator): 20 | """ 21 | Evaluate semantic segmentation metrics. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | dataset_name, 27 | distributed=True, 28 | output_dir=None, 29 | *, 30 | num_classes=None, 31 | ignore_label=None, 32 | ): 33 | """ 34 | Args: 35 | dataset_name (str): name of the dataset to be evaluated. 36 | distributed (bool): if True, will collect results from all ranks for evaluation. 37 | Otherwise, will evaluate the results in the current process. 38 | output_dir (str): an output directory to dump results. 39 | num_classes, ignore_label: deprecated argument 40 | """ 41 | self._logger = logging.getLogger(__name__) 42 | if num_classes is not None: 43 | self._logger.warn( 44 | "SemSegEvaluator(num_classes) is deprecated! It should be obtained from metadata." 45 | ) 46 | if ignore_label is not None: 47 | self._logger.warn( 48 | "SemSegEvaluator(ignore_label) is deprecated! It should be obtained from metadata." 49 | ) 50 | self._dataset_name = dataset_name 51 | self._distributed = distributed 52 | self._output_dir = output_dir 53 | 54 | self._cpu_device = torch.device("cpu") 55 | 56 | self.input_file_to_gt_file = { 57 | dataset_record["file_name"]: dataset_record["sem_seg_file_name"] 58 | for dataset_record in DatasetCatalog.get(dataset_name) 59 | } 60 | 61 | meta = MetadataCatalog.get(dataset_name) 62 | # Dict that maps contiguous training ids to COCO category ids 63 | try: 64 | c2d = meta.stuff_dataset_id_to_contiguous_id 65 | self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()} 66 | except AttributeError: 67 | self._contiguous_id_to_dataset_id = None 68 | self._class_names = meta.stuff_classes 69 | self._num_classes = len(meta.stuff_classes) 70 | if num_classes is not None: 71 | assert self._num_classes == num_classes, f"{self._num_classes} != {num_classes}" 72 | self._ignore_label = ignore_label if ignore_label is not None else meta.ignore_label 73 | 74 | def reset(self): 75 | self._conf_matrix = np.zeros((self._num_classes + 1, self._num_classes + 1), dtype=np.int64) 76 | self._predictions = [] 77 | 78 | def process(self, inputs, outputs): 79 | """ 80 | Args: 81 | inputs: the inputs to a model. 82 | It is a list of dicts. Each dict corresponds to an image and 83 | contains keys like "height", "width", "file_name". 84 | outputs: the outputs of a model. 
It is either list of semantic segmentation predictions 85 | (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic 86 | segmentation prediction in the same format. 87 | """ 88 | for input, output in zip(inputs, outputs): 89 | output = output["sem_seg"].argmax(dim=0).to(self._cpu_device) 90 | pred = np.array(output, dtype=np.int) 91 | with PathManager.open(self.input_file_to_gt_file[input["file_name"]], "rb") as f: 92 | gt = np.array(Image.open(f), dtype=np.int) 93 | 94 | gt[gt == self._ignore_label] = self._num_classes 95 | 96 | self._conf_matrix += np.bincount( 97 | (self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1), 98 | minlength=self._conf_matrix.size, 99 | ).reshape(self._conf_matrix.shape) 100 | 101 | self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"])) 102 | 103 | def evaluate(self): 104 | """ 105 | Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval): 106 | 107 | * Mean intersection-over-union averaged across classes (mIoU) 108 | * Frequency Weighted IoU (fwIoU) 109 | * Mean pixel accuracy averaged across classes (mACC) 110 | * Pixel Accuracy (pACC) 111 | """ 112 | if self._distributed: 113 | synchronize() 114 | conf_matrix_list = all_gather(self._conf_matrix) 115 | self._predictions = all_gather(self._predictions) 116 | self._predictions = list(itertools.chain(*self._predictions)) 117 | if not is_main_process(): 118 | return 119 | 120 | self._conf_matrix = np.zeros_like(self._conf_matrix) 121 | for conf_matrix in conf_matrix_list: 122 | self._conf_matrix += conf_matrix 123 | 124 | if self._output_dir: 125 | PathManager.mkdirs(self._output_dir) 126 | file_path = os.path.join(self._output_dir, "sem_seg_predictions.json") 127 | with PathManager.open(file_path, "w") as f: 128 | f.write(json.dumps(self._predictions)) 129 | 130 | acc = np.full(self._num_classes, np.nan, dtype=np.float) 131 | iou = np.full(self._num_classes, np.nan, dtype=np.float) 132 | tp = self._conf_matrix.diagonal()[:-1].astype(np.float) 133 | pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(np.float) 134 | class_weights = pos_gt / np.sum(pos_gt) 135 | pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(np.float) 136 | acc_valid = pos_gt > 0 137 | acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid] 138 | iou_valid = (pos_gt + pos_pred) > 0 139 | union = pos_gt + pos_pred - tp 140 | iou[acc_valid] = tp[acc_valid] / union[acc_valid] 141 | macc = np.sum(acc[acc_valid]) / np.sum(acc_valid) 142 | miou = np.sum(iou[acc_valid]) / np.sum(iou_valid) 143 | fiou = np.sum(iou[acc_valid] * class_weights[acc_valid]) 144 | pacc = np.sum(tp) / np.sum(pos_gt) 145 | 146 | res = {} 147 | res["mIoU"] = 100 * miou 148 | res["fwIoU"] = 100 * fiou 149 | for i, name in enumerate(self._class_names): 150 | res["IoU-{}".format(name)] = 100 * iou[i] 151 | res["mACC"] = 100 * macc 152 | res["pACC"] = 100 * pacc 153 | for i, name in enumerate(self._class_names): 154 | res["ACC-{}".format(name)] = 100 * acc[i] 155 | 156 | if self._output_dir: 157 | file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth") 158 | with PathManager.open(file_path, "wb") as f: 159 | torch.save(res, f) 160 | results = OrderedDict({"sem_seg": res}) 161 | self._logger.info(results) 162 | return results 163 | 164 | def encode_json_sem_seg(self, sem_seg, input_file_name): 165 | """ 166 | Convert semantic segmentation to COCO stuff format with segments encoded as RLEs. 
167 | See http://cocodataset.org/#format-results 168 | """ 169 | json_list = [] 170 | for label in np.unique(sem_seg): 171 | if self._contiguous_id_to_dataset_id is not None: 172 | assert ( 173 | label in self._contiguous_id_to_dataset_id 174 | ), "Label {} is not in the metadata info for {}".format(label, self._dataset_name) 175 | dataset_id = self._contiguous_id_to_dataset_id[label] 176 | else: 177 | dataset_id = int(label) 178 | mask = (sem_seg == label).astype(np.uint8) 179 | mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0] 180 | mask_rle["counts"] = mask_rle["counts"].decode("utf-8") 181 | json_list.append( 182 | {"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle} 183 | ) 184 | return json_list 185 | -------------------------------------------------------------------------------- /sas_det/evaluation/testing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | import numpy as np 4 | import pprint 5 | import sys 6 | from collections.abc import Mapping 7 | 8 | 9 | def print_csv_format(results): 10 | """ 11 | Print main metrics in a format similar to Detectron, 12 | so that they are easy to copypaste into a spreadsheet. 13 | 14 | Args: 15 | results (OrderedDict[dict]): task_name -> {metric -> score} 16 | unordered dict can also be printed, but in arbitrary order 17 | """ 18 | assert isinstance(results, Mapping) or not len(results), results 19 | logger = logging.getLogger(__name__) 20 | for task, res in results.items(): 21 | if isinstance(res, Mapping): 22 | # Don't print "AP-category" metrics since they are usually not tracked. 23 | important_res = [(k, v) for k, v in res.items() if "-" not in k] 24 | logger.info("copypaste: Task: {}".format(task)) 25 | logger.info("copypaste: " + ",".join([k[0] for k in important_res])) 26 | logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res])) 27 | else: 28 | logger.info(f"copypaste: {task}={res}") 29 | 30 | 31 | def verify_results(cfg, results): 32 | """ 33 | Args: 34 | results (OrderedDict[dict]): task_name -> {metric -> score} 35 | 36 | Returns: 37 | bool: whether the verification succeeds or not 38 | """ 39 | expected_results = cfg.TEST.EXPECTED_RESULTS 40 | if not len(expected_results): 41 | return True 42 | 43 | ok = True 44 | for task, metric, expected, tolerance in expected_results: 45 | actual = results[task].get(metric, None) 46 | if actual is None: 47 | ok = False 48 | continue 49 | if not np.isfinite(actual): 50 | ok = False 51 | continue 52 | diff = abs(actual - expected) 53 | if diff > tolerance: 54 | ok = False 55 | 56 | logger = logging.getLogger(__name__) 57 | if not ok: 58 | logger.error("Result verification failed!") 59 | logger.error("Expected Results: " + str(expected_results)) 60 | logger.error("Actual Results: " + pprint.pformat(results)) 61 | 62 | sys.exit(1) 63 | else: 64 | logger.info("Results verification passed.") 65 | return ok 66 | 67 | 68 | def flatten_results_dict(results): 69 | """ 70 | Expand a hierarchical dict of scalars into a flat dict of scalars. 71 | If results[k1][k2][k3] = v, the returned dict will have the entry 72 | {"k1/k2/k3": v}. 
73 | 74 | Args: 75 | results (dict): 76 | """ 77 | r = {} 78 | for k, v in results.items(): 79 | if isinstance(v, Mapping): 80 | v = flatten_results_dict(v) 81 | for kk, vv in v.items(): 82 | r[k + "/" + kk] = vv 83 | else: 84 | r[k] = v 85 | return r 86 | -------------------------------------------------------------------------------- /sas_det/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) NEC Laboratories America, Inc. 2 | 3 | from .backbone import ( 4 | build_clip_language_encoder, 5 | get_clip_tokenzier, 6 | get_clip_image_transform, 7 | ) 8 | 9 | from .meta_arch import clip_rcnn as _ 10 | 11 | from .roi_heads import ( 12 | CLIPRes5ROIHeads, 13 | FastRCNNOutputLayers, 14 | ) 15 | -------------------------------------------------------------------------------- /sas_det/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) NEC Laboratories America, Inc. 2 | 3 | from .clip_backbone import ( 4 | build_clip_language_encoder, 5 | get_clip_tokenzier, 6 | get_clip_image_transform, 7 | ) -------------------------------------------------------------------------------- /sas_det/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) NEC Laboratories America, Inc. 2 | 3 | from .clip_roi_heads import ( 4 | CLIPRes5ROIHeads, 5 | # PretrainRes5ROIHeads, 6 | # CLIPStandardROIHeads, 7 | ) 8 | from .clip_roi_heads import FastRCNNOutputLayers 9 | 10 | __all__ = list(globals().keys()) 11 | -------------------------------------------------------------------------------- /test_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) NEC Laboratories America, Inc. 4 | """ 5 | A main training script. 6 | 7 | This script reads a given config file and runs the training or evaluation. 8 | It is an entry point that is made to train standard models in detectron2. 9 | 10 | In order to let one script support training of many models, 11 | this script contains logic that is specific to these built-in models and therefore 12 | may not be suitable for your own project. 13 | For example, your research project perhaps only needs a single "evaluator". 14 | 15 | Therefore, we recommend you use detectron2 as a library and take 16 | this file as an example of how to use the library. 17 | You may want to write your own script with your datasets and other customizations. 
18 | """ 19 | 20 | import logging 21 | import os 22 | from collections import OrderedDict 23 | import torch 24 | 25 | import detectron2.utils.comm as comm 26 | # from detectron2.checkpoint import DetectionCheckpointer 27 | from detectron2.config import get_cfg 28 | from detectron2.data import MetadataCatalog, build_detection_train_loader 29 | from detectron2.engine import default_argument_parser, default_setup, hooks, launch 30 | from detectron2.engine import DefaultTrainer # this may be modified by regionclip 31 | from detectron2.modeling import GeneralizedRCNNWithTTA 32 | 33 | from sas_det.evaluation import ( 34 | CityscapesInstanceEvaluator, 35 | CityscapesSemSegEvaluator, 36 | COCOEvaluator, 37 | COCOPanopticEvaluator, 38 | DatasetEvaluators, 39 | LVISEvaluator, 40 | PascalVOCDetectionEvaluator, 41 | SemSegEvaluator, 42 | verify_results, 43 | ) 44 | from sas_det.checkpoint import DetectionCheckpointer 45 | from sas_det import add_sas_det_config 46 | 47 | #os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 48 | import torch.multiprocessing 49 | torch.multiprocessing.set_sharing_strategy('file_system') 50 | 51 | class Trainer(DefaultTrainer): 52 | """ 53 | We use the "DefaultTrainer" which contains pre-defined default logic for 54 | standard training workflow. They may not work for you, especially if you 55 | are working on a new research project. In that case you can write your 56 | own training loop. You can use "tools/plain_train_net.py" as an example. 57 | """ 58 | 59 | @classmethod 60 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 61 | """ 62 | Create evaluator(s) for a given dataset. 63 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 64 | For your own dataset, you can simply create an evaluator manually in your 65 | script and do not have to worry about the hacky if-else logic here. 66 | """ 67 | if output_folder is None: 68 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 69 | evaluator_list = [] 70 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 71 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 72 | evaluator_list.append( 73 | SemSegEvaluator( 74 | dataset_name, 75 | distributed=True, 76 | output_dir=output_folder, 77 | ) 78 | ) 79 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 80 | evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder)) 81 | if evaluator_type == "coco_panoptic_seg": 82 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 83 | if evaluator_type == "cityscapes_instance": 84 | assert ( 85 | torch.cuda.device_count() >= comm.get_rank() 86 | ), "CityscapesEvaluator currently do not work with multiple machines." 87 | return CityscapesInstanceEvaluator(dataset_name) 88 | if evaluator_type == "cityscapes_sem_seg": 89 | assert ( 90 | torch.cuda.device_count() >= comm.get_rank() 91 | ), "CityscapesEvaluator currently do not work with multiple machines." 
92 | return CityscapesSemSegEvaluator(dataset_name) 93 | elif evaluator_type == "pascal_voc": 94 | return PascalVOCDetectionEvaluator(dataset_name) 95 | elif evaluator_type == "lvis": 96 | return LVISEvaluator(dataset_name, output_dir=output_folder) 97 | if len(evaluator_list) == 0: 98 | raise NotImplementedError( 99 | "no Evaluator for the dataset {} with the type {}".format( 100 | dataset_name, evaluator_type 101 | ) 102 | ) 103 | elif len(evaluator_list) == 1: 104 | return evaluator_list[0] 105 | return DatasetEvaluators(evaluator_list) 106 | 107 | @classmethod 108 | def test_with_TTA(cls, cfg, model): 109 | logger = logging.getLogger("detectron2.trainer") 110 | # In the end of training, run an evaluation with TTA 111 | # Only support some R-CNN models. 112 | logger.info("Running inference with test-time augmentation ...") 113 | model = GeneralizedRCNNWithTTA(cfg, model) 114 | evaluators = [ 115 | cls.build_evaluator( 116 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") 117 | ) 118 | for name in cfg.DATASETS.TEST 119 | ] 120 | res = cls.test(cfg, model, evaluators) 121 | res = OrderedDict({k + "_TTA": v for k, v in res.items()}) 122 | return res 123 | 124 | 125 | def periodic_update_teacher(trainer): 126 | update_steps = trainer.cfg.MODEL.OVD.PERIODIC_STEPS 127 | cur_iters = trainer.iter 128 | 129 | if cur_iters in update_steps: 130 | model = trainer.model 131 | if isinstance(model, torch.nn.parallel.DistributedDataParallel): 132 | # wrapped by DistributedDataParallel 133 | model.module.periodic_update_pairs() 134 | else: 135 | model.periodic_update_pairs() 136 | 137 | 138 | def setup(args): 139 | """ 140 | Create configs and perform basic setups. 141 | """ 142 | cfg = get_cfg() 143 | add_sas_det_config(cfg) # sas_det configs 144 | 145 | cfg.merge_from_file(args.config_file) 146 | cfg.merge_from_list(args.opts) 147 | cfg.freeze() 148 | default_setup(cfg, args) 149 | return cfg 150 | 151 | 152 | def main(args): 153 | cfg = setup(args) 154 | 155 | assert args.eval_only, "This release supports evaluation only." 
156 | if args.eval_only: 157 | model = Trainer.build_model(cfg) 158 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 159 | cfg.MODEL.WEIGHTS, resume=args.resume 160 | ) 161 | if cfg.MODEL.META_ARCHITECTURE in ['CLIPRCNN', 'CLIPFastRCNN', 'PretrainFastRCNN', 'WeakPretrainFastRCNN'] \ 162 | and cfg.MODEL.CLIP.BB_RPN_WEIGHTS is not None\ 163 | and cfg.MODEL.CLIP.CROP_REGION_TYPE == 'RPN': # load 2nd pretrained model 164 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR, bb_rpn_weights=True).resume_or_load( 165 | cfg.MODEL.CLIP.BB_RPN_WEIGHTS, resume=False 166 | ) 167 | res = Trainer.test(cfg, model) 168 | if cfg.TEST.AUG.ENABLED: 169 | res.update(Trainer.test_with_TTA(cfg, model)) 170 | if comm.is_main_process(): 171 | verify_results(cfg, res) 172 | return res 173 | 174 | 175 | if __name__ == "__main__": 176 | args = default_argument_parser().parse_args() 177 | print("Command Line Args:", args) 178 | launch( 179 | main, 180 | args.num_gpus, 181 | num_machines=args.num_machines, 182 | machine_rank=args.machine_rank, 183 | dist_url=args.dist_url, 184 | args=(args,), 185 | ) 186 | -------------------------------------------------------------------------------- /tools/offline_eval_onLVIS.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from lvis import LVIS 5 | from lvis import LVISEval, LVISResults 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(description='evaluate PLs quality offline') 9 | parser.add_argument('gt_json', type=str, help='gt coco json file') 10 | parser.add_argument('pl_json', type=str, help='PL coco json file') 11 | args = parser.parse_args() 12 | # print(args) 13 | 14 | ############################################# 15 | gt_LVISJson_file = args.gt_json 16 | pred_LvisJson_file = args.pl_json 17 | 18 | covert_to_result = True # True if .json in coco data format (not coco result format) 19 | 20 | ############################################# 21 | 22 | # load image list in gt_json 23 | lvis_gt = LVIS(gt_LVISJson_file) 24 | gt_img_ids = set(lvis_gt.get_img_ids()) 25 | 26 | if covert_to_result: 27 | PLData = json.load(open(pred_LvisJson_file, 'r')) 28 | PL_list = list() 29 | imageId_list = list() 30 | for anno in PLData['annotations']: 31 | cur_image_id = anno['image_id'] 32 | ## eval only on PLs 33 | if ("thing_isNovel" in anno.keys()) and anno['thing_isNovel'] and (cur_image_id in gt_img_ids): 34 | data = {'image_id': cur_image_id, 35 | 'category_id': anno['category_id'], 36 | 'bbox': anno['bbox'], 37 | 'score': anno['confidence']} 38 | PL_list.append(data) 39 | imageId_list.append(cur_image_id) 40 | # ## eval on all data (GT + PLs) 41 | # if cur_image_id in gt_img_ids: 42 | # data = {'image_id': cur_image_id, 43 | # 'category_id': anno['category_id'], 44 | # 'bbox': anno['bbox'], 45 | # 'score': anno['confidence']} 46 | # PL_list.append(data) 47 | # imageId_list.append(cur_image_id) 48 | 49 | print( 'Total PL boxes num: %d, avg num: %.2f\n' % (len(PL_list), len(PL_list)/len(set(imageId_list))) ) 50 | else: 51 | PL_list = json.load(open(pred_LvisJson_file, 'r')) 52 | 53 | # do evaluation 54 | lvis_results = LVISResults(lvis_gt, PL_list, max_dets=300) 55 | lvis_eval = LVISEval(lvis_gt, lvis_results, iou_type="bbox") 56 | lvis_eval.run() 57 | lvis_eval.print_results() 58 | -------------------------------------------------------------------------------- /tools/offline_eval_onO365.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from pycocotools.coco import COCO 5 | from pycocotools.cocoeval import COCOeval 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser(description='evaluate PLs quality offline') 10 | parser.add_argument('gt_json', type=str, help='gt coco json file') 11 | parser.add_argument('pl_json', type=str, help='PL coco json file') 12 | parser.add_argument('-r', '--raw', action='store_true') 13 | 14 | args = parser.parse_args() 15 | # print(args) 16 | 17 | ############################################# 18 | gt_COCOJson_file = args.gt_json 19 | pred_COCOJson_file = args.pl_json 20 | ############################################# 21 | 22 | # load image list in gt_json 23 | GtData = json.load(open(gt_COCOJson_file, 'r')) 24 | gt_img_ids = [x['id'] for x in GtData['images']] 25 | gt_img_ids = set(gt_img_ids) 26 | 27 | PLData = json.load(open(pred_COCOJson_file, 'r')) 28 | 29 | if args.raw: 30 | PL_list = PLData 31 | imageId_list = gt_img_ids 32 | else: 33 | PL_list = list() 34 | imageId_list = list() 35 | for anno in PLData['annotations']: 36 | cur_image_id = anno['image_id'] 37 | 38 | score = anno.get('confidence', None) 39 | if score is not None: 40 | # keep only annotations that carry a confidence, i.e. the PLs 41 | data = {'image_id': cur_image_id, 42 | 'category_id': anno['category_id'], 43 | 'bbox': anno['bbox'], 44 | 'score': score} 45 | PL_list.append(data) 46 | imageId_list.append(cur_image_id) 47 | 48 | # if args.raw: 49 | # # take all annos from PLs 50 | # data = {'image_id': cur_image_id, 51 | # 'category_id': anno['category_id'], 52 | # 'bbox': anno['bbox'], 53 | # 'score': anno['confidence']} 54 | # PL_list.append(data) 55 | # imageId_list.append(cur_image_id) 56 | # else: 57 | # if ("thing_isNovel" in anno.keys()) and anno['thing_isNovel'] and (cur_image_id in gt_img_ids): 58 | # data = {'image_id': cur_image_id, 59 | # 'category_id': anno['category_id'], 60 | # 'bbox': anno['bbox'], 61 | # 'score': anno['confidence']} 62 | # PL_list.append(data) 63 | # imageId_list.append(cur_image_id) 64 | 65 | print( 'Total PL boxes num: %d, avg num: %.2f\n' % (len(PL_list), len(PL_list)/len(set(imageId_list))) ) 66 | 67 | curSaveJson = './.temp.json' 68 | with open(curSaveJson, 'w') as outfile: 69 | json.dump(PL_list, outfile) 70 | 71 | cocoGt = COCO(gt_COCOJson_file) 72 | cocoDt = cocoGt.loadRes(curSaveJson) 73 | 74 | cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox') 75 | cocoEval.evaluate() 76 | cocoEval.accumulate() 77 | cocoEval.summarize() 78 | --------------------------------------------------------------------------------
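
Usage sketch for the two offline evaluation tools above (the dataset and prediction paths below are placeholders, not files shipped with this repo): both scripts take a ground-truth json and a pseudo-label json as positional arguments, and offline_eval_onO365.py additionally accepts --raw to indicate that the prediction json is already a flat COCO result list rather than a dataset-format json with an 'annotations' field.

    python tools/offline_eval_onLVIS.py datasets/lvis_v1_val.json output/pseudo_labels_lvis.json
    python tools/offline_eval_onO365.py datasets/o365_val.json output/pseudo_labels_o365_results.json --raw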