├── .gitignore ├── LICENSE ├── README.md ├── config └── DINO │ ├── DINO_4scale.py │ ├── DINO_4scale_convnext.py │ ├── DINO_4scale_swin.py │ ├── DINO_5scale.py │ └── coco_transformer.py ├── datasets ├── __init__.py ├── coco.py ├── coco_eval.py ├── coco_panoptic.py ├── data_util.py ├── dataset.py ├── panoptic_eval.py ├── random_crop.py ├── sltransform.py └── transforms.py ├── engine.py ├── figs ├── 12ep.png ├── 50ep.png ├── curve.png ├── dinosaur.png ├── framework.png ├── idea.jpg ├── sota.jpg └── sota_table.png ├── inference_and_visualization.ipynb ├── main.py ├── models ├── __init__.py ├── dino │ ├── __init__.py │ ├── attention.py │ ├── backbone.py │ ├── convnext.py │ ├── deformable_transformer.py │ ├── dino.py │ ├── dn_components.py │ ├── matcher.py │ ├── ops │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ ├── make.sh │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ ├── setup.py │ │ ├── src │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ ├── ms_deform_attn.h │ │ │ └── vision.cpp │ │ └── test.py │ ├── position_encoding.py │ ├── segmentation.py │ ├── swin_transformer.py │ ├── transformer_deformable.py │ └── utils.py └── registry.py ├── requirements.txt ├── run_with_submitit.py ├── scripts ├── DINO_eval.sh ├── DINO_eval_dist.sh ├── DINO_eval_submitit.sh ├── DINO_eval_submitit_5scale.sh ├── DINO_train.sh ├── DINO_train_convnext.sh ├── DINO_train_dist.sh ├── DINO_train_submitit.sh ├── DINO_train_submitit_5scale.sh ├── DINO_train_submitit_convnext.sh ├── DINO_train_submitit_swin.sh └── DINO_train_swin.sh ├── tools ├── README.md └── benchmark.py └── util ├── __init__.py ├── box_loss.py ├── box_ops.py ├── coco_id2name.json ├── get_param_dicts.py ├── logger.py ├── misc.py ├── plot_utils.py ├── slconfig.py ├── slio.py ├── static_data_path.py ├── time_counter.py ├── utils.py ├── vis_utils.py └── visualizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | .nfs* 2 | *.ipynb 3 | *.pyc 4 | .dumbo.json 5 | .DS_Store 6 | .*.swp 7 | *.pth 8 | **/__pycache__/** 9 | .ipynb_checkpoints/ 10 | datasets/data/ 11 | experiment-* 12 | *.tmp 13 | *.pkl 14 | **/.mypy_cache/* 15 | .mypy_cache/* 16 | not_tracked_dir/ 17 | .vscode 18 | logs 19 | jobs 20 | subs 21 | tmp 22 | *.sub 23 | vis/ 24 | model_zoo/ 25 | model_zoo_old/ 26 | scripts/ 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 IDEA 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | DAB-DETR(https://github.com/IDEA-Research/DAB-DETR) 204 | 205 | Copyright 2022 IDEA 206 | 207 | Licensed under the Apache License, Version 2.0 (the "License"); 208 | you may not use this file except in compliance with the License. 209 | You may obtain a copy of the License at 210 | 211 | http://www.apache.org/licenses/LICENSE-2.0 212 | 213 | Unless required by applicable law or agreed to in writing, software 214 | distributed under the License is distributed on an "AS IS" BASIS, 215 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 216 | See the License for the specific language governing permissions and 217 | limitations under the License. 218 | 219 | Conditional DETR(https://github.com/Atten4Vis/ConditionalDETR) 220 | 221 | Copyright 2021 Microsoft. 222 | 223 | Licensed under the Apache License, Version 2.0 (the "License"); 224 | you may not use this file except in compliance with the License. 225 | You may obtain a copy of the License at 226 | 227 | http://www.apache.org/licenses/LICENSE-2.0 228 | 229 | Unless required by applicable law or agreed to in writing, software 230 | distributed under the License is distributed on an "AS IS" BASIS, 231 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 232 | See the License for the specific language governing permissions and 233 | limitations under the License. 234 | 235 | 236 | Deformable DETR(https://github.com/fundamentalvision/Deformable-DETR) 237 | 238 | Copyright 2020 SenseTime 239 | 240 | Licensed under the Apache License, Version 2.0 (the "License"); 241 | you may not use this file except in compliance with the License. 242 | You may obtain a copy of the License at 243 | 244 | http://www.apache.org/licenses/LICENSE-2.0 245 | 246 | Unless required by applicable law or agreed to in writing, software 247 | distributed under the License is distributed on an "AS IS" BASIS, 248 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 249 | See the License for the specific language governing permissions and 250 | limitations under the License. 251 | 252 | 253 | DETR(https://github.com/facebookresearch/detr) 254 | 255 | Copyright 2020 - present, Facebook, Inc 256 | 257 | Licensed under the Apache License, Version 2.0 (the "License"); 258 | you may not use this file except in compliance with the License. 259 | You may obtain a copy of the License at 260 | 261 | http://www.apache.org/licenses/LICENSE-2.0 262 | 263 | Unless required by applicable law or agreed to in writing, software 264 | distributed under the License is distributed on an "AS IS" BASIS, 265 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 266 | See the License for the specific language governing permissions and 267 | limitations under the License. 
268 | -------------------------------------------------------------------------------- /config/DINO/DINO_4scale.py: -------------------------------------------------------------------------------- 1 | _base_ = ['coco_transformer.py'] 2 | 3 | num_classes=91 4 | 5 | lr = 0.0001 6 | param_dict_type = 'default' 7 | lr_backbone = 1e-05 8 | lr_backbone_names = ['backbone.0'] 9 | lr_linear_proj_names = ['reference_points', 'sampling_offsets'] 10 | lr_linear_proj_mult = 0.1 11 | ddetr_lr_param = False 12 | batch_size = 2 13 | weight_decay = 0.0001 14 | epochs = 12 15 | lr_drop = 11 16 | save_checkpoint_interval = 1 17 | clip_max_norm = 0.1 18 | onecyclelr = False 19 | multi_step_lr = False 20 | lr_drop_list = [33, 45] 21 | 22 | 23 | modelname = 'dino' 24 | frozen_weights = None 25 | backbone = 'resnet50' 26 | use_checkpoint = False 27 | 28 | dilation = False 29 | position_embedding = 'sine' 30 | pe_temperatureH = 20 31 | pe_temperatureW = 20 32 | return_interm_indices = [1, 2, 3] 33 | backbone_freeze_keywords = None 34 | enc_layers = 6 35 | dec_layers = 6 36 | unic_layers = 0 37 | pre_norm = False 38 | dim_feedforward = 2048 39 | hidden_dim = 256 40 | dropout = 0.0 41 | nheads = 8 42 | num_queries = 900 43 | query_dim = 4 44 | num_patterns = 0 45 | pdetr3_bbox_embed_diff_each_layer = False 46 | pdetr3_refHW = -1 47 | random_refpoints_xy = False 48 | fix_refpoints_hw = -1 49 | dabdetr_yolo_like_anchor_update = False 50 | dabdetr_deformable_encoder = False 51 | dabdetr_deformable_decoder = False 52 | use_deformable_box_attn = False 53 | box_attn_type = 'roi_align' 54 | dec_layer_number = None 55 | num_feature_levels = 4 56 | enc_n_points = 4 57 | dec_n_points = 4 58 | decoder_layer_noise = False 59 | dln_xy_noise = 0.2 60 | dln_hw_noise = 0.2 61 | add_channel_attention = False 62 | add_pos_value = False 63 | two_stage_type = 'standard' 64 | two_stage_pat_embed = 0 65 | two_stage_add_query_num = 0 66 | two_stage_bbox_embed_share = False 67 | two_stage_class_embed_share = False 68 | two_stage_learn_wh = False 69 | two_stage_default_hw = 0.05 70 | two_stage_keep_all_tokens = False 71 | num_select = 300 72 | transformer_activation = 'relu' 73 | batch_norm_type = 'FrozenBatchNorm2d' 74 | masks = False 75 | aux_loss = True 76 | set_cost_class = 2.0 77 | set_cost_bbox = 5.0 78 | set_cost_giou = 2.0 79 | cls_loss_coef = 1.0 80 | mask_loss_coef = 1.0 81 | dice_loss_coef = 1.0 82 | bbox_loss_coef = 5.0 83 | giou_loss_coef = 2.0 84 | enc_loss_coef = 1.0 85 | interm_loss_coef = 1.0 86 | no_interm_box_loss = False 87 | focal_alpha = 0.25 88 | 89 | decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content'] 90 | matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher 91 | decoder_module_seq = ['sa', 'ca', 'ffn'] 92 | nms_iou_threshold = -1 93 | 94 | dec_pred_bbox_embed_share = True 95 | dec_pred_class_embed_share = True 96 | 97 | # for dn 98 | use_dn = True 99 | dn_number = 100 100 | dn_box_noise_scale = 0.4 101 | dn_label_noise_ratio = 0.5 102 | embed_init_tgt = True 103 | dn_labelbook_size = 91 104 | 105 | match_unstable_error = True 106 | 107 | # for ema 108 | use_ema = False 109 | ema_decay = 0.9997 110 | ema_epoch = 0 111 | 112 | use_detached_boxes_dec_out = False 113 | 114 | -------------------------------------------------------------------------------- /config/DINO/DINO_4scale_convnext.py: -------------------------------------------------------------------------------- 1 | _base_ = ['coco_transformer.py'] 2 | 3 | num_classes=91 4 | 5 | lr = 0.0001 6 | param_dict_type = 'default' 7 | lr_backbone = 1e-05 8 
| lr_backbone_names = ['backbone.0'] 9 | lr_linear_proj_names = ['reference_points', 'sampling_offsets'] 10 | lr_linear_proj_mult = 0.1 11 | ddetr_lr_param = False 12 | batch_size = 2 13 | weight_decay = 0.0001 14 | epochs = 12 15 | lr_drop = 11 16 | save_checkpoint_interval = 1 17 | clip_max_norm = 0.1 18 | onecyclelr = False 19 | multi_step_lr = False 20 | lr_drop_list = [33, 45] 21 | 22 | 23 | modelname = 'dino' 24 | frozen_weights = None 25 | backbone = 'convnext_xlarge_22k' 26 | use_checkpoint = False 27 | 28 | dilation = False 29 | position_embedding = 'sine' 30 | pe_temperatureH = 20 31 | pe_temperatureW = 20 32 | return_interm_indices = [1, 2, 3] 33 | backbone_freeze_keywords = None 34 | enc_layers = 6 35 | dec_layers = 6 36 | unic_layers = 0 37 | pre_norm = False 38 | dim_feedforward = 2048 39 | hidden_dim = 256 40 | dropout = 0.0 41 | nheads = 8 42 | num_queries = 900 43 | query_dim = 4 44 | num_patterns = 0 45 | pdetr3_bbox_embed_diff_each_layer = False 46 | pdetr3_refHW = -1 47 | random_refpoints_xy = False 48 | fix_refpoints_hw = -1 49 | dabdetr_yolo_like_anchor_update = False 50 | dabdetr_deformable_encoder = False 51 | dabdetr_deformable_decoder = False 52 | use_deformable_box_attn = False 53 | box_attn_type = 'roi_align' 54 | dec_layer_number = None 55 | num_feature_levels = 4 56 | enc_n_points = 4 57 | dec_n_points = 4 58 | decoder_layer_noise = False 59 | dln_xy_noise = 0.2 60 | dln_hw_noise = 0.2 61 | add_channel_attention = False 62 | add_pos_value = False 63 | two_stage_type = 'standard' 64 | two_stage_pat_embed = 0 65 | two_stage_add_query_num = 0 66 | two_stage_bbox_embed_share = False 67 | two_stage_class_embed_share = False 68 | two_stage_learn_wh = False 69 | two_stage_default_hw = 0.05 70 | two_stage_keep_all_tokens = False 71 | num_select = 300 72 | transformer_activation = 'relu' 73 | batch_norm_type = 'FrozenBatchNorm2d' 74 | masks = False 75 | aux_loss = True 76 | set_cost_class = 2.0 77 | set_cost_bbox = 5.0 78 | set_cost_giou = 2.0 79 | cls_loss_coef = 1.0 80 | mask_loss_coef = 1.0 81 | dice_loss_coef = 1.0 82 | bbox_loss_coef = 5.0 83 | giou_loss_coef = 2.0 84 | enc_loss_coef = 1.0 85 | interm_loss_coef = 1.0 86 | no_interm_box_loss = False 87 | focal_alpha = 0.25 88 | 89 | decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content'] 90 | matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher 91 | decoder_module_seq = ['sa', 'ca', 'ffn'] 92 | nms_iou_threshold = -1 93 | 94 | dec_pred_bbox_embed_share = True 95 | dec_pred_class_embed_share = True 96 | 97 | # for dn 98 | use_dn = True 99 | dn_number = 100 100 | dn_box_noise_scale = 0.4 101 | dn_label_noise_ratio = 0.5 102 | embed_init_tgt = True 103 | dn_labelbook_size = 91 104 | 105 | match_unstable_error = True 106 | 107 | # for ema 108 | use_ema = False 109 | ema_decay = 0.9997 110 | ema_epoch = 0 111 | 112 | use_detached_boxes_dec_out = False 113 | 114 | -------------------------------------------------------------------------------- /config/DINO/DINO_4scale_swin.py: -------------------------------------------------------------------------------- 1 | _base_ = ['coco_transformer.py'] 2 | 3 | num_classes=91 4 | 5 | lr = 0.0001 6 | param_dict_type = 'default' 7 | lr_backbone = 1e-05 8 | lr_backbone_names = ['backbone.0'] 9 | lr_linear_proj_names = ['reference_points', 'sampling_offsets'] 10 | lr_linear_proj_mult = 0.1 11 | ddetr_lr_param = False 12 | batch_size = 2 13 | weight_decay = 0.0001 14 | epochs = 12 15 | lr_drop = 11 16 | save_checkpoint_interval = 1 17 | clip_max_norm = 0.1 18 | onecyclelr = 
False 19 | multi_step_lr = False 20 | lr_drop_list = [33, 45] 21 | 22 | 23 | modelname = 'dino' 24 | frozen_weights = None 25 | backbone = 'swin_L_384_22k' 26 | use_checkpoint = True 27 | 28 | dilation = False 29 | position_embedding = 'sine' 30 | pe_temperatureH = 20 31 | pe_temperatureW = 20 32 | return_interm_indices = [1, 2, 3] 33 | backbone_freeze_keywords = None 34 | enc_layers = 6 35 | dec_layers = 6 36 | unic_layers = 0 37 | pre_norm = False 38 | dim_feedforward = 2048 39 | hidden_dim = 256 40 | dropout = 0.0 41 | nheads = 8 42 | num_queries = 900 43 | query_dim = 4 44 | num_patterns = 0 45 | pdetr3_bbox_embed_diff_each_layer = False 46 | pdetr3_refHW = -1 47 | random_refpoints_xy = False 48 | fix_refpoints_hw = -1 49 | dabdetr_yolo_like_anchor_update = False 50 | dabdetr_deformable_encoder = False 51 | dabdetr_deformable_decoder = False 52 | use_deformable_box_attn = False 53 | box_attn_type = 'roi_align' 54 | dec_layer_number = None 55 | num_feature_levels = 4 56 | enc_n_points = 4 57 | dec_n_points = 4 58 | decoder_layer_noise = False 59 | dln_xy_noise = 0.2 60 | dln_hw_noise = 0.2 61 | add_channel_attention = False 62 | add_pos_value = False 63 | two_stage_type = 'standard' 64 | two_stage_pat_embed = 0 65 | two_stage_add_query_num = 0 66 | two_stage_bbox_embed_share = False 67 | two_stage_class_embed_share = False 68 | two_stage_learn_wh = False 69 | two_stage_default_hw = 0.05 70 | two_stage_keep_all_tokens = False 71 | num_select = 300 72 | transformer_activation = 'relu' 73 | batch_norm_type = 'FrozenBatchNorm2d' 74 | masks = False 75 | aux_loss = True 76 | set_cost_class = 2.0 77 | set_cost_bbox = 5.0 78 | set_cost_giou = 2.0 79 | cls_loss_coef = 1.0 80 | mask_loss_coef = 1.0 81 | dice_loss_coef = 1.0 82 | bbox_loss_coef = 5.0 83 | giou_loss_coef = 2.0 84 | enc_loss_coef = 1.0 85 | interm_loss_coef = 1.0 86 | no_interm_box_loss = False 87 | focal_alpha = 0.25 88 | 89 | decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content'] 90 | matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher 91 | decoder_module_seq = ['sa', 'ca', 'ffn'] 92 | nms_iou_threshold = -1 93 | 94 | dec_pred_bbox_embed_share = True 95 | dec_pred_class_embed_share = True 96 | 97 | # for dn 98 | use_dn = True 99 | dn_number = 100 100 | dn_box_noise_scale = 0.4 101 | dn_label_noise_ratio = 0.5 102 | embed_init_tgt = True 103 | dn_labelbook_size = 91 104 | 105 | match_unstable_error = True 106 | 107 | # for ema 108 | use_ema = False 109 | ema_decay = 0.9997 110 | ema_epoch = 0 111 | 112 | use_detached_boxes_dec_out = False 113 | 114 | -------------------------------------------------------------------------------- /config/DINO/DINO_5scale.py: -------------------------------------------------------------------------------- 1 | _base_ = ['coco_transformer.py'] 2 | 3 | num_classes=91 4 | 5 | lr = 0.0001 6 | param_dict_type = 'default' 7 | lr_backbone = 1e-05 8 | lr_backbone_names = ['backbone.0'] 9 | lr_linear_proj_names = ['reference_points', 'sampling_offsets'] 10 | lr_linear_proj_mult = 0.1 11 | ddetr_lr_param = False 12 | batch_size = 1 13 | weight_decay = 0.0001 14 | epochs = 12 15 | lr_drop = 11 16 | save_checkpoint_interval = 1 17 | clip_max_norm = 0.1 18 | onecyclelr = False 19 | multi_step_lr = False 20 | lr_drop_list = [33, 45] 21 | 22 | 23 | modelname = 'dino' 24 | frozen_weights = None 25 | backbone = 'resnet50' 26 | use_checkpoint = False 27 | 28 | dilation = False 29 | position_embedding = 'sine' 30 | pe_temperatureH = 20 31 | pe_temperatureW = 20 32 | return_interm_indices = [0, 1, 2, 3] 33 | 
backbone_freeze_keywords = None 34 | enc_layers = 6 35 | dec_layers = 6 36 | unic_layers = 0 37 | pre_norm = False 38 | dim_feedforward = 2048 39 | hidden_dim = 256 40 | dropout = 0.0 41 | nheads = 8 42 | num_queries = 900 43 | query_dim = 4 44 | num_patterns = 0 45 | pdetr3_bbox_embed_diff_each_layer = False 46 | pdetr3_refHW = -1 47 | random_refpoints_xy = False 48 | fix_refpoints_hw = -1 49 | dabdetr_yolo_like_anchor_update = False 50 | dabdetr_deformable_encoder = False 51 | dabdetr_deformable_decoder = False 52 | use_deformable_box_attn = False 53 | box_attn_type = 'roi_align' 54 | dec_layer_number = None 55 | num_feature_levels = 5 56 | enc_n_points = 4 57 | dec_n_points = 4 58 | decoder_layer_noise = False 59 | dln_xy_noise = 0.2 60 | dln_hw_noise = 0.2 61 | add_channel_attention = False 62 | add_pos_value = False 63 | two_stage_type = 'standard' 64 | two_stage_pat_embed = 0 65 | two_stage_add_query_num = 0 66 | two_stage_bbox_embed_share = False 67 | two_stage_class_embed_share = False 68 | two_stage_learn_wh = False 69 | two_stage_default_hw = 0.05 70 | two_stage_keep_all_tokens = False 71 | num_select = 300 72 | transformer_activation = 'relu' 73 | batch_norm_type = 'FrozenBatchNorm2d' 74 | masks = False 75 | aux_loss = True 76 | set_cost_class = 2.0 77 | set_cost_bbox = 5.0 78 | set_cost_giou = 2.0 79 | cls_loss_coef = 1.0 80 | mask_loss_coef = 1.0 81 | dice_loss_coef = 1.0 82 | bbox_loss_coef = 5.0 83 | giou_loss_coef = 2.0 84 | enc_loss_coef = 1.0 85 | interm_loss_coef = 1.0 86 | no_interm_box_loss = False 87 | focal_alpha = 0.25 88 | 89 | decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content'] 90 | matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher 91 | decoder_module_seq = ['sa', 'ca', 'ffn'] 92 | nms_iou_threshold = -1 93 | 94 | dec_pred_bbox_embed_share = True 95 | dec_pred_class_embed_share = True 96 | 97 | # for dn 98 | use_dn = True 99 | dn_number = 100 100 | dn_box_noise_scale = 0.4 101 | dn_label_noise_ratio = 0.5 102 | embed_init_tgt = True 103 | dn_labelbook_size = 91 104 | 105 | match_unstable_error = True 106 | 107 | # for ema 108 | use_ema = False 109 | ema_decay = 0.9997 110 | ema_epoch = 0 111 | 112 | use_detached_boxes_dec_out = False 113 | 114 | -------------------------------------------------------------------------------- /config/DINO/coco_transformer.py: -------------------------------------------------------------------------------- 1 | data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] 2 | data_aug_max_size = 1333 3 | data_aug_scales2_resize = [400, 500, 600] 4 | data_aug_scales2_crop = [384, 600] 5 | 6 | 7 | data_aug_scale_overlap = None 8 | 9 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import torch.utils.data 3 | import torchvision 4 | 5 | from .coco import build as build_coco 6 | 7 | 8 | def get_coco_api_from_dataset(dataset): 9 | for _ in range(10): 10 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 11 | # break 12 | if isinstance(dataset, torch.utils.data.Subset): 13 | dataset = dataset.dataset 14 | if isinstance(dataset, torchvision.datasets.CocoDetection): 15 | return dataset.coco 16 | 17 | 18 | def build_dataset(image_set, args): 19 | if args.dataset_file == 'coco': 20 | return build_coco(image_set, args) 21 | if args.dataset_file == 'coco_panoptic': 22 | # to avoid making panopticapi required for coco 23 | from .coco_panoptic import build as build_coco_panoptic 24 | return build_coco_panoptic(image_set, args) 25 | if args.dataset_file == 'o365': 26 | from .o365 import build_o365_combine 27 | return build_o365_combine(image_set, args) 28 | if args.dataset_file == 'vanke': 29 | from .vanke import build_vanke 30 | return build_vanke(image_set, args) 31 | raise ValueError(f'dataset {args.dataset_file} not supported') 32 | -------------------------------------------------------------------------------- /datasets/coco_eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | COCO evaluator that works in distributed mode. 4 | 5 | Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py 6 | The difference is that there is less copy-pasting from pycocotools 7 | in the end of the file, as python3 can suppress prints with contextlib 8 | """ 9 | import os 10 | import contextlib 11 | import copy 12 | import numpy as np 13 | import torch 14 | 15 | from pycocotools.cocoeval import COCOeval 16 | from pycocotools.coco import COCO 17 | import pycocotools.mask as mask_util 18 | 19 | from util.misc import all_gather 20 | 21 | 22 | class CocoEvaluator(object): 23 | def __init__(self, coco_gt, iou_types, useCats=True): 24 | assert isinstance(iou_types, (list, tuple)) 25 | coco_gt = copy.deepcopy(coco_gt) 26 | self.coco_gt = coco_gt 27 | 28 | self.iou_types = iou_types 29 | self.coco_eval = {} 30 | for iou_type in iou_types: 31 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) 32 | self.coco_eval[iou_type].useCats = useCats 33 | 34 | self.img_ids = [] 35 | self.eval_imgs = {k: [] for k in iou_types} 36 | self.useCats = useCats 37 | 38 | def update(self, predictions): 39 | img_ids = list(np.unique(list(predictions.keys()))) 40 | self.img_ids.extend(img_ids) 41 | 42 | for iou_type in self.iou_types: 43 | results = self.prepare(predictions, iou_type) 44 | 45 | # suppress pycocotools prints 46 | with open(os.devnull, 'w') as devnull: 47 | with contextlib.redirect_stdout(devnull): 48 | coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() 49 | coco_eval = self.coco_eval[iou_type] 50 | 51 | coco_eval.cocoDt = coco_dt 52 | coco_eval.params.imgIds = list(img_ids) 53 | coco_eval.params.useCats = self.useCats 54 | img_ids, eval_imgs = evaluate(coco_eval) 55 | 56 | self.eval_imgs[iou_type].append(eval_imgs) 57 | 58 | def synchronize_between_processes(self): 59 | for iou_type in self.iou_types: 60 | self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) 61 | create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) 62 | 63 | def accumulate(self): 64 | for coco_eval in self.coco_eval.values(): 65 | coco_eval.accumulate() 
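    # A minimal usage sketch of this evaluator, assuming `coco_gt` is a pycocotools COCO
    # object for the ground truth and `predictions` maps image_id to a dict with "boxes"
    # (xyxy), "scores", and "labels", as produced by the model's postprocessor:
    #
    #   evaluator = CocoEvaluator(coco_gt, iou_types=["bbox"])
    #   evaluator.update(predictions)               # once per evaluated batch
    #   evaluator.synchronize_between_processes()   # gather per-image results across workers
    #   evaluator.accumulate()
    #   evaluator.summarize()                       # prints the standard COCO AP/AR table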
66 | 67 | def summarize(self): 68 | for iou_type, coco_eval in self.coco_eval.items(): 69 | print("IoU metric: {}".format(iou_type)) 70 | coco_eval.summarize() 71 | 72 | def prepare(self, predictions, iou_type): 73 | if iou_type == "bbox": 74 | return self.prepare_for_coco_detection(predictions) 75 | elif iou_type == "segm": 76 | return self.prepare_for_coco_segmentation(predictions) 77 | elif iou_type == "keypoints": 78 | return self.prepare_for_coco_keypoint(predictions) 79 | else: 80 | raise ValueError("Unknown iou type {}".format(iou_type)) 81 | 82 | def prepare_for_coco_detection(self, predictions): 83 | coco_results = [] 84 | for original_id, prediction in predictions.items(): 85 | if len(prediction) == 0: 86 | continue 87 | 88 | boxes = prediction["boxes"] 89 | boxes = convert_to_xywh(boxes).tolist() 90 | if not isinstance(prediction["scores"], list): 91 | scores = prediction["scores"].tolist() 92 | else: 93 | scores = prediction["scores"] 94 | if not isinstance(prediction["labels"], list): 95 | labels = prediction["labels"].tolist() 96 | else: 97 | labels = prediction["labels"] 98 | 99 | 100 | try: 101 | coco_results.extend( 102 | [ 103 | { 104 | "image_id": original_id, 105 | "category_id": labels[k], 106 | "bbox": box, 107 | "score": scores[k], 108 | } 109 | for k, box in enumerate(boxes) 110 | ] 111 | ) 112 | except: 113 | import ipdb; ipdb.set_trace() 114 | return coco_results 115 | 116 | def prepare_for_coco_segmentation(self, predictions): 117 | coco_results = [] 118 | for original_id, prediction in predictions.items(): 119 | if len(prediction) == 0: 120 | continue 121 | 122 | scores = prediction["scores"] 123 | labels = prediction["labels"] 124 | masks = prediction["masks"] 125 | 126 | masks = masks > 0.5 127 | 128 | scores = prediction["scores"].tolist() 129 | labels = prediction["labels"].tolist() 130 | 131 | rles = [ 132 | mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] 133 | for mask in masks 134 | ] 135 | for rle in rles: 136 | rle["counts"] = rle["counts"].decode("utf-8") 137 | 138 | coco_results.extend( 139 | [ 140 | { 141 | "image_id": original_id, 142 | "category_id": labels[k], 143 | "segmentation": rle, 144 | "score": scores[k], 145 | } 146 | for k, rle in enumerate(rles) 147 | ] 148 | ) 149 | return coco_results 150 | 151 | def prepare_for_coco_keypoint(self, predictions): 152 | coco_results = [] 153 | for original_id, prediction in predictions.items(): 154 | if len(prediction) == 0: 155 | continue 156 | 157 | boxes = prediction["boxes"] 158 | boxes = convert_to_xywh(boxes).tolist() 159 | scores = prediction["scores"].tolist() 160 | labels = prediction["labels"].tolist() 161 | keypoints = prediction["keypoints"] 162 | keypoints = keypoints.flatten(start_dim=1).tolist() 163 | 164 | coco_results.extend( 165 | [ 166 | { 167 | "image_id": original_id, 168 | "category_id": labels[k], 169 | 'keypoints': keypoint, 170 | "score": scores[k], 171 | } 172 | for k, keypoint in enumerate(keypoints) 173 | ] 174 | ) 175 | return coco_results 176 | 177 | 178 | def convert_to_xywh(boxes): 179 | xmin, ymin, xmax, ymax = boxes.unbind(1) 180 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) 181 | 182 | 183 | def merge(img_ids, eval_imgs): 184 | all_img_ids = all_gather(img_ids) 185 | all_eval_imgs = all_gather(eval_imgs) 186 | 187 | merged_img_ids = [] 188 | for p in all_img_ids: 189 | merged_img_ids.extend(p) 190 | 191 | merged_eval_imgs = [] 192 | for p in all_eval_imgs: 193 | merged_eval_imgs.append(p) 194 | 195 | 
merged_img_ids = np.array(merged_img_ids) 196 | merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) 197 | 198 | # keep only unique (and in sorted order) images 199 | merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) 200 | merged_eval_imgs = merged_eval_imgs[..., idx] 201 | 202 | return merged_img_ids, merged_eval_imgs 203 | 204 | 205 | def create_common_coco_eval(coco_eval, img_ids, eval_imgs): 206 | img_ids, eval_imgs = merge(img_ids, eval_imgs) 207 | img_ids = list(img_ids) 208 | eval_imgs = list(eval_imgs.flatten()) 209 | 210 | coco_eval.evalImgs = eval_imgs 211 | coco_eval.params.imgIds = img_ids 212 | coco_eval._paramsEval = copy.deepcopy(coco_eval.params) 213 | 214 | 215 | ################################################################# 216 | # From pycocotools, just removed the prints and fixed 217 | # a Python3 bug about unicode not defined 218 | ################################################################# 219 | 220 | 221 | def evaluate(self): 222 | ''' 223 | Run per image evaluation on given images and store results (a list of dict) in self.evalImgs 224 | :return: None 225 | ''' 226 | p = self.params 227 | # add backward compatibility if useSegm is specified in params 228 | if p.useSegm is not None: 229 | p.iouType = 'segm' if p.useSegm == 1 else 'bbox' 230 | print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) 231 | p.imgIds = list(np.unique(p.imgIds)) 232 | if p.useCats: 233 | p.catIds = list(np.unique(p.catIds)) 234 | p.maxDets = sorted(p.maxDets) 235 | self.params = p 236 | 237 | self._prepare() 238 | # loop through images, area range, max detection number 239 | catIds = p.catIds if p.useCats else [-1] 240 | 241 | if p.iouType == 'segm' or p.iouType == 'bbox': 242 | computeIoU = self.computeIoU 243 | elif p.iouType == 'keypoints': 244 | computeIoU = self.computeOks 245 | self.ious = { 246 | (imgId, catId): computeIoU(imgId, catId) 247 | for imgId in p.imgIds 248 | for catId in catIds} 249 | 250 | evaluateImg = self.evaluateImg 251 | maxDet = p.maxDets[-1] 252 | evalImgs = [ 253 | evaluateImg(imgId, catId, areaRng, maxDet) 254 | for catId in catIds 255 | for areaRng in p.areaRng 256 | for imgId in p.imgIds 257 | ] 258 | # this is NOT in the pycocotools code, but could be done outside 259 | evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) 260 | self._paramsEval = copy.deepcopy(self.params) 261 | 262 | return p.imgIds, evalImgs 263 | 264 | ################################################################# 265 | # end of straight copy from pycocotools, just removing the prints 266 | ################################################################# 267 | -------------------------------------------------------------------------------- /datasets/coco_panoptic.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import json 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import torch 7 | from PIL import Image 8 | 9 | from panopticapi.utils import rgb2id 10 | from util.box_ops import masks_to_boxes 11 | 12 | from .coco import make_coco_transforms 13 | 14 | 15 | class CocoPanoptic: 16 | def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): 17 | with open(ann_file, 'r') as f: 18 | self.coco = json.load(f) 19 | 20 | # sort 'images' field so that they are aligned with 'annotations' 21 | # i.e., in alphabetical order 22 | self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) 23 | # sanity check 24 | if "annotations" in self.coco: 25 | for img, ann in zip(self.coco['images'], self.coco['annotations']): 26 | assert img['file_name'][:-4] == ann['file_name'][:-4] 27 | 28 | self.img_folder = img_folder 29 | self.ann_folder = ann_folder 30 | self.ann_file = ann_file 31 | self.transforms = transforms 32 | self.return_masks = return_masks 33 | 34 | def __getitem__(self, idx): 35 | ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] 36 | img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') 37 | ann_path = Path(self.ann_folder) / ann_info['file_name'] 38 | 39 | img = Image.open(img_path).convert('RGB') 40 | w, h = img.size 41 | if "segments_info" in ann_info: 42 | masks = np.asarray(Image.open(ann_path), dtype=np.uint32) 43 | masks = rgb2id(masks) 44 | 45 | ids = np.array([ann['id'] for ann in ann_info['segments_info']]) 46 | masks = masks == ids[:, None, None] 47 | 48 | masks = torch.as_tensor(masks, dtype=torch.uint8) 49 | labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) 50 | 51 | target = {} 52 | target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) 53 | if self.return_masks: 54 | target['masks'] = masks 55 | target['labels'] = labels 56 | 57 | target["boxes"] = masks_to_boxes(masks) 58 | 59 | target['size'] = torch.as_tensor([int(h), int(w)]) 60 | target['orig_size'] = torch.as_tensor([int(h), int(w)]) 61 | if "segments_info" in ann_info: 62 | for name in ['iscrowd', 'area']: 63 | target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) 64 | 65 | if self.transforms is not None: 66 | img, target = self.transforms(img, target) 67 | 68 | return img, target 69 | 70 | def __len__(self): 71 | return len(self.coco['images']) 72 | 73 | def get_height_and_width(self, idx): 74 | img_info = self.coco['images'][idx] 75 | height = img_info['height'] 76 | width = img_info['width'] 77 | return height, width 78 | 79 | 80 | def build(image_set, args): 81 | img_folder_root = Path(args.coco_path) 82 | ann_folder_root = Path(args.coco_panoptic_path) 83 | assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' 84 | assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' 85 | mode = 'panoptic' 86 | PATHS = { 87 | "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), 88 | "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), 89 | } 90 | 91 | img_folder, ann_file = PATHS[image_set] 92 | img_folder_path = img_folder_root / img_folder 93 | ann_folder = ann_folder_root / f'{mode}_{img_folder}' 94 | ann_file = ann_folder_root / ann_file 95 | 96 | dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file, 97 | transforms=make_coco_transforms(image_set), 
return_masks=args.masks) 98 | 99 | return dataset 100 | -------------------------------------------------------------------------------- /datasets/data_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import shutil 4 | import time 5 | import datetime 6 | 7 | import torch 8 | 9 | from util.slconfig import SLConfig 10 | 11 | class Error(OSError): 12 | pass 13 | 14 | def slcopytree(src, dst, symlinks=False, ignore=None, copy_function=shutil.copyfile, 15 | ignore_dangling_symlinks=False): 16 | """ 17 | modified from shutil.copytree without copystat. 18 | 19 | Recursively copy a directory tree. 20 | 21 | The destination directory must not already exist. 22 | If exception(s) occur, an Error is raised with a list of reasons. 23 | 24 | If the optional symlinks flag is true, symbolic links in the 25 | source tree result in symbolic links in the destination tree; if 26 | it is false, the contents of the files pointed to by symbolic 27 | links are copied. If the file pointed by the symlink doesn't 28 | exist, an exception will be added in the list of errors raised in 29 | an Error exception at the end of the copy process. 30 | 31 | You can set the optional ignore_dangling_symlinks flag to true if you 32 | want to silence this exception. Notice that this has no effect on 33 | platforms that don't support os.symlink. 34 | 35 | The optional ignore argument is a callable. If given, it 36 | is called with the `src` parameter, which is the directory 37 | being visited by copytree(), and `names` which is the list of 38 | `src` contents, as returned by os.listdir(): 39 | 40 | callable(src, names) -> ignored_names 41 | 42 | Since copytree() is called recursively, the callable will be 43 | called once for each directory that is copied. It returns a 44 | list of names relative to the `src` directory that should 45 | not be copied. 46 | 47 | The optional copy_function argument is a callable that will be used 48 | to copy each file. It will be called with the source path and the 49 | destination path as arguments. By default, copy2() is used, but any 50 | function that supports the same signature (like copy()) can be used. 51 | 52 | """ 53 | errors = [] 54 | if os.path.isdir(src): 55 | names = os.listdir(src) 56 | if ignore is not None: 57 | ignored_names = ignore(src, names) 58 | else: 59 | ignored_names = set() 60 | 61 | os.makedirs(dst) 62 | for name in names: 63 | if name in ignored_names: 64 | continue 65 | srcname = os.path.join(src, name) 66 | dstname = os.path.join(dst, name) 67 | try: 68 | if os.path.islink(srcname): 69 | linkto = os.readlink(srcname) 70 | if symlinks: 71 | # We can't just leave it to `copy_function` because legacy 72 | # code with a custom `copy_function` may rely on copytree 73 | # doing the right thing. 74 | os.symlink(linkto, dstname) 75 | else: 76 | # ignore dangling symlink if the flag is on 77 | if not os.path.exists(linkto) and ignore_dangling_symlinks: 78 | continue 79 | # otherwise let the copy occurs. 
copy2 will raise an error 80 | if os.path.isdir(srcname): 81 | slcopytree(srcname, dstname, symlinks, ignore, 82 | copy_function) 83 | else: 84 | copy_function(srcname, dstname) 85 | elif os.path.isdir(srcname): 86 | slcopytree(srcname, dstname, symlinks, ignore, copy_function) 87 | else: 88 | # Will raise a SpecialFileError for unsupported file types 89 | copy_function(srcname, dstname) 90 | # catch the Error from the recursive copytree so that we can 91 | # continue with other files 92 | except Error as err: 93 | errors.extend(err.args[0]) 94 | except OSError as why: 95 | errors.append((srcname, dstname, str(why))) 96 | else: 97 | copy_function(src, dst) 98 | 99 | if errors: 100 | raise Error(errors) 101 | return dst 102 | 103 | def check_and_copy(src_path, tgt_path): 104 | if os.path.exists(tgt_path): 105 | return None 106 | 107 | return slcopytree(src_path, tgt_path) 108 | 109 | 110 | def remove(srcpath): 111 | if os.path.isdir(srcpath): 112 | return shutil.rmtree(srcpath) 113 | else: 114 | return os.remove(srcpath) 115 | 116 | 117 | def preparing_dataset(pathdict, image_set, args): 118 | start_time = time.time() 119 | dataset_file = args.dataset_file 120 | data_static_info = SLConfig.fromfile('util/static_data_path.py') 121 | static_dict = data_static_info[dataset_file][image_set] 122 | 123 | copyfilelist = [] 124 | for k,tgt_v in pathdict.items(): 125 | if os.path.exists(tgt_v): 126 | if args.local_rank == 0: 127 | print("path <{}> exist. remove it!".format(tgt_v)) 128 | remove(tgt_v) 129 | # continue 130 | 131 | if args.local_rank == 0: 132 | src_v = static_dict[k] 133 | assert isinstance(src_v, str) 134 | if src_v.endswith('.zip'): 135 | # copy 136 | cp_tgt_dir = os.path.dirname(tgt_v) 137 | filename = os.path.basename(src_v) 138 | cp_tgt_path = os.path.join(cp_tgt_dir, filename) 139 | print('Copy from <{}> to <{}>.'.format(src_v, cp_tgt_path)) 140 | os.makedirs(cp_tgt_dir, exist_ok=True) 141 | check_and_copy(src_v, cp_tgt_path) 142 | 143 | # unzip 144 | import zipfile 145 | print("Starting unzip <{}>".format(cp_tgt_path)) 146 | with zipfile.ZipFile(cp_tgt_path, 'r') as zip_ref: 147 | zip_ref.extractall(os.path.dirname(cp_tgt_path)) 148 | 149 | copyfilelist.append(cp_tgt_path) 150 | copyfilelist.append(tgt_v) 151 | else: 152 | print('Copy from <{}> to <{}>.'.format(src_v, tgt_v)) 153 | os.makedirs(os.path.dirname(tgt_v), exist_ok=True) 154 | check_and_copy(src_v, tgt_v) 155 | copyfilelist.append(tgt_v) 156 | 157 | if len(copyfilelist) == 0: 158 | copyfilelist = None 159 | args.copyfilelist = copyfilelist 160 | 161 | if args.distributed: 162 | torch.distributed.barrier() 163 | total_time = time.time() - start_time 164 | if copyfilelist: 165 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 166 | print('Data copy time {}'.format(total_time_str)) 167 | return copyfilelist 168 | 169 | 170 | -------------------------------------------------------------------------------- /datasets/dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import torch 4 | import torchvision.datasets as datasets 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | from .tsv_io import TSVFile 8 | import numpy as np 9 | import base64 10 | import io 11 | 12 | 13 | class TSVDataset(Dataset): 14 | """ TSV dataset for ImageNet 1K training 15 | """ 16 | def __init__(self, tsv_file, transform=None, target_transform=None): 17 | self.tsv = TSVFile(tsv_file) 18 | self.transform = transform 19 | 
self.target_transform = target_transform 20 | 21 | def __getitem__(self, index): 22 | """ 23 | Args: 24 | index (int): Index 25 | Returns: 26 | tuple: (image, target) where target is class_index of the target class. 27 | """ 28 | row = self.tsv.seek(index) 29 | image_data = base64.b64decode(row[-1]) 30 | image = Image.open(io.BytesIO(image_data)) 31 | image = image.convert('RGB') 32 | target = int(row[1]) 33 | 34 | if self.transform is not None: 35 | img = self.transform(image) 36 | else: 37 | img = image 38 | if self.target_transform is not None: 39 | target = self.target_transform(target) 40 | 41 | return img, target 42 | 43 | def __len__(self): 44 | return self.tsv.num_rows() 45 | -------------------------------------------------------------------------------- /datasets/panoptic_eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import json 3 | import os 4 | 5 | import util.misc as utils 6 | 7 | try: 8 | from panopticapi.evaluation import pq_compute 9 | except ImportError: 10 | pass 11 | 12 | 13 | class PanopticEvaluator(object): 14 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 15 | self.gt_json = ann_file 16 | self.gt_folder = ann_folder 17 | if utils.is_main_process(): 18 | if not os.path.exists(output_dir): 19 | os.mkdir(output_dir) 20 | self.output_dir = output_dir 21 | self.predictions = [] 22 | 23 | def update(self, predictions): 24 | for p in predictions: 25 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 26 | f.write(p.pop("png_string")) 27 | 28 | self.predictions += predictions 29 | 30 | def synchronize_between_processes(self): 31 | all_predictions = utils.all_gather(self.predictions) 32 | merged_predictions = [] 33 | for p in all_predictions: 34 | merged_predictions += p 35 | self.predictions = merged_predictions 36 | 37 | def summarize(self): 38 | if utils.is_main_process(): 39 | json_data = {"annotations": self.predictions} 40 | predictions_json = os.path.join(self.output_dir, "predictions.json") 41 | with open(predictions_json, "w") as f: 42 | f.write(json.dumps(json_data)) 43 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 44 | return None 45 | -------------------------------------------------------------------------------- /datasets/random_crop.py: -------------------------------------------------------------------------------- 1 | import PIL #version 1.2.0 2 | import torch 3 | import os 4 | import torchvision.transforms.functional as F 5 | import numpy as np 6 | import random 7 | 8 | 9 | def intersect(boxes1, boxes2): 10 | ''' 11 | Find intersection of every box combination between two sets of box 12 | boxes1: bounding boxes 1, a tensor of dimensions (n1, 4) 13 | boxes2: bounding boxes 2, a tensor of dimensions (n2, 4) 14 | 15 | Out: Intersection each of boxes1 with respect to each of boxes2, 16 | a tensor of dimensions (n1, n2) 17 | ''' 18 | n1 = boxes1.size(0) 19 | n2 = boxes2.size(0) 20 | max_xy = torch.min(boxes1[:, 2:].unsqueeze(1).expand(n1, n2, 2), 21 | boxes2[:, 2:].unsqueeze(0).expand(n1, n2, 2)) 22 | 23 | min_xy = torch.max(boxes1[:, :2].unsqueeze(1).expand(n1, n2, 2), 24 | boxes2[:, :2].unsqueeze(0).expand(n1, n2, 2)) 25 | inter = torch.clamp(max_xy - min_xy , min=0) # (n1, n2, 2) 26 | return inter[:, :, 0] * inter[:, :, 1] #(n1, n2) 27 | def find_IoU(boxes1, boxes2): 28 | ''' 29 | Find IoU between every boxes set of boxes 30 | boxes1: a 
tensor of dimensions (n1, 4) (left, top, right , bottom) 31 | boxes2: a tensor of dimensions (n2, 4) 32 | 33 | Out: IoU each of boxes1 with respect to each of boxes2, a tensor of 34 | dimensions (n1, n2) 35 | 36 | Formula: 37 | (box1 ∩ box2) / (box1 u box2) = (box1 ∩ box2) / (area(box1) + area(box2) - (box1 ∩ box2 )) 38 | ''' 39 | inter = intersect(boxes1, boxes2) 40 | area_boxes1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1]) 41 | area_boxes2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1]) 42 | 43 | area_boxes1 = area_boxes1.unsqueeze(1).expand_as(inter) #(n1, n2) 44 | area_boxes2 = area_boxes2.unsqueeze(0).expand_as(inter) #(n1, n2) 45 | union = (area_boxes1 + area_boxes2 - inter) 46 | return inter / union 47 | 48 | 49 | def random_crop(image, boxes, labels, difficulties=None): 50 | ''' 51 | image: A PIL image 52 | boxes: Bounding boxes, a tensor of dimensions (#objects, 4) 53 | labels: labels of object, a tensor of dimensions (#objects) 54 | difficulties: difficulties of detect object, a tensor of dimensions (#objects) 55 | 56 | Out: cropped image , new boxes, new labels, new difficulties 57 | ''' 58 | if type(image) == PIL.Image.Image: 59 | image = F.to_tensor(image) 60 | original_h = image.size(1) 61 | original_w = image.size(2) 62 | 63 | while True: 64 | mode = random.choice([0.1, 0.3, 0.5, 0.9, None]) 65 | 66 | if mode is None: 67 | return F.to_pil_image(image), boxes, labels, difficulties 68 | 69 | new_image = image 70 | new_boxes = boxes 71 | new_difficulties = difficulties 72 | new_labels = labels 73 | for _ in range(50): 74 | # Crop dimensions: [0.3, 1] of original dimensions 75 | new_h = random.uniform(0.3*original_h, original_h) 76 | new_w = random.uniform(0.3*original_w, original_w) 77 | 78 | # Aspect ratio constraint b/t .5 & 2 79 | if new_h/new_w < 0.5 or new_h/new_w > 2: 80 | continue 81 | 82 | #Crop coordinate 83 | left = random.uniform(0, original_w - new_w) 84 | right = left + new_w 85 | top = random.uniform(0, original_h - new_h) 86 | bottom = top + new_h 87 | crop = torch.FloatTensor([int(left), int(top), int(right), int(bottom)]) 88 | 89 | # Calculate IoU between the crop and the bounding boxes 90 | overlap = find_IoU(crop.unsqueeze(0), boxes) #(1, #objects) 91 | overlap = overlap.squeeze(0) 92 | 93 | # If not a single bounding box has a IoU of greater than the minimum, try again 94 | if overlap.shape[0] == 0: 95 | continue 96 | if overlap.max().item() < mode: 97 | continue 98 | 99 | #Crop 100 | new_image = image[:, int(top):int(bottom), int(left):int(right)] #(3, new_h, new_w) 101 | 102 | #Center of bounding boxes 103 | center_bb = (boxes[:, :2] + boxes[:, 2:])/2.0 104 | 105 | #Find bounding box has been had center in crop 106 | center_in_crop = (center_bb[:, 0] >left) * (center_bb[:, 0] < right 107 | ) *(center_bb[:, 1] > top) * (center_bb[:, 1] < bottom) #( #objects) 108 | 109 | if not center_in_crop.any(): 110 | continue 111 | 112 | #take matching bounding box 113 | new_boxes = boxes[center_in_crop, :] 114 | 115 | #take matching labels 116 | new_labels = labels[center_in_crop] 117 | 118 | #take matching difficulities 119 | if difficulties is not None: 120 | new_difficulties = difficulties[center_in_crop] 121 | else: 122 | new_difficulties = None 123 | 124 | #Use the box left and top corner or the crop's 125 | new_boxes[:, :2] = torch.max(new_boxes[:, :2], crop[:2]) 126 | 127 | #adjust to crop 128 | new_boxes[:, :2] -= crop[:2] 129 | 130 | new_boxes[:, 2:] = torch.min(new_boxes[:, 2:],crop[2:]) 131 | 132 | #adjust to crop 133 | 
new_boxes[:, 2:] -= crop[:2] 134 | 135 | return F.to_pil_image(new_image), new_boxes, new_labels, new_difficulties -------------------------------------------------------------------------------- /datasets/sltransform.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/anhtuan85/Data-Augmentation-for-Object-Detection/blob/master/augmentation.ipynb 2 | 3 | import PIL #version 1.2.0 4 | from PIL import Image #version 6.1.0 5 | import torch 6 | import os 7 | import torchvision.transforms.functional as F 8 | import numpy as np 9 | import random 10 | 11 | from .random_crop import random_crop 12 | from util.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh 13 | 14 | class AdjustContrast: 15 | def __init__(self, contrast_factor): 16 | self.contrast_factor = contrast_factor 17 | 18 | def __call__(self, img, target): 19 | """ 20 | img (PIL Image or Tensor): Image to be adjusted. 21 | """ 22 | _contrast_factor = ((random.random() + 1.0) / 2.0) * self.contrast_factor 23 | img = F.adjust_contrast(img, _contrast_factor) 24 | return img, target 25 | 26 | class AdjustBrightness: 27 | def __init__(self, brightness_factor): 28 | self.brightness_factor = brightness_factor 29 | 30 | def __call__(self, img, target): 31 | """ 32 | img (PIL Image or Tensor): Image to be adjusted. 33 | """ 34 | _brightness_factor = ((random.random() + 1.0) / 2.0) * self.brightness_factor 35 | img = F.adjust_brightness(img, _brightness_factor) 36 | return img, target 37 | 38 | def lighting_noise(image): 39 | ''' 40 | color channel swap in image 41 | image: A PIL image 42 | ''' 43 | new_image = image 44 | perms = ((0, 1, 2), (0, 2, 1), (1, 0, 2), 45 | (1, 2, 0), (2, 0, 1), (2, 1, 0)) 46 | swap = perms[random.randint(0, len(perms)- 1)] 47 | new_image = F.to_tensor(new_image) 48 | new_image = new_image[swap, :, :] 49 | new_image = F.to_pil_image(new_image) 50 | return new_image 51 | 52 | class LightingNoise: 53 | def __init__(self) -> None: 54 | pass 55 | 56 | def __call__(self, img, target): 57 | return lighting_noise(img), target 58 | 59 | 60 | def rotate(image, boxes, angle): 61 | ''' 62 | Rotate image and bounding box 63 | image: A Pil image (w, h) 64 | boxes: A tensors of dimensions (#objects, 4) 65 | 66 | Out: rotated image (w, h), rotated boxes 67 | ''' 68 | new_image = image.copy() 69 | new_boxes = boxes.clone() 70 | 71 | #Rotate image, expand = True 72 | w = image.width 73 | h = image.height 74 | cx = w/2 75 | cy = h/2 76 | new_image = new_image.rotate(angle, expand=True) 77 | angle = np.radians(angle) 78 | alpha = np.cos(angle) 79 | beta = np.sin(angle) 80 | #Get affine matrix 81 | AffineMatrix = torch.tensor([[alpha, beta, (1-alpha)*cx - beta*cy], 82 | [-beta, alpha, beta*cx + (1-alpha)*cy]]) 83 | 84 | #Rotation boxes 85 | box_width = (boxes[:,2] - boxes[:,0]).reshape(-1,1) 86 | box_height = (boxes[:,3] - boxes[:,1]).reshape(-1,1) 87 | 88 | #Get corners for boxes 89 | x1 = boxes[:,0].reshape(-1,1) 90 | y1 = boxes[:,1].reshape(-1,1) 91 | 92 | x2 = x1 + box_width 93 | y2 = y1 94 | 95 | x3 = x1 96 | y3 = y1 + box_height 97 | 98 | x4 = boxes[:,2].reshape(-1,1) 99 | y4 = boxes[:,3].reshape(-1,1) 100 | 101 | corners = torch.stack((x1,y1,x2,y2,x3,y3,x4,y4), dim= 1) 102 | # corners.reshape(-1, 8) #Tensors of dimensions (#objects, 8) 103 | corners = corners.reshape(-1,2) #Tensors of dimension (4* #objects, 2) 104 | corners = torch.cat((corners, torch.ones(corners.shape[0], 1)), dim= 1) #(Tensors of dimension (4* #objects, 3)) 105 | 106 | cos = 
np.abs(AffineMatrix[0, 0]) 107 | sin = np.abs(AffineMatrix[0, 1]) 108 | 109 | nW = int((h * sin) + (w * cos)) 110 | nH = int((h * cos) + (w * sin)) 111 | AffineMatrix[0, 2] += (nW / 2) - cx 112 | AffineMatrix[1, 2] += (nH / 2) - cy 113 | 114 | 115 | #Apply affine transform 116 | rotate_corners = torch.mm(AffineMatrix, corners.t().to(torch.float64)).t() 117 | rotate_corners = rotate_corners.reshape(-1,8) 118 | 119 | x_corners = rotate_corners[:,[0,2,4,6]] 120 | y_corners = rotate_corners[:,[1,3,5,7]] 121 | 122 | #Get (x_min, y_min, x_max, y_max) 123 | x_min, _ = torch.min(x_corners, dim= 1) 124 | x_min = x_min.reshape(-1, 1) 125 | y_min, _ = torch.min(y_corners, dim= 1) 126 | y_min = y_min.reshape(-1, 1) 127 | x_max, _ = torch.max(x_corners, dim= 1) 128 | x_max = x_max.reshape(-1, 1) 129 | y_max, _ = torch.max(y_corners, dim= 1) 130 | y_max = y_max.reshape(-1, 1) 131 | 132 | new_boxes = torch.cat((x_min, y_min, x_max, y_max), dim= 1) 133 | 134 | scale_x = new_image.width / w 135 | scale_y = new_image.height / h 136 | 137 | #Resize new image to (w, h) 138 | 139 | new_image = new_image.resize((w, h)) 140 | 141 | #Resize boxes 142 | new_boxes /= torch.Tensor([scale_x, scale_y, scale_x, scale_y]) 143 | new_boxes[:, 0] = torch.clamp(new_boxes[:, 0], 0, w) 144 | new_boxes[:, 1] = torch.clamp(new_boxes[:, 1], 0, h) 145 | new_boxes[:, 2] = torch.clamp(new_boxes[:, 2], 0, w) 146 | new_boxes[:, 3] = torch.clamp(new_boxes[:, 3], 0, h) 147 | return new_image, new_boxes 148 | 149 | # def convert_xywh_to_xyxy(boxes: torch.Tensor): 150 | # _boxes = boxes.clone() 151 | # box_xy = _boxes[:, :2] 152 | # box_wh = _boxes[:, 2:] 153 | # box_x1y1 = box_xy - box_wh/2 154 | # box_x2y2 = box_xy + box_wh/2 155 | # box_xyxy = torch.cat((box_x1y1, box_x2y2), dim=-1) 156 | # return box_xyxy 157 | 158 | class Rotate: 159 | def __init__(self, angle=10) -> None: 160 | self.angle = angle 161 | 162 | def __call__(self, img, target): 163 | w,h = img.size 164 | whwh = torch.Tensor([w, h, w, h]) 165 | boxes_xyxy = box_cxcywh_to_xyxy(target['boxes']) * whwh 166 | img, boxes_new = rotate(img, boxes_xyxy, self.angle) 167 | target['boxes'] = box_xyxy_to_cxcywh(boxes_new).to(boxes_xyxy.dtype) / (whwh + 1e-3) 168 | return img, target 169 | 170 | 171 | class RandomCrop: 172 | def __init__(self) -> None: 173 | pass 174 | 175 | def __call__(self, img, target): 176 | w,h = img.size 177 | try: 178 | boxes_xyxy = target['boxes'] 179 | labels = target['labels'] 180 | img, new_boxes, new_labels, _ = random_crop(img, boxes_xyxy, labels) 181 | target['boxes'] = new_boxes 182 | target['labels'] = new_labels 183 | except Exception as e: 184 | pass 185 | return img, target 186 | 187 | 188 | class RandomCropDebug: 189 | def __init__(self) -> None: 190 | pass 191 | 192 | def __call__(self, img, target): 193 | boxes_xyxy = target['boxes'].clone() 194 | labels = target['labels'].clone() 195 | img, new_boxes, new_labels, _ = random_crop(img, boxes_xyxy, labels) 196 | target['boxes'] = new_boxes 197 | target['labels'] = new_labels 198 | 199 | 200 | return img, target 201 | 202 | class RandomSelectMulti(object): 203 | """ 204 | Randomly selects between transforms1 and transforms2, 205 | """ 206 | def __init__(self, transformslist, p=-1): 207 | self.transformslist = transformslist 208 | self.p = p 209 | assert p == -1 210 | 211 | def __call__(self, img, target): 212 | if self.p == -1: 213 | return random.choice(self.transformslist)(img, target) 214 | 215 | 216 | class Albumentations: 217 | def __init__(self): 218 | import albumentations as A 219 | 
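# Minimal usage sketch (assumed: albumentations is installed as an optional
# dependency, e.g. `pip install albumentations`; boxes must be unnormalized
# xyxy because format='pascal_voc' is used below, with labels passed through
# label_fields):
#   aug = Albumentations()
#   img_aug, target_aug = aug(img, {'boxes': boxes_xyxy, 'labels': labels})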
self.transform = A.Compose([ 220 | A.Blur(p=0.01), 221 | A.MedianBlur(p=0.01), 222 | A.ToGray(p=0.01), 223 | A.CLAHE(p=0.01), 224 | A.RandomBrightnessContrast(p=0.005), 225 | A.RandomGamma(p=0.005), 226 | A.ImageCompression(quality_lower=75, p=0.005)], 227 | bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels'])) 228 | 229 | def __call__(self, img, target, p=1.0): 230 | """ 231 | Input: 232 | target['boxes']: xyxy, unnormalized data. 233 | 234 | """ 235 | boxes_raw = target['boxes'] 236 | labels_raw = target['labels'] 237 | img_np = np.array(img) 238 | if self.transform and random.random() < p: 239 | new_res = self.transform(image=img_np, bboxes=boxes_raw, class_labels=labels_raw) # transformed 240 | boxes_new = torch.Tensor(new_res['bboxes']).to(boxes_raw.dtype).reshape_as(boxes_raw) 241 | img_np = new_res['image'] 242 | labels_new = torch.Tensor(new_res['class_labels']).to(labels_raw.dtype) 243 | img_new = Image.fromarray(img_np) 244 | target['boxes'] = boxes_new 245 | target['labels'] = labels_new 246 | 247 | return img_new, target -------------------------------------------------------------------------------- /datasets/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Transforms and data augmentation for both image + bbox. 4 | """ 5 | import random 6 | 7 | import PIL 8 | import torch 9 | import torchvision.transforms as T 10 | import torchvision.transforms.functional as F 11 | 12 | from util.box_ops import box_xyxy_to_cxcywh 13 | from util.misc import interpolate 14 | 15 | 16 | def crop(image, target, region): 17 | cropped_image = F.crop(image, *region) 18 | 19 | target = target.copy() 20 | i, j, h, w = region 21 | 22 | # should we do something wrt the original size? 23 | target["size"] = torch.tensor([h, w]) 24 | 25 | fields = ["labels", "area", "iscrowd"] 26 | 27 | if "boxes" in target: 28 | boxes = target["boxes"] 29 | max_size = torch.as_tensor([w, h], dtype=torch.float32) 30 | cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) 31 | cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) 32 | cropped_boxes = cropped_boxes.clamp(min=0) 33 | area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) 34 | target["boxes"] = cropped_boxes.reshape(-1, 4) 35 | target["area"] = area 36 | fields.append("boxes") 37 | 38 | if "masks" in target: 39 | # FIXME should we update the area here if there are no boxes? 
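# Note: `region` follows torchvision's (top, left, height, width) convention,
# i.e. the i, j, h, w unpacked above, so masks are sliced with exactly the same
# window as the image. Hypothetical example: region = (10, 20, 200, 300) keeps
# rows 10:210 and columns 20:320 of every instance mask.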
40 | target['masks'] = target['masks'][:, i:i + h, j:j + w] 41 | fields.append("masks") 42 | 43 | 44 | # remove elements for which the boxes or masks that have zero area 45 | if "boxes" in target or "masks" in target: 46 | # favor boxes selection when defining which elements to keep 47 | # this is compatible with previous implementation 48 | if "boxes" in target: 49 | cropped_boxes = target['boxes'].reshape(-1, 2, 2) 50 | keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) 51 | else: 52 | keep = target['masks'].flatten(1).any(1) 53 | 54 | for field in fields: 55 | target[field] = target[field][keep] 56 | 57 | return cropped_image, target 58 | 59 | 60 | def hflip(image, target): 61 | flipped_image = F.hflip(image) 62 | 63 | w, h = image.size 64 | 65 | target = target.copy() 66 | if "boxes" in target: 67 | boxes = target["boxes"] 68 | boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) 69 | target["boxes"] = boxes 70 | 71 | if "masks" in target: 72 | target['masks'] = target['masks'].flip(-1) 73 | 74 | return flipped_image, target 75 | 76 | 77 | def resize(image, target, size, max_size=None): 78 | # size can be min_size (scalar) or (w, h) tuple 79 | 80 | def get_size_with_aspect_ratio(image_size, size, max_size=None): 81 | w, h = image_size 82 | if max_size is not None: 83 | min_original_size = float(min((w, h))) 84 | max_original_size = float(max((w, h))) 85 | if max_original_size / min_original_size * size > max_size: 86 | size = int(round(max_size * min_original_size / max_original_size)) 87 | 88 | if (w <= h and w == size) or (h <= w and h == size): 89 | return (h, w) 90 | 91 | if w < h: 92 | ow = size 93 | oh = int(size * h / w) 94 | else: 95 | oh = size 96 | ow = int(size * w / h) 97 | 98 | return (oh, ow) 99 | 100 | def get_size(image_size, size, max_size=None): 101 | if isinstance(size, (list, tuple)): 102 | return size[::-1] 103 | else: 104 | return get_size_with_aspect_ratio(image_size, size, max_size) 105 | 106 | size = get_size(image.size, size, max_size) 107 | rescaled_image = F.resize(image, size) 108 | 109 | if target is None: 110 | return rescaled_image, None 111 | 112 | ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) 113 | ratio_width, ratio_height = ratios 114 | 115 | target = target.copy() 116 | if "boxes" in target: 117 | boxes = target["boxes"] 118 | scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) 119 | target["boxes"] = scaled_boxes 120 | 121 | if "area" in target: 122 | area = target["area"] 123 | scaled_area = area * (ratio_width * ratio_height) 124 | target["area"] = scaled_area 125 | 126 | h, w = size 127 | target["size"] = torch.tensor([h, w]) 128 | 129 | if "masks" in target: 130 | target['masks'] = interpolate( 131 | target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5 132 | 133 | return rescaled_image, target 134 | 135 | 136 | def pad(image, target, padding): 137 | # assumes that we only pad on the bottom right corners 138 | padded_image = F.pad(image, (0, 0, padding[0], padding[1])) 139 | if target is None: 140 | return padded_image, None 141 | target = target.copy() 142 | # should we do something wrt the original size? 
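# Padding is applied to the bottom/right only, so absolute xyxy boxes keep
# their coordinates and only the recorded size changes. Sketch with assumed
# values: a (640, 480) image padded with padding=(10, 20) becomes (650, 500),
# and the "size" entry below is updated to tensor([500, 650])  # (h, w)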
143 | target["size"] = torch.tensor(padded_image.size[::-1]) 144 | if "masks" in target: 145 | target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1])) 146 | return padded_image, target 147 | 148 | 149 | class ResizeDebug(object): 150 | def __init__(self, size): 151 | self.size = size 152 | 153 | def __call__(self, img, target): 154 | return resize(img, target, self.size) 155 | 156 | 157 | class RandomCrop(object): 158 | def __init__(self, size): 159 | self.size = size 160 | 161 | def __call__(self, img, target): 162 | region = T.RandomCrop.get_params(img, self.size) 163 | return crop(img, target, region) 164 | 165 | 166 | class RandomSizeCrop(object): 167 | def __init__(self, min_size: int, max_size: int): 168 | self.min_size = min_size 169 | self.max_size = max_size 170 | 171 | def __call__(self, img: PIL.Image.Image, target: dict): 172 | w = random.randint(self.min_size, min(img.width, self.max_size)) 173 | h = random.randint(self.min_size, min(img.height, self.max_size)) 174 | region = T.RandomCrop.get_params(img, [h, w]) 175 | return crop(img, target, region) 176 | 177 | 178 | class CenterCrop(object): 179 | def __init__(self, size): 180 | self.size = size 181 | 182 | def __call__(self, img, target): 183 | image_width, image_height = img.size 184 | crop_height, crop_width = self.size 185 | crop_top = int(round((image_height - crop_height) / 2.)) 186 | crop_left = int(round((image_width - crop_width) / 2.)) 187 | return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) 188 | 189 | 190 | class RandomHorizontalFlip(object): 191 | def __init__(self, p=0.5): 192 | self.p = p 193 | 194 | def __call__(self, img, target): 195 | if random.random() < self.p: 196 | return hflip(img, target) 197 | return img, target 198 | 199 | 200 | class RandomResize(object): 201 | def __init__(self, sizes, max_size=None): 202 | assert isinstance(sizes, (list, tuple)) 203 | self.sizes = sizes 204 | self.max_size = max_size 205 | 206 | def __call__(self, img, target=None): 207 | size = random.choice(self.sizes) 208 | return resize(img, target, size, self.max_size) 209 | 210 | 211 | class RandomPad(object): 212 | def __init__(self, max_pad): 213 | self.max_pad = max_pad 214 | 215 | def __call__(self, img, target): 216 | pad_x = random.randint(0, self.max_pad) 217 | pad_y = random.randint(0, self.max_pad) 218 | return pad(img, target, (pad_x, pad_y)) 219 | 220 | 221 | class RandomSelect(object): 222 | """ 223 | Randomly selects between transforms1 and transforms2, 224 | with probability p for transforms1 and (1 - p) for transforms2 225 | """ 226 | def __init__(self, transforms1, transforms2, p=0.5): 227 | self.transforms1 = transforms1 228 | self.transforms2 = transforms2 229 | self.p = p 230 | 231 | def __call__(self, img, target): 232 | if random.random() < self.p: 233 | return self.transforms1(img, target) 234 | return self.transforms2(img, target) 235 | 236 | 237 | class ToTensor(object): 238 | def __call__(self, img, target): 239 | return F.to_tensor(img), target 240 | 241 | 242 | class RandomErasing(object): 243 | 244 | def __init__(self, *args, **kwargs): 245 | self.eraser = T.RandomErasing(*args, **kwargs) 246 | 247 | def __call__(self, img, target): 248 | return self.eraser(img), target 249 | 250 | 251 | class Normalize(object): 252 | def __init__(self, mean, std): 253 | self.mean = mean 254 | self.std = std 255 | 256 | def __call__(self, image, target=None): 257 | image = F.normalize(image, mean=self.mean, std=self.std) 258 | if target is None: 259 | 
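# When a target is provided (handled below), boxes are additionally converted
# from absolute xyxy to normalized cxcywh by dividing by [w, h, w, h]. Sketch
# with assumed values: on an 800x600 image, the box [400, 300, 600, 450] (xyxy)
# becomes [0.625, 0.625, 0.25, 0.25] (cx, cy, w, h).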
return image, None 260 | target = target.copy() 261 | h, w = image.shape[-2:] 262 | if "boxes" in target: 263 | boxes = target["boxes"] 264 | boxes = box_xyxy_to_cxcywh(boxes) 265 | boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) 266 | target["boxes"] = boxes 267 | return image, target 268 | 269 | 270 | class Compose(object): 271 | def __init__(self, transforms): 272 | self.transforms = transforms 273 | 274 | def __call__(self, image, target): 275 | for t in self.transforms: 276 | image, target = t(image, target) 277 | return image, target 278 | 279 | def __repr__(self): 280 | format_string = self.__class__.__name__ + "(" 281 | for t in self.transforms: 282 | format_string += "\n" 283 | format_string += " {0}".format(t) 284 | format_string += "\n)" 285 | return format_string 286 | -------------------------------------------------------------------------------- /figs/12ep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/12ep.png -------------------------------------------------------------------------------- /figs/50ep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/50ep.png -------------------------------------------------------------------------------- /figs/curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/curve.png -------------------------------------------------------------------------------- /figs/dinosaur.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/dinosaur.png -------------------------------------------------------------------------------- /figs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/framework.png -------------------------------------------------------------------------------- /figs/idea.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/idea.jpg -------------------------------------------------------------------------------- /figs/sota.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/sota.jpg -------------------------------------------------------------------------------- /figs/sota_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/sota_table.png -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # DINO 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 7 | from .dino import build_dino 8 | 9 | def build_model(args): 10 | return build(args) 11 | -------------------------------------------------------------------------------- /models/dino/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Conditional DETR 3 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Copied from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 8 | # ------------------------------------------------------------------------ 9 | 10 | from .dino import build_dino 11 | -------------------------------------------------------------------------------- /models/dino/backbone.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # DINO 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Conditional DETR 7 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------ 10 | # Copied from DETR (https://github.com/facebookresearch/detr) 11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 12 | # ------------------------------------------------------------------------ 13 | 14 | """ 15 | Backbone modules. 16 | """ 17 | from collections import OrderedDict 18 | import os 19 | 20 | import torch 21 | import torch.nn.functional as F 22 | import torchvision 23 | from torch import nn 24 | from torchvision.models._utils import IntermediateLayerGetter 25 | from typing import Dict, List 26 | 27 | 28 | from util.misc import NestedTensor, clean_state_dict, is_main_process 29 | 30 | from .position_encoding import build_position_encoding 31 | from .convnext import build_convnext 32 | from .swin_transformer import build_swin_transformer 33 | 34 | 35 | 36 | class FrozenBatchNorm2d(torch.nn.Module): 37 | """ 38 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 39 | 40 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 41 | without which any other models than torchvision.models.resnet[18,34,50,101] 42 | produce nans. 
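    In effect the forward pass applies a fixed affine transform,
    y = (x - running_mean) / sqrt(running_var + eps) * weight + bias,
    with eps = 1e-5 and no statistics updates, which is the usual choice when
    detection batches are too small for stable BatchNorm statistics.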
43 | """ 44 | 45 | def __init__(self, n): 46 | super(FrozenBatchNorm2d, self).__init__() 47 | self.register_buffer("weight", torch.ones(n)) 48 | self.register_buffer("bias", torch.zeros(n)) 49 | self.register_buffer("running_mean", torch.zeros(n)) 50 | self.register_buffer("running_var", torch.ones(n)) 51 | 52 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 53 | missing_keys, unexpected_keys, error_msgs): 54 | num_batches_tracked_key = prefix + 'num_batches_tracked' 55 | if num_batches_tracked_key in state_dict: 56 | del state_dict[num_batches_tracked_key] 57 | 58 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 59 | state_dict, prefix, local_metadata, strict, 60 | missing_keys, unexpected_keys, error_msgs) 61 | 62 | def forward(self, x): 63 | # move reshapes to the beginning 64 | # to make it fuser-friendly 65 | w = self.weight.reshape(1, -1, 1, 1) 66 | b = self.bias.reshape(1, -1, 1, 1) 67 | rv = self.running_var.reshape(1, -1, 1, 1) 68 | rm = self.running_mean.reshape(1, -1, 1, 1) 69 | eps = 1e-5 70 | scale = w * (rv + eps).rsqrt() 71 | bias = b - rm * scale 72 | return x * scale + bias 73 | 74 | 75 | class BackboneBase(nn.Module): 76 | 77 | def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_indices: list): 78 | super().__init__() 79 | for name, parameter in backbone.named_parameters(): 80 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 81 | parameter.requires_grad_(False) 82 | 83 | return_layers = {} 84 | for idx, layer_index in enumerate(return_interm_indices): 85 | return_layers.update({"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)}) 86 | 87 | # if len: 88 | # if use_stage1_feature: 89 | # return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} 90 | # else: 91 | # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} 92 | # else: 93 | # return_layers = {'layer4': "0"} 94 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 95 | self.num_channels = num_channels 96 | 97 | def forward(self, tensor_list: NestedTensor): 98 | xs = self.body(tensor_list.tensors) 99 | out: Dict[str, NestedTensor] = {} 100 | for name, x in xs.items(): 101 | m = tensor_list.mask 102 | assert m is not None 103 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 104 | out[name] = NestedTensor(x, mask) 105 | 106 | return out 107 | 108 | 109 | class Backbone(BackboneBase): 110 | """ResNet backbone with frozen BatchNorm.""" 111 | def __init__(self, name: str, 112 | train_backbone: bool, 113 | dilation: bool, 114 | return_interm_indices:list, 115 | batch_norm=FrozenBatchNorm2d, 116 | ): 117 | if name in ['resnet18', 'resnet34', 'resnet50', 'resnet101']: 118 | backbone = getattr(torchvision.models, name)( 119 | replace_stride_with_dilation=[False, False, dilation], 120 | pretrained=is_main_process(), norm_layer=batch_norm) 121 | else: 122 | raise NotImplementedError("Why you can get here with name {}".format(name)) 123 | # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 124 | assert name not in ('resnet18', 'resnet34'), "Only resnet50 and resnet101 are available." 
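# Channel bookkeeping below: num_channels_all indexes ResNet stages 1-4, so
# for example return_interm_indices=[1, 2, 3] yields num_channels=[512, 1024, 2048]
# and return_interm_indices=[3] yields num_channels=[2048].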
125 | assert return_interm_indices in [[0,1,2,3], [1,2,3], [3]] 126 | num_channels_all = [256, 512, 1024, 2048] 127 | num_channels = num_channels_all[4-len(return_interm_indices):] 128 | super().__init__(backbone, train_backbone, num_channels, return_interm_indices) 129 | 130 | 131 | class Joiner(nn.Sequential): 132 | def __init__(self, backbone, position_embedding): 133 | super().__init__(backbone, position_embedding) 134 | 135 | def forward(self, tensor_list: NestedTensor): 136 | xs = self[0](tensor_list) 137 | out: List[NestedTensor] = [] 138 | pos = [] 139 | for name, x in xs.items(): 140 | out.append(x) 141 | # position encoding 142 | pos.append(self[1](x).to(x.tensors.dtype)) 143 | 144 | return out, pos 145 | 146 | 147 | def build_backbone(args): 148 | """ 149 | Useful args: 150 | - backbone: backbone name 151 | - lr_backbone: 152 | - dilation 153 | - return_interm_indices: available: [0,1,2,3], [1,2,3], [3] 154 | - backbone_freeze_keywords: 155 | - use_checkpoint: for swin only for now 156 | 157 | """ 158 | position_embedding = build_position_encoding(args) 159 | train_backbone = args.lr_backbone > 0 160 | if not train_backbone: 161 | raise ValueError("Please set lr_backbone > 0") 162 | return_interm_indices = args.return_interm_indices 163 | assert return_interm_indices in [[0,1,2,3], [1,2,3], [3]] 164 | backbone_freeze_keywords = args.backbone_freeze_keywords 165 | use_checkpoint = getattr(args, 'use_checkpoint', False) 166 | 167 | if args.backbone in ['resnet50', 'resnet101']: 168 | backbone = Backbone(args.backbone, train_backbone, args.dilation, 169 | return_interm_indices, 170 | batch_norm=FrozenBatchNorm2d) 171 | bb_num_channels = backbone.num_channels 172 | elif args.backbone in ['swin_T_224_1k', 'swin_B_224_22k', 'swin_B_384_22k', 'swin_L_224_22k', 'swin_L_384_22k']: 173 | pretrain_img_size = int(args.backbone.split('_')[-2]) 174 | backbone = build_swin_transformer(args.backbone, \ 175 | pretrain_img_size=pretrain_img_size, \ 176 | out_indices=tuple(return_interm_indices), \ 177 | dilation=args.dilation, use_checkpoint=use_checkpoint) 178 | 179 | # freeze some layers 180 | if backbone_freeze_keywords is not None: 181 | for name, parameter in backbone.named_parameters(): 182 | for keyword in backbone_freeze_keywords: 183 | if keyword in name: 184 | parameter.requires_grad_(False) 185 | break 186 | if "backbone_dir" in args: 187 | pretrained_dir = args.backbone_dir 188 | PTDICT = { 189 | 'swin_T_224_1k': 'swin_tiny_patch4_window7_224.pth', 190 | 'swin_B_384_22k': 'swin_base_patch4_window12_384.pth', 191 | 'swin_L_384_22k': 'swin_large_patch4_window12_384_22k.pth', 192 | } 193 | pretrainedpath = os.path.join(pretrained_dir, PTDICT[args.backbone]) 194 | checkpoint = torch.load(pretrainedpath, map_location='cpu')['model'] 195 | from collections import OrderedDict 196 | def key_select_function(keyname): 197 | if 'head' in keyname: 198 | return False 199 | if args.dilation and 'layers.3' in keyname: 200 | return False 201 | return True 202 | _tmp_st = OrderedDict({k:v for k, v in clean_state_dict(checkpoint).items() if key_select_function(k)}) 203 | _tmp_st_output = backbone.load_state_dict(_tmp_st, strict=False) 204 | print(str(_tmp_st_output)) 205 | bb_num_channels = backbone.num_features[4 - len(return_interm_indices):] 206 | elif args.backbone in ['convnext_xlarge_22k']: 207 | backbone = build_convnext(modelname=args.backbone, pretrained=True, out_indices=tuple(return_interm_indices),backbone_dir=args.backbone_dir) 208 | bb_num_channels = backbone.dims[4 - 
len(return_interm_indices):] 209 | else: 210 | raise NotImplementedError("Unknown backbone {}".format(args.backbone)) 211 | 212 | 213 | assert len(bb_num_channels) == len(return_interm_indices), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}" 214 | 215 | 216 | model = Joiner(backbone, position_embedding) 217 | model.num_channels = bb_num_channels 218 | assert isinstance(bb_num_channels, List), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels)) 219 | return model 220 | -------------------------------------------------------------------------------- /models/dino/convnext.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | 3 | # All rights reserved. 4 | 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | 8 | 9 | from functools import partial 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | from timm.models.layers import trunc_normal_, DropPath 14 | 15 | from util.misc import NestedTensor 16 | # from timm.models.registry import register_model 17 | 18 | class Block(nn.Module): 19 | r""" ConvNeXt Block. There are two equivalent implementations: 20 | (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) 21 | (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back 22 | We use (2) as we find it slightly faster in PyTorch 23 | 24 | Args: 25 | dim (int): Number of input channels. 26 | drop_path (float): Stochastic depth rate. Default: 0.0 27 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 28 | """ 29 | def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): 30 | super().__init__() 31 | self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv 32 | self.norm = LayerNorm(dim, eps=1e-6) 33 | self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers 34 | self.act = nn.GELU() 35 | self.pwconv2 = nn.Linear(4 * dim, dim) 36 | self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), 37 | requires_grad=True) if layer_scale_init_value > 0 else None 38 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 39 | 40 | def forward(self, x): 41 | input = x 42 | x = self.dwconv(x) 43 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 44 | x = self.norm(x) 45 | x = self.pwconv1(x) 46 | x = self.act(x) 47 | x = self.pwconv2(x) 48 | if self.gamma is not None: 49 | x = self.gamma * x 50 | x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 51 | 52 | x = input + self.drop_path(x) 53 | return x 54 | 55 | class ConvNeXt(nn.Module): 56 | r""" ConvNeXt 57 | A PyTorch impl of : `A ConvNet for the 2020s` - 58 | https://arxiv.org/pdf/2201.03545.pdf 59 | 60 | Args: 61 | in_chans (int): Number of input image channels. Default: 3 62 | num_classes (int): Number of classes for classification head. Default: 1000 63 | depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] 64 | dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] 65 | drop_path_rate (float): Stochastic depth rate. Default: 0. 66 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 67 | head_init_scale (float): Init scaling value for classifier weights and biases. 
Default: 1. 68 | """ 69 | def __init__(self, in_chans=3, num_classes=1000, 70 | depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0., 71 | layer_scale_init_value=1e-6, head_init_scale=1., 72 | out_indices=[0, 1, 2, 3] 73 | ): 74 | super().__init__() 75 | self.dims = dims 76 | 77 | self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers 78 | stem = nn.Sequential( 79 | nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), 80 | LayerNorm(dims[0], eps=1e-6, data_format="channels_first") 81 | ) 82 | self.downsample_layers.append(stem) 83 | for i in range(3): 84 | downsample_layer = nn.Sequential( 85 | LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), 86 | nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2), 87 | ) 88 | self.downsample_layers.append(downsample_layer) 89 | 90 | self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks 91 | dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] 92 | cur = 0 93 | for i in range(4): 94 | stage = nn.Sequential( 95 | *[Block(dim=dims[i], drop_path=dp_rates[cur + j], 96 | layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] 97 | ) 98 | self.stages.append(stage) 99 | cur += depths[i] 100 | 101 | self.out_indices = out_indices 102 | 103 | norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first") 104 | for i_layer in range(4): 105 | layer = norm_layer(dims[i_layer]) 106 | layer_name = f'norm{i_layer}' 107 | self.add_module(layer_name, layer) 108 | 109 | # self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer 110 | # self.head = nn.Linear(dims[-1], num_classes) 111 | 112 | # self.apply(self._init_weights) 113 | # self.head.weight.data.mul_(head_init_scale) 114 | # self.head.bias.data.mul_(head_init_scale) 115 | 116 | def _init_weights(self, m): 117 | if isinstance(m, (nn.Conv2d, nn.Linear)): 118 | trunc_normal_(m.weight, std=.02) 119 | nn.init.constant_(m.bias, 0) 120 | 121 | def forward_features(self, x): 122 | outs = [] 123 | for i in range(4): 124 | x = self.downsample_layers[i](x) 125 | x = self.stages[i](x) 126 | if i in self.out_indices: 127 | norm_layer = getattr(self, f'norm{i}') 128 | x_out = norm_layer(x) 129 | outs.append(x_out) 130 | # return self.norm(x.mean([-2, -1])) # global average pooling, (N, C, H, W) -> (N, C) 131 | return tuple(outs) 132 | 133 | # def forward(self, x): 134 | # x = self.forward_features(x) 135 | # return x 136 | 137 | 138 | def forward(self, tensor_list: NestedTensor): 139 | x = tensor_list.tensors 140 | outs = self.forward_features(x) 141 | 142 | # collect for nesttensors 143 | outs_dict = {} 144 | for idx, out_i in enumerate(outs): 145 | m = tensor_list.mask 146 | assert m is not None 147 | mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0] 148 | outs_dict[idx] = NestedTensor(out_i, mask) 149 | 150 | return outs_dict 151 | 152 | class LayerNorm(nn.Module): 153 | r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. 154 | The ordering of the dimensions in the inputs. channels_last corresponds to inputs with 155 | shape (batch_size, height, width, channels) while channels_first corresponds to inputs 156 | with shape (batch_size, channels, height, width). 
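    In the channels_first branch the statistics are taken over the channel
    dimension only, i.e. y = (x - mean_C(x)) / sqrt(var_C(x) + eps) * weight + bias,
    with weight and bias broadcast as (C, 1, 1); the channels_last branch simply
    defers to F.layer_norm.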
157 | """ 158 | def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): 159 | super().__init__() 160 | self.weight = nn.Parameter(torch.ones(normalized_shape)) 161 | self.bias = nn.Parameter(torch.zeros(normalized_shape)) 162 | self.eps = eps 163 | self.data_format = data_format 164 | if self.data_format not in ["channels_last", "channels_first"]: 165 | raise NotImplementedError 166 | self.normalized_shape = (normalized_shape, ) 167 | 168 | def forward(self, x): 169 | if self.data_format == "channels_last": 170 | return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) 171 | elif self.data_format == "channels_first": 172 | u = x.mean(1, keepdim=True) 173 | s = (x - u).pow(2).mean(1, keepdim=True) 174 | x = (x - u) / torch.sqrt(s + self.eps) 175 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 176 | return x 177 | 178 | 179 | model_urls = { 180 | "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", 181 | "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth", 182 | "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth", 183 | "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth", 184 | "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", 185 | "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", 186 | "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", 187 | } 188 | 189 | # @register_model 190 | # def convnext_tiny(pretrained=False, **kwargs): 191 | # model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs) 192 | # if pretrained: 193 | # url = model_urls['convnext_tiny_1k'] 194 | # checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True) 195 | # model.load_state_dict(checkpoint["model"]) 196 | # return model 197 | 198 | # @register_model 199 | # def convnext_small(pretrained=False, **kwargs): 200 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) 201 | # if pretrained: 202 | # url = model_urls['convnext_small_1k'] 203 | # checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True) 204 | # model.load_state_dict(checkpoint["model"]) 205 | # return model 206 | 207 | # @register_model 208 | # def convnext_base(pretrained=False, in_22k=False, **kwargs): 209 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) 210 | # if pretrained: 211 | # url = model_urls['convnext_base_22k'] if in_22k else model_urls['convnext_base_1k'] 212 | # checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True) 213 | # model.load_state_dict(checkpoint["model"]) 214 | # return model 215 | 216 | # @register_model 217 | # def convnext_large(pretrained=False, in_22k=False, **kwargs): 218 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) 219 | # if pretrained: 220 | # url = model_urls['convnext_large_22k'] if in_22k else model_urls['convnext_large_1k'] 221 | # checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True) 222 | # model.load_state_dict(checkpoint["model"]) 223 | # return model 224 | 225 | # @register_model 226 | # def convnext_xlarge(pretrained=False, in_22k=False, **kwargs): 227 | # model = ConvNeXt(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 
2048], **kwargs) 228 | # if pretrained: 229 | # url = model_urls['convnext_xlarge_22k'] if in_22k else model_urls['convnext_xlarge_1k'] 230 | # checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True) 231 | # model.load_state_dict(checkpoint["model"]) 232 | # return model 233 | 234 | def build_convnext(modelname, pretrained,backbone_dir=None, **kw): 235 | assert modelname in ['convnext_xlarge_22k'] 236 | 237 | model_para_dict = { 238 | 'convnext_xlarge_22k': dict( 239 | depths=[3, 3, 27, 3], 240 | dims=[256, 512, 1024, 2048], 241 | ), 242 | } 243 | kw_cgf = model_para_dict[modelname] 244 | kw_cgf.update(kw) 245 | model = ConvNeXt(**kw_cgf) 246 | if pretrained: 247 | url = model_urls[modelname] 248 | checkpoint = torch.hub.load_state_dict_from_url(url=url, model_dir=backbone_dir, map_location="cpu", check_hash=True) 249 | _tmp_st_output = model.load_state_dict(checkpoint["model"], strict=False) 250 | print(str(_tmp_st_output)) 251 | 252 | return model -------------------------------------------------------------------------------- /models/dino/dn_components.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # DINO 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # DN-DETR 7 | # Copyright (c) 2022 IDEA. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | 10 | 11 | import torch 12 | from util.misc import (NestedTensor, nested_tensor_from_tensor_list, 13 | accuracy, get_world_size, interpolate, 14 | is_dist_avail_and_initialized, inverse_sigmoid) 15 | # from .DABDETR import sigmoid_focal_loss 16 | from util import box_ops 17 | import torch.nn.functional as F 18 | 19 | 20 | def prepare_for_cdn(dn_args, training, num_queries, num_classes, hidden_dim, label_enc): 21 | """ 22 | A major difference of DINO from DN-DETR is that the author process pattern embedding pattern embedding in its detector 23 | forward function and use learnable tgt embedding, so we change this function a little bit. 
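    The contrastive denoising (CDN) part builds dn_number denoising groups, each
    containing a positive and a negative noised copy of every ground-truth box in
    the image: positive queries get small center/size jitter and are trained to
    reconstruct their box, while negative queries get larger jitter (rand_part
    shifted by 1.0) and are trained to be rejected as background; labels are also
    flipped with probability label_noise_ratio * 0.5. An attention mask keeps the
    matching queries and the different denoising groups from seeing each other.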
24 | :param dn_args: targets, dn_number, label_noise_ratio, box_noise_scale 25 | :param training: if it is training or inference 26 | :param num_queries: number of queires 27 | :param num_classes: number of classes 28 | :param hidden_dim: transformer hidden dim 29 | :param label_enc: encode labels in dn 30 | :return: 31 | """ 32 | if training: 33 | targets, dn_number, label_noise_ratio, box_noise_scale = dn_args 34 | # positive and negative dn queries 35 | dn_number = dn_number * 2 36 | known = [(torch.ones_like(t['labels'])).cuda() for t in targets] 37 | batch_size = len(known) 38 | known_num = [sum(k) for k in known] 39 | if int(max(known_num)) == 0: 40 | dn_number = 1 41 | else: 42 | if dn_number >= 100: 43 | dn_number = dn_number // (int(max(known_num) * 2)) 44 | elif dn_number < 1: 45 | dn_number = 1 46 | if dn_number == 0: 47 | dn_number = 1 48 | unmask_bbox = unmask_label = torch.cat(known) 49 | labels = torch.cat([t['labels'] for t in targets]) 50 | boxes = torch.cat([t['boxes'] for t in targets]) 51 | batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)]) 52 | 53 | known_indice = torch.nonzero(unmask_label + unmask_bbox) 54 | known_indice = known_indice.view(-1) 55 | 56 | known_indice = known_indice.repeat(2 * dn_number, 1).view(-1) 57 | known_labels = labels.repeat(2 * dn_number, 1).view(-1) 58 | known_bid = batch_idx.repeat(2 * dn_number, 1).view(-1) 59 | known_bboxs = boxes.repeat(2 * dn_number, 1) 60 | known_labels_expaned = known_labels.clone() 61 | known_bbox_expand = known_bboxs.clone() 62 | 63 | if label_noise_ratio > 0: 64 | p = torch.rand_like(known_labels_expaned.float()) 65 | chosen_indice = torch.nonzero(p < (label_noise_ratio * 0.5)).view(-1) # half of bbox prob 66 | new_label = torch.randint_like(chosen_indice, 0, num_classes) # randomly put a new one here 67 | known_labels_expaned.scatter_(0, chosen_indice, new_label) 68 | single_pad = int(max(known_num)) 69 | 70 | pad_size = int(single_pad * 2 * dn_number) 71 | positive_idx = torch.tensor(range(len(boxes))).long().cuda().unsqueeze(0).repeat(dn_number, 1) 72 | positive_idx += (torch.tensor(range(dn_number)) * len(boxes) * 2).long().cuda().unsqueeze(1) 73 | positive_idx = positive_idx.flatten() 74 | negative_idx = positive_idx + len(boxes) 75 | if box_noise_scale > 0: 76 | known_bbox_ = torch.zeros_like(known_bboxs) 77 | known_bbox_[:, :2] = known_bboxs[:, :2] - known_bboxs[:, 2:] / 2 78 | known_bbox_[:, 2:] = known_bboxs[:, :2] + known_bboxs[:, 2:] / 2 79 | 80 | diff = torch.zeros_like(known_bboxs) 81 | diff[:, :2] = known_bboxs[:, 2:] / 2 82 | diff[:, 2:] = known_bboxs[:, 2:] / 2 83 | 84 | rand_sign = torch.randint_like(known_bboxs, low=0, high=2, dtype=torch.float32) * 2.0 - 1.0 85 | rand_part = torch.rand_like(known_bboxs) 86 | rand_part[negative_idx] += 1.0 87 | rand_part *= rand_sign 88 | known_bbox_ = known_bbox_ + torch.mul(rand_part, 89 | diff).cuda() * box_noise_scale 90 | known_bbox_ = known_bbox_.clamp(min=0.0, max=1.0) 91 | known_bbox_expand[:, :2] = (known_bbox_[:, :2] + known_bbox_[:, 2:]) / 2 92 | known_bbox_expand[:, 2:] = known_bbox_[:, 2:] - known_bbox_[:, :2] 93 | 94 | m = known_labels_expaned.long().to('cuda') 95 | input_label_embed = label_enc(m) 96 | input_bbox_embed = inverse_sigmoid(known_bbox_expand) 97 | 98 | padding_label = torch.zeros(pad_size, hidden_dim).cuda() 99 | padding_bbox = torch.zeros(pad_size, 4).cuda() 100 | 101 | input_query_label = padding_label.repeat(batch_size, 1, 1) 102 | input_query_bbox = padding_bbox.repeat(batch_size, 1, 1) 
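# Shape sketch with assumed numbers: if the largest image in the batch has 3
# annotated boxes (single_pad = 3) and dn_number was adjusted to 5 groups, then
# pad_size = 3 * 2 * 5 = 30, so input_query_label is (batch_size, 30, hidden_dim)
# and input_query_bbox is (batch_size, 30, 4); the index maps below scatter the
# noised labels/boxes into these padded slots.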
103 | 104 | map_known_indice = torch.tensor([]).to('cuda') 105 | if len(known_num): 106 | map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num]) # [1,2, 1,2,3] 107 | map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(2 * dn_number)]).long() 108 | if len(known_bid): 109 | input_query_label[(known_bid.long(), map_known_indice)] = input_label_embed 110 | input_query_bbox[(known_bid.long(), map_known_indice)] = input_bbox_embed 111 | 112 | tgt_size = pad_size + num_queries 113 | attn_mask = torch.ones(tgt_size, tgt_size).to('cuda') < 0 114 | # match query cannot see the reconstruct 115 | attn_mask[pad_size:, :pad_size] = True 116 | # reconstruct cannot see each other 117 | for i in range(dn_number): 118 | if i == 0: 119 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True 120 | if i == dn_number - 1: 121 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * i * 2] = True 122 | else: 123 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True 124 | attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * 2 * i] = True 125 | 126 | dn_meta = { 127 | 'pad_size': pad_size, 128 | 'num_dn_group': dn_number, 129 | } 130 | else: 131 | 132 | input_query_label = None 133 | input_query_bbox = None 134 | attn_mask = None 135 | dn_meta = None 136 | 137 | return input_query_label, input_query_bbox, attn_mask, dn_meta 138 | 139 | 140 | def dn_post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss): 141 | """ 142 | post process of dn after output from the transformer 143 | put the dn part in the dn_meta 144 | """ 145 | if dn_meta and dn_meta['pad_size'] > 0: 146 | output_known_class = outputs_class[:, :, :dn_meta['pad_size'], :] 147 | output_known_coord = outputs_coord[:, :, :dn_meta['pad_size'], :] 148 | outputs_class = outputs_class[:, :, dn_meta['pad_size']:, :] 149 | outputs_coord = outputs_coord[:, :, dn_meta['pad_size']:, :] 150 | out = {'pred_logits': output_known_class[-1], 'pred_boxes': output_known_coord[-1]} 151 | if aux_loss: 152 | out['aux_outputs'] = _set_aux_loss(output_known_class, output_known_coord) 153 | dn_meta['output_known_lbs_bboxes'] = out 154 | return outputs_class, outputs_coord 155 | 156 | 157 | -------------------------------------------------------------------------------- /models/dino/matcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # DINO 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modules to compute the matching cost and solve the corresponding LSAP. 7 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------ 10 | # Modified from DETR (https://github.com/facebookresearch/detr) 11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 12 | # ------------------------------------------------------------------------ 13 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 14 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
15 | # ------------------------------------------------------------------------ 16 | 17 | 18 | import torch, os 19 | from torch import nn 20 | from scipy.optimize import linear_sum_assignment 21 | 22 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou 23 | 24 | 25 | class HungarianMatcher(nn.Module): 26 | """This class computes an assignment between the targets and the predictions of the network 27 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 28 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 29 | while the others are un-matched (and thus treated as non-objects). 30 | """ 31 | 32 | def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1, focal_alpha = 0.25): 33 | """Creates the matcher 34 | Params: 35 | cost_class: This is the relative weight of the classification error in the matching cost 36 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 37 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 38 | """ 39 | super().__init__() 40 | self.cost_class = cost_class 41 | self.cost_bbox = cost_bbox 42 | self.cost_giou = cost_giou 43 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" 44 | 45 | self.focal_alpha = focal_alpha 46 | 47 | @torch.no_grad() 48 | def forward(self, outputs, targets): 49 | """ Performs the matching 50 | Params: 51 | outputs: This is a dict that contains at least these entries: 52 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 53 | "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates 54 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 55 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 56 | objects in the target) containing the class labels 57 | "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates 58 | Returns: 59 | A list of size batch_size, containing tuples of (index_i, index_j) where: 60 | - index_i is the indices of the selected predictions (in order) 61 | - index_j is the indices of the corresponding selected targets (in order) 62 | For each batch element, it holds: 63 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 64 | """ 65 | 66 | bs, num_queries = outputs["pred_logits"].shape[:2] 67 | 68 | # We flatten to compute the cost matrices in a batch 69 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] 70 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] 71 | 72 | # Also concat the target labels and boxes 73 | tgt_ids = torch.cat([v["labels"] for v in targets]) 74 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 75 | 76 | # Compute the classification cost. 
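# The class term uses the focal-loss form (alpha=0.25, gamma=2 by default):
# cost = alpha*(1-p)^gamma*(-log p) - (1-alpha)*p^gamma*(-log(1-p)), evaluated
# at the target class. Worked example with p = 0.9:
#   0.25*0.01*0.105 - 0.75*0.81*2.303 ~= -1.40
# so confident correct predictions receive a strongly negative (favourable) cost.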
77 | alpha = self.focal_alpha 78 | gamma = 2.0 79 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 80 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 81 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 82 | 83 | # Compute the L1 cost between boxes 84 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 85 | 86 | # Compute the giou cost betwen boxes 87 | cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) 88 | 89 | # Final cost matrix 90 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 91 | C = C.view(bs, num_queries, -1).cpu() 92 | 93 | sizes = [len(v["boxes"]) for v in targets] 94 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 95 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 96 | 97 | 98 | class SimpleMinsumMatcher(nn.Module): 99 | """This class computes an assignment between the targets and the predictions of the network 100 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 101 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 102 | while the others are un-matched (and thus treated as non-objects). 103 | """ 104 | 105 | def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1, focal_alpha = 0.25): 106 | """Creates the matcher 107 | Params: 108 | cost_class: This is the relative weight of the classification error in the matching cost 109 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 110 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 111 | """ 112 | super().__init__() 113 | self.cost_class = cost_class 114 | self.cost_bbox = cost_bbox 115 | self.cost_giou = cost_giou 116 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" 117 | 118 | self.focal_alpha = focal_alpha 119 | 120 | @torch.no_grad() 121 | def forward(self, outputs, targets): 122 | """ Performs the matching 123 | Params: 124 | outputs: This is a dict that contains at least these entries: 125 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 126 | "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates 127 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 128 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 129 | objects in the target) containing the class labels 130 | "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates 131 | Returns: 132 | A list of size batch_size, containing tuples of (index_i, index_j) where: 133 | - index_i is the indices of the selected predictions (in order) 134 | - index_j is the indices of the corresponding selected targets (in order) 135 | For each batch element, it holds: 136 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 137 | """ 138 | 139 | bs, num_queries = outputs["pred_logits"].shape[:2] 140 | 141 | # We flatten to compute the cost matrices in a batch 142 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] 143 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # 
[batch_size * num_queries, 4] 144 | 145 | # Also concat the target labels and boxes 146 | tgt_ids = torch.cat([v["labels"] for v in targets]) 147 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 148 | 149 | # Compute the classification cost. 150 | alpha = self.focal_alpha 151 | gamma = 2.0 152 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 153 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 154 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 155 | 156 | # Compute the L1 cost between boxes 157 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 158 | 159 | # Compute the giou cost betwen boxes 160 | cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) 161 | 162 | # Final cost matrix 163 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 164 | C = C.view(bs, num_queries, -1) 165 | 166 | sizes = [len(v["boxes"]) for v in targets] 167 | indices = [] 168 | device = C.device 169 | for i, (c, _size) in enumerate(zip(C.split(sizes, -1), sizes)): 170 | weight_mat = c[i] 171 | idx_i = weight_mat.min(0)[1] 172 | idx_j = torch.arange(_size).to(device) 173 | indices.append((idx_i, idx_j)) 174 | 175 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 176 | 177 | 178 | def build_matcher(args): 179 | assert args.matcher_type in ['HungarianMatcher', 'SimpleMinsumMatcher'], "Unknown args.matcher_type: {}".format(args.matcher_type) 180 | if args.matcher_type == 'HungarianMatcher': 181 | return HungarianMatcher( 182 | cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou, 183 | focal_alpha=args.focal_alpha 184 | ) 185 | elif args.matcher_type == 'SimpleMinsumMatcher': 186 | return SimpleMinsumMatcher( 187 | cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou, 188 | focal_alpha=args.focal_alpha 189 | ) 190 | else: 191 | raise NotImplementedError("Unknown args.matcher_type: {}".format(args.matcher_type)) -------------------------------------------------------------------------------- /models/dino/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /models/dino/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /models/dino/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
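The pure-PyTorch fallback `ms_deform_attn_core_pytorch` above mirrors the CUDA kernel and is convenient for shape checks. A minimal sketch with toy sizes follows (all numbers are illustrative assumptions; note that importing this module also requires the compiled `MultiScaleDeformableAttention` extension because of its top-level import).

```python
import torch
from models.dino.ops.functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

N, M, D = 1, 8, 32                     # batch, heads, channels per head
Lq, L, P = 100, 2, 4                   # queries, levels, sampling points per level
shapes = torch.as_tensor([(32, 32), (16, 16)], dtype=torch.long)
S = int(shapes.prod(1).sum())          # total number of value positions across levels

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)   # normalized to [0, 1]
attention_weights = torch.softmax(torch.rand(N, Lq, M, L * P), -1).view(N, Lq, M, L, P)

out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)                       # (N, Lq, M * D) -> torch.Size([1, 100, 256])
```

`test.py` further down compares this reference implementation against the CUDA kernel in both float and double precision.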
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | 11 | # TORCH_CUDA_ARCH_LIST="8.0" CUDA_HOME='/path/to/your/cuda/dir' 12 | python setup.py build install 13 | -------------------------------------------------------------------------------- /models/dino/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | -------------------------------------------------------------------------------- /models/dino/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import warnings 14 | import math 15 | 16 | import torch 17 | from torch import nn 18 | import torch.nn.functional as F 19 | from torch.nn.init import xavier_uniform_, constant_ 20 | 21 | from ..functions import MSDeformAttnFunction 22 | 23 | 24 | def _is_power_of_2(n): 25 | if (not isinstance(n, int)) or (n < 0): 26 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 27 | return (n & (n-1) == 0) and n != 0 28 | 29 | 30 | class MSDeformAttn(nn.Module): 31 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 32 | """ 33 | Multi-Scale Deformable Attention Module 34 | :param d_model hidden dimension 35 | :param n_levels number of feature levels 36 | :param n_heads number of attention heads 37 | :param n_points number of sampling points per attention head per feature level 38 | """ 39 | super().__init__() 40 | if d_model % n_heads != 0: 41 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 42 | _d_per_head = d_model // n_heads 43 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 44 | if not _is_power_of_2(_d_per_head): 45 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 46 | 
"which is more efficient in our CUDA implementation.") 47 | 48 | self.im2col_step = 64 49 | 50 | self.d_model = d_model 51 | self.n_levels = n_levels 52 | self.n_heads = n_heads 53 | self.n_points = n_points 54 | 55 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 56 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 57 | self.value_proj = nn.Linear(d_model, d_model) 58 | self.output_proj = nn.Linear(d_model, d_model) 59 | 60 | self._reset_parameters() 61 | 62 | def _reset_parameters(self): 63 | constant_(self.sampling_offsets.weight.data, 0.) 64 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 65 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 66 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 67 | for i in range(self.n_points): 68 | grid_init[:, :, i, :] *= i + 1 69 | with torch.no_grad(): 70 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 71 | constant_(self.attention_weights.weight.data, 0.) 72 | constant_(self.attention_weights.bias.data, 0.) 73 | xavier_uniform_(self.value_proj.weight.data) 74 | constant_(self.value_proj.bias.data, 0.) 75 | xavier_uniform_(self.output_proj.weight.data) 76 | constant_(self.output_proj.bias.data, 0.) 77 | 78 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 79 | """ 80 | :param query (N, Length_{query}, C) 81 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 82 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 83 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 84 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 85 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 86 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 87 | 88 | :return output (N, Length_{query}, C) 89 | """ 90 | N, Len_q, _ = query.shape 91 | N, Len_in, _ = input_flatten.shape 92 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 93 | 94 | value = self.value_proj(input_flatten) 95 | if input_padding_mask is not None: 96 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 97 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 98 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 99 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 100 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 101 | # N, Len_q, n_heads, n_levels, n_points, 2 102 | if reference_points.shape[-1] == 2: 103 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 104 | sampling_locations = reference_points[:, :, None, :, None, :] \ 105 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 106 | elif reference_points.shape[-1] == 4: 107 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 108 | + sampling_offsets / self.n_points * reference_points[:, :, 
None, :, None, 2:] * 0.5 109 | else: 110 | raise ValueError( 111 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 112 | 113 | # for amp 114 | if value.dtype == torch.float16: 115 | # for mixed precision 116 | output = MSDeformAttnFunction.apply( 117 | value.to(torch.float32), input_spatial_shapes, input_level_start_index, sampling_locations.to(torch.float32), attention_weights, self.im2col_step) 118 | output = output.to(torch.float16) 119 | output = self.output_proj(output) 120 | return output 121 | 122 | 123 | output = MSDeformAttnFunction.apply( 124 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 125 | output = self.output_proj(output) 126 | return output 127 | -------------------------------------------------------------------------------- /models/dino/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | 37 | 38 | if torch.cuda.is_available() and CUDA_HOME is not None: 39 | extension = CUDAExtension 40 | sources += source_cuda 41 | define_macros += [("WITH_CUDA", None)] 42 | extra_compile_args["nvcc"] = [ 43 | "-DCUDA_HAS_FP16=1", 44 | "-D__CUDA_NO_HALF_OPERATORS__", 45 | "-D__CUDA_NO_HALF_CONVERSIONS__", 46 | "-D__CUDA_NO_HALF2_OPERATORS__", 47 | ] 48 | else: 49 | raise NotImplementedError('Cuda is not availabel') 50 | 51 | sources = [os.path.join(extensions_dir, s) for s in sources] 52 | include_dirs = [extensions_dir] 53 | ext_modules = [ 54 | extension( 55 | "MultiScaleDeformableAttention", 56 | sources, 57 | include_dirs=include_dirs, 58 | define_macros=define_macros, 59 | extra_compile_args=extra_compile_args, 60 | ) 61 | ] 62 | return ext_modules 63 | 64 | setup( 65 | name="MultiScaleDeformableAttention", 66 | version="1.0", 67 | author="Weijie Su", 68 | url="https://github.com/fundamentalvision/Deformable-DETR", 69 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 70 | packages=find_packages(exclude=("configs", "tests",)), 71 | ext_modules=get_extensions(), 72 | 
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 73 | ) 74 | -------------------------------------------------------------------------------- /models/dino/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /models/dino/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /models/dino/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
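Once the extension has been built via `make.sh`/`setup.py` above, the `MSDeformAttn` module from `ms_deform_attn.py` can be exercised end to end. A minimal GPU sketch, with sizes chosen only for illustration (the shapes follow the module's `forward()` docstring):

```python
import torch
from models.dino.ops.modules import MSDeformAttn

spatial_shapes = torch.as_tensor([(64, 64), (32, 32), (16, 16), (8, 8)], dtype=torch.long).cuda()
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)),
                               spatial_shapes.prod(1).cumsum(0)[:-1]))
len_in = int(spatial_shapes.prod(1).sum())

attn = MSDeformAttn(d_model=256, n_levels=4, n_heads=8, n_points=4).cuda()
query = torch.rand(2, 900, 256).cuda()               # (N, Len_q, C)
reference_points = torch.rand(2, 900, 4, 2).cuda()   # normalized (x, y) per level
src_flatten = torch.rand(2, len_in, 256).cuda()      # (N, sum_l H_l * W_l, C)

out = attn(query, reference_points, src_flatten, spatial_shapes, level_start_index)
print(out.shape)                                     # torch.Size([2, 900, 256])
```

If `reference_points` instead has shape (N, Len_q, n_levels, 4), the last two coordinates are treated as (w, h) and the sampling offsets are scaled by them, as handled in the second branch of `forward()`.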
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data(), 68 | level_start_index.data(), 69 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 | 
const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | grad_output_g.data(), 137 | value.data() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data(), 139 | level_start_index.data(), 140 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /models/dino/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/dino/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /models/dino/ops/src/vision.cpp: 
-------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models/dino/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, 
Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /models/dino/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # DINO 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Conditional DETR 7 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------ 10 | # Copied from DETR (https://github.com/facebookresearch/detr) 11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 12 | # ------------------------------------------------------------------------ 13 | 14 | """ 15 | Various positional encodings for the transformer. 16 | """ 17 | import math 18 | import torch 19 | from torch import nn 20 | 21 | from util.misc import NestedTensor 22 | 23 | 24 | class PositionEmbeddingSine(nn.Module): 25 | """ 26 | This is a more standard version of the position embedding, very similar to the one 27 | used by the Attention is all you need paper, generalized to work on images. 
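Concretely, for each (optionally normalized) x/y coordinate, channel 2i holds sin(coord / T^(2i/d)) and channel 2i+1 holds cos(coord / T^(2i/d)), with T = temperature and d = num_pos_feats; the y and x halves are concatenated, giving 2 * num_pos_feats channels per location.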
28 | """ 29 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 30 | super().__init__() 31 | self.num_pos_feats = num_pos_feats 32 | self.temperature = temperature 33 | self.normalize = normalize 34 | if scale is not None and normalize is False: 35 | raise ValueError("normalize should be True if scale is passed") 36 | if scale is None: 37 | scale = 2 * math.pi 38 | self.scale = scale 39 | 40 | def forward(self, tensor_list: NestedTensor): 41 | x = tensor_list.tensors 42 | mask = tensor_list.mask 43 | assert mask is not None 44 | not_mask = ~mask 45 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 46 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 47 | if self.normalize: 48 | eps = 1e-6 49 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 50 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 51 | 52 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 53 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 54 | 55 | pos_x = x_embed[:, :, :, None] / dim_t 56 | pos_y = y_embed[:, :, :, None] / dim_t 57 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 58 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 59 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 60 | return pos 61 | 62 | class PositionEmbeddingSineHW(nn.Module): 63 | """ 64 | This is a more standard version of the position embedding, very similar to the one 65 | used by the Attention is all you need paper, generalized to work on images. 66 | """ 67 | def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None): 68 | super().__init__() 69 | self.num_pos_feats = num_pos_feats 70 | self.temperatureH = temperatureH 71 | self.temperatureW = temperatureW 72 | self.normalize = normalize 73 | if scale is not None and normalize is False: 74 | raise ValueError("normalize should be True if scale is passed") 75 | if scale is None: 76 | scale = 2 * math.pi 77 | self.scale = scale 78 | 79 | def forward(self, tensor_list: NestedTensor): 80 | x = tensor_list.tensors 81 | mask = tensor_list.mask 82 | assert mask is not None 83 | not_mask = ~mask 84 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 85 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 86 | 87 | 88 | 89 | if self.normalize: 90 | eps = 1e-6 91 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 92 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 93 | 94 | dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 95 | dim_tx = self.temperatureW ** (2 * (dim_tx // 2) / self.num_pos_feats) 96 | pos_x = x_embed[:, :, :, None] / dim_tx 97 | 98 | dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 99 | dim_ty = self.temperatureH ** (2 * (dim_ty // 2) / self.num_pos_feats) 100 | pos_y = y_embed[:, :, :, None] / dim_ty 101 | 102 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 103 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 104 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 105 | 106 | 107 | 108 | return pos 109 | 110 | class PositionEmbeddingLearned(nn.Module): 111 | """ 112 | Absolute pos embedding, learned. 
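Row and column indices are embedded separately (one nn.Embedding with 50 entries each) and concatenated, so feature maps larger than 50 x 50 positions are not supported without enlarging the embedding tables.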
113 | """ 114 | def __init__(self, num_pos_feats=256): 115 | super().__init__() 116 | self.row_embed = nn.Embedding(50, num_pos_feats) 117 | self.col_embed = nn.Embedding(50, num_pos_feats) 118 | self.reset_parameters() 119 | 120 | def reset_parameters(self): 121 | nn.init.uniform_(self.row_embed.weight) 122 | nn.init.uniform_(self.col_embed.weight) 123 | 124 | def forward(self, tensor_list: NestedTensor): 125 | x = tensor_list.tensors 126 | h, w = x.shape[-2:] 127 | i = torch.arange(w, device=x.device) 128 | j = torch.arange(h, device=x.device) 129 | x_emb = self.col_embed(i) 130 | y_emb = self.row_embed(j) 131 | pos = torch.cat([ 132 | x_emb.unsqueeze(0).repeat(h, 1, 1), 133 | y_emb.unsqueeze(1).repeat(1, w, 1), 134 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 135 | return pos 136 | 137 | 138 | def build_position_encoding(args): 139 | N_steps = args.hidden_dim // 2 140 | if args.position_embedding in ('v2', 'sine'): 141 | # TODO find a better way of exposing other arguments 142 | position_embedding = PositionEmbeddingSineHW( 143 | N_steps, 144 | temperatureH=args.pe_temperatureH, 145 | temperatureW=args.pe_temperatureW, 146 | normalize=True 147 | ) 148 | elif args.position_embedding in ('v3', 'learned'): 149 | position_embedding = PositionEmbeddingLearned(N_steps) 150 | else: 151 | raise ValueError(f"not supported {args.position_embedding}") 152 | 153 | return position_embedding 154 | -------------------------------------------------------------------------------- /models/dino/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # DINO 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | from torch import nn, Tensor 9 | 10 | import math 11 | import torch.nn.functional as F 12 | from torch import nn 13 | 14 | 15 | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor, learnedwh=None): 16 | """ 17 | Input: 18 | - memory: bs, \sum{hw}, d_model 19 | - memory_padding_mask: bs, \sum{hw} 20 | - spatial_shapes: nlevel, 2 21 | - learnedwh: 2 22 | Output: 23 | - output_memory: bs, \sum{hw}, d_model 24 | - output_proposals: bs, \sum{hw}, 4 25 | """ 26 | N_, S_, C_ = memory.shape 27 | base_scale = 4.0 28 | proposals = [] 29 | _cur = 0 30 | for lvl, (H_, W_) in enumerate(spatial_shapes): 31 | mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1) 32 | valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) 33 | valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) 34 | 35 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), 36 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) 37 | grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 38 | 39 | scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) 40 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale 41 | 42 | if learnedwh is not None: 43 | wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0 ** lvl) 44 | else: 45 | wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl) 46 | 47 | proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) 48 | proposals.append(proposal) 49 | _cur += (H_ * W_) 50 | 51 | output_proposals = 
torch.cat(proposals, 1) 52 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) 53 | output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid 54 | output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) 55 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) 56 | 57 | output_memory = memory 58 | output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) 59 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) 60 | 61 | return output_memory, output_proposals 62 | 63 | 64 | class RandomBoxPerturber(): 65 | def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None: 66 | self.noise_scale = torch.Tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale]) 67 | 68 | def __call__(self, refanchors: Tensor) -> Tensor: 69 | nq, bs, query_dim = refanchors.shape 70 | device = refanchors.device 71 | 72 | noise_raw = torch.rand_like(refanchors) 73 | noise_scale = self.noise_scale.to(device)[:query_dim] 74 | 75 | new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale) 76 | return new_refanchors.clamp_(0, 1) 77 | 78 | 79 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): 80 | """ 81 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 82 | Args: 83 | inputs: A float tensor of arbitrary shape. 84 | The predictions for each example. 85 | targets: A float tensor with the same shape as inputs. Stores the binary 86 | classification label for each element in inputs 87 | (0 for the negative class and 1 for the positive class). 88 | alpha: (optional) Weighting factor in range (0,1) to balance 89 | positive vs negative examples. Default = -1 (no weighting). 90 | gamma: Exponent of the modulating factor (1 - p_t) to 91 | balance easy vs hard examples. 
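In symbols, with p = sigmoid(inputs) and p_t = p * targets + (1 - p) * (1 - targets), the per-element loss is -alpha_t * (1 - p_t) ** gamma * log(p_t); the final value is reduced as loss.mean(1).sum() / num_boxes.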
92 | Returns: 93 | Loss tensor 94 | """ 95 | prob = inputs.sigmoid() 96 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 97 | p_t = prob * targets + (1 - prob) * (1 - targets) 98 | loss = ce_loss * ((1 - p_t) ** gamma) 99 | 100 | if alpha >= 0: 101 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 102 | loss = alpha_t * loss 103 | 104 | return loss.mean(1).sum() / num_boxes 105 | 106 | 107 | class MLP(nn.Module): 108 | """ Very simple multi-layer perceptron (also called FFN)""" 109 | 110 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 111 | super().__init__() 112 | self.num_layers = num_layers 113 | h = [hidden_dim] * (num_layers - 1) 114 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 115 | 116 | def forward(self, x): 117 | for i, layer in enumerate(self.layers): 118 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 119 | return x 120 | 121 | 122 | def _get_activation_fn(activation, d_model=256, batch_dim=0): 123 | """Return an activation function given a string""" 124 | if activation == "relu": 125 | return F.relu 126 | if activation == "gelu": 127 | return F.gelu 128 | if activation == "glu": 129 | return F.glu 130 | if activation == "prelu": 131 | return nn.PReLU() 132 | if activation == "selu": 133 | return F.selu 134 | 135 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 136 | 137 | 138 | def gen_sineembed_for_position(pos_tensor): 139 | # n_query, bs, _ = pos_tensor.size() 140 | # sineembed_tensor = torch.zeros(n_query, bs, 256) 141 | scale = 2 * math.pi 142 | dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) 143 | dim_t = 10000 ** (2 * (dim_t // 2) / 128) 144 | x_embed = pos_tensor[:, :, 0] * scale 145 | y_embed = pos_tensor[:, :, 1] * scale 146 | pos_x = x_embed[:, :, None] / dim_t 147 | pos_y = y_embed[:, :, None] / dim_t 148 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 149 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) 150 | if pos_tensor.size(-1) == 2: 151 | pos = torch.cat((pos_y, pos_x), dim=2) 152 | elif pos_tensor.size(-1) == 4: 153 | w_embed = pos_tensor[:, :, 2] * scale 154 | pos_w = w_embed[:, :, None] / dim_t 155 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) 156 | 157 | h_embed = pos_tensor[:, :, 3] * scale 158 | pos_h = h_embed[:, :, None] / dim_t 159 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) 160 | 161 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) 162 | else: 163 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) 164 | return pos -------------------------------------------------------------------------------- /models/registry.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Yihao Chen 3 | # @Date: 2021-08-16 16:03:17 4 | # @Last Modified by: Shilong Liu 5 | # @Last Modified time: 2022-01-23 15:26 6 | # modified from mmcv 7 | 8 | import inspect 9 | from functools import partial 10 | 11 | 12 | class Registry(object): 13 | 14 | def __init__(self, name): 15 | self._name = name 16 | self._module_dict = dict() 17 | 18 | def __repr__(self): 19 | format_str = self.__class__.__name__ + '(name={}, items={})'.format( 20 | self._name, list(self._module_dict.keys())) 21 | return format_str 22 | 23 | def 
__len__(self): 24 | return len(self._module_dict) 25 | 26 | @property 27 | def name(self): 28 | return self._name 29 | 30 | @property 31 | def module_dict(self): 32 | return self._module_dict 33 | 34 | def get(self, key): 35 | return self._module_dict.get(key, None) 36 | 37 | def registe_with_name(self, module_name=None, force=False): 38 | return partial(self.register, module_name=module_name, force=force) 39 | 40 | def register(self, module_build_function, module_name=None, force=False): 41 | """Register a module build function. 42 | Args: 43 | module (:obj:`nn.Module`): Module to be registered. 44 | """ 45 | if not inspect.isfunction(module_build_function): 46 | raise TypeError('module_build_function must be a function, but got {}'.format( 47 | type(module_build_function))) 48 | if module_name is None: 49 | module_name = module_build_function.__name__ 50 | if not force and module_name in self._module_dict: 51 | raise KeyError('{} is already registered in {}'.format( 52 | module_name, self.name)) 53 | self._module_dict[module_name] = module_build_function 54 | 55 | return module_build_function 56 | 57 | MODULE_BUILD_FUNCS = Registry('model build functions') 58 | 59 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI&egg=pycocotools 3 | submitit 4 | torch>=1.5.0 5 | torchvision>=0.6.0 6 | git+https://github.com/cocodataset/panopticapi.git#egg=panopticapi 7 | scipy 8 | termcolor 9 | addict 10 | yapf 11 | timm -------------------------------------------------------------------------------- /run_with_submitit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | A script to run multinode training with submitit. 4 | """ 5 | import argparse 6 | import os, sys 7 | import uuid 8 | from pathlib import Path 9 | 10 | import main as detection 11 | import submitit 12 | 13 | 14 | def parse_args(): 15 | detection_parser = detection.get_args_parser() 16 | parser = argparse.ArgumentParser("Submitit for detection", parents=[detection_parser]) 17 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 18 | parser.add_argument("--nodes", default=1, type=int, help="Number of nodes to request") 19 | parser.add_argument("--timeout", default=60, type=int, help="Duration of the job") 20 | parser.add_argument("--cpus_per_task", default=16, type=int, help="Duration of the job") 21 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 22 | parser.add_argument("--job_name", type=str, help="Job name.") 23 | parser.add_argument("--qos", type=str, default=None, help="specify preemptive QOS.") 24 | parser.add_argument("--requeue", action='store_true', help="job requeue if preempted.") 25 | parser.add_argument("--mail_type", type=str, default='ALL', help=" send email when job begins, ends, fails or preempted.") 26 | parser.add_argument("--mail_user", type=str, default='', help=" email address.") 27 | # refer to https://slurm.schedmd.com/sbatch.html & \ 28 | # https://github.com/facebookincubator/submitit/blob/11d8f87f785669e8a01aa9773a107f9180a63b09/submitit/slurm/slurm.py \ 29 | # for more details about parameters of slurm. 
30 | return parser.parse_args() 31 | 32 | 33 | def get_shared_folder() -> Path: 34 | user = os.getenv("USER") 35 | if Path("/comp_robot").is_dir(): 36 | p = Path(f"/comp_robot/{user}/experiments") 37 | p.mkdir(exist_ok=True) 38 | return p 39 | raise RuntimeError("No shared folder available") 40 | 41 | 42 | def get_init_file(): 43 | # Init file must not exist, but it's parent dir must exist. 44 | os.makedirs(str(get_shared_folder()), exist_ok=True) 45 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 46 | if init_file.exists(): 47 | os.remove(str(init_file)) 48 | return init_file 49 | 50 | 51 | class Trainer(object): 52 | def __init__(self, args): 53 | self.args = args 54 | 55 | def __call__(self): 56 | self._setup_gpu_args() 57 | detection.main(self.args) 58 | 59 | def checkpoint(self): 60 | import os 61 | import submitit 62 | 63 | checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth") 64 | if os.path.exists(checkpoint_file): 65 | self.args.resume = checkpoint_file 66 | print("Requeuing ", self.args) 67 | empty_trainer = type(self)(self.args) 68 | return submitit.helpers.DelayedSubmission(empty_trainer) 69 | 70 | def _setup_gpu_args(self): 71 | import submitit 72 | 73 | job_env = submitit.JobEnvironment() 74 | self.args.output_dir = self.args.job_dir 75 | self.args.output_dir = str(self.args.output_dir).replace("%j", str(job_env.job_id)) 76 | self.args.gpu = job_env.local_rank 77 | self.args.rank = job_env.global_rank 78 | self.args.world_size = job_env.num_tasks 79 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 80 | 81 | 82 | 83 | def main(): 84 | args = parse_args() 85 | args.commad_txt = "Command: "+' '.join(sys.argv) 86 | if args.job_dir == "": 87 | raise ValueError("You must set job_dir mannually.") 88 | 89 | # Note that the folder will depend on the job_id, to easily track experiments 90 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 91 | 92 | # cluster setup is defined by environment variables 93 | num_gpus_per_node = args.ngpus 94 | nodes = args.nodes 95 | timeout_min = args.timeout 96 | qos = args.qos 97 | 98 | additional_parameters = { 99 | 'mail-user': args.mail_user, 100 | 'mail-type': args.mail_type, 101 | } 102 | if args.requeue: 103 | additional_parameters['requeue'] = args.requeue 104 | 105 | 106 | executor.update_parameters( 107 | mem_gb=50 * num_gpus_per_node, 108 | gpus_per_node=num_gpus_per_node, 109 | tasks_per_node=num_gpus_per_node, # one task per GPU 110 | cpus_per_task=16, 111 | nodes=nodes, 112 | timeout_min=timeout_min, # max is 60 * 72 113 | qos=qos, 114 | slurm_additional_parameters=additional_parameters 115 | ) 116 | 117 | executor.update_parameters(name=args.job_name) 118 | args.dist_url = get_init_file().as_uri() 119 | 120 | # run and submit 121 | trainer = Trainer(args) 122 | job = executor.submit(trainer) 123 | 124 | print("Submitted job_id:", job.job_id) 125 | 126 | 127 | if __name__ == "__main__": 128 | main() 129 | -------------------------------------------------------------------------------- /scripts/DINO_eval.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | checkpoint=$2 3 | python main.py \ 4 | --output_dir logs/DINO/R50-MS4-%j \ 5 | -c config/DINO/DINO_4scale.py --coco_path $coco_path \ 6 | --eval --resume $checkpoint \ 7 | --options dn_scalar=100 embed_init_tgt=TRUE \ 8 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 9 | dn_box_noise_scale=1.0 10 | 
-------------------------------------------------------------------------------- /scripts/DINO_eval_dist.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | checkpoint=$2 3 | python -m torch.distributed.launch --nproc_per_node=8 main.py \ 4 | --output_dir logs/DINO/R50-MS4-%j \ 5 | -c config/DINO/DINO_4scale.py --coco_path $coco_path \ 6 | --eval --resume $checkpoint \ 7 | --options dn_scalar=100 embed_init_tgt=TRUE \ 8 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 9 | dn_box_noise_scale=1.0 10 | -------------------------------------------------------------------------------- /scripts/DINO_eval_submitit.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | checkpoint=$2 3 | python run_with_submitit.py --timeout 3000 --job_name DINO \ 4 | --job_dir logs/DINO/R50-MS4-%j --ngpus 8 --nodes 1 \ 5 | -c config/DINO/DINO_4scale.py --coco_path $coco_path \ 6 | --eval --resume $checkpoint \ 7 | --options dn_scalar=100 embed_init_tgt=TRUE \ 8 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 9 | dn_box_noise_scale=1.0 10 | -------------------------------------------------------------------------------- /scripts/DINO_eval_submitit_5scale.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | checkpoint=$2 3 | python run_with_submitit.py --timeout 3000 --job_name DINO \ 4 | --job_dir logs/DINO/R50-MS5-%j --ngpus 8 --nodes 1 \ 5 | -c config/DINO/DINO_5scale.py --coco_path $coco_path \ 6 | --eval --resume $checkpoint \ 7 | --options dn_scalar=100 embed_init_tgt=TRUE \ 8 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 9 | dn_box_noise_scale=1.0 10 | -------------------------------------------------------------------------------- /scripts/DINO_train.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | python main.py \ 3 | --output_dir logs/DINO/R50-MS4 -c config/DINO/DINO_4scale.py --coco_path $coco_path \ 4 | --options dn_scalar=100 embed_init_tgt=TRUE \ 5 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 6 | dn_box_noise_scale=1.0 7 | -------------------------------------------------------------------------------- /scripts/DINO_train_convnext.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | backbone_dir=$2 3 | export CUDA_VISIBLE_DEVICES=$3 && python main.py \ 4 | --output_dir logs/DINO/R50-MS4 -c config/DINO/DINO_4scale_convnext.py --coco_path $coco_path \ 5 | --options dn_scalar=100 embed_init_tgt=TRUE \ 6 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 7 | dn_box_noise_scale=1.0 backbone_dir=$backbone_dir -------------------------------------------------------------------------------- /scripts/DINO_train_dist.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | python -m torch.distributed.launch --nproc_per_node=8 main.py \ 3 | --output_dir logs/DINO/R50-MS4 -c config/DINO/DINO_4scale.py --coco_path $coco_path \ 4 | --options dn_scalar=100 embed_init_tgt=TRUE \ 5 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 6 | dn_box_noise_scale=1.0 7 | -------------------------------------------------------------------------------- /scripts/DINO_train_submitit.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | python run_with_submitit.py --timeout 3000 --job_name DINO \ 3 | --job_dir logs/DINO/R50-MS4-%j --ngpus 8 
--nodes 1 \ 4 | -c config/DINO/DINO_4scale.py --coco_path $coco_path \ 5 | --options dn_scalar=100 embed_init_tgt=TRUE \ 6 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 7 | dn_box_noise_scale=1.0 8 | -------------------------------------------------------------------------------- /scripts/DINO_train_submitit_5scale.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | python run_with_submitit.py --timeout 3000 --job_name DINO \ 3 | --job_dir logs/DINO/R50-MS5-%j --ngpus 8 --nodes 2 \ 4 | -c config/DINO/DINO_5scale.py --coco_path $coco_path \ 5 | --options dn_scalar=100 embed_init_tgt=TRUE \ 6 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 7 | dn_box_noise_scale=1.0 8 | -------------------------------------------------------------------------------- /scripts/DINO_train_submitit_convnext.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | backbone_dir=$2 3 | python run_with_submitit.py --timeout 3000 --job_name DINO \ 4 | --job_dir logs/DINO/R50-MS4-%j --ngpus 8 --nodes 1 \ 5 | -c config/DINO/DINO_4scale_convnext.py --coco_path $coco_path \ 6 | --options dn_scalar=100 embed_init_tgt=TRUE \ 7 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 8 | dn_box_noise_scale=1.0 backbone_dir=$backbone_dir 9 | -------------------------------------------------------------------------------- /scripts/DINO_train_submitit_swin.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | backbone_dir=$2 3 | python run_with_submitit.py --timeout 3000 --job_name DINO \ 4 | --job_dir logs/DINO/R50-MS4-%j --ngpus 8 --nodes 1 \ 5 | -c config/DINO/DINO_4scale_swin.py --coco_path $coco_path \ 6 | --options dn_scalar=100 embed_init_tgt=TRUE \ 7 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 8 | dn_box_noise_scale=1.0 backbone_dir=$backbone_dir 9 | -------------------------------------------------------------------------------- /scripts/DINO_train_swin.sh: -------------------------------------------------------------------------------- 1 | coco_path=$1 2 | backbone_dir=$2 3 | export CUDA_VISIBLE_DEVICES=$3 && python main.py \ 4 | --output_dir logs/DINO/R50-MS4 -c config/DINO/DINO_4scale_swin.py --coco_path $coco_path \ 5 | --options dn_scalar=100 embed_init_tgt=TRUE \ 6 | dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \ 7 | dn_box_noise_scale=1.0 backbone_dir=$backbone_dir 8 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | We provide a script to calculate model size, GFLOPS, and FPS. 2 | 3 | An example of how to use it: 4 | ```bash 5 | python tools/benchmark.py \ 6 | --output_dir logs/test_flops \ 7 | -c config/DINO/DINO_4scale.py \ 8 | --options batch_size=1 \ 9 | --coco_path /path/to/your/coco/dir 10 | ``` 11 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
All Rights Reserved 2 | -------------------------------------------------------------------------------- /util/box_loss.py: -------------------------------------------------------------------------------- 1 | # borrow from https://github.com/Zzh-tju/CIoU/blob/master/layers/modules/multibox_loss.py 2 | 3 | import torch, math 4 | 5 | 6 | 7 | def ciou(bboxes1, bboxes2): 8 | bboxes1 = torch.sigmoid(bboxes1) 9 | bboxes2 = torch.sigmoid(bboxes2) 10 | rows = bboxes1.shape[0] 11 | cols = bboxes2.shape[0] 12 | cious = torch.zeros((rows, cols)) 13 | if rows * cols == 0: 14 | return cious 15 | exchange = False 16 | if bboxes1.shape[0] > bboxes2.shape[0]: 17 | bboxes1, bboxes2 = bboxes2, bboxes1 18 | cious = torch.zeros((cols, rows)) 19 | exchange = True 20 | w1 = torch.exp(bboxes1[:, 2]) 21 | h1 = torch.exp(bboxes1[:, 3]) 22 | w2 = torch.exp(bboxes2[:, 2]) 23 | h2 = torch.exp(bboxes2[:, 3]) 24 | area1 = w1 * h1 25 | area2 = w2 * h2 26 | center_x1 = bboxes1[:, 0] 27 | center_y1 = bboxes1[:, 1] 28 | center_x2 = bboxes2[:, 0] 29 | center_y2 = bboxes2[:, 1] 30 | 31 | inter_l = torch.max(center_x1 - w1 / 2,center_x2 - w2 / 2) 32 | inter_r = torch.min(center_x1 + w1 / 2,center_x2 + w2 / 2) 33 | inter_t = torch.max(center_y1 - h1 / 2,center_y2 - h2 / 2) 34 | inter_b = torch.min(center_y1 + h1 / 2,center_y2 + h2 / 2) 35 | inter_area = torch.clamp((inter_r - inter_l),min=0) * torch.clamp((inter_b - inter_t),min=0) 36 | 37 | c_l = torch.min(center_x1 - w1 / 2,center_x2 - w2 / 2) 38 | c_r = torch.max(center_x1 + w1 / 2,center_x2 + w2 / 2) 39 | c_t = torch.min(center_y1 - h1 / 2,center_y2 - h2 / 2) 40 | c_b = torch.max(center_y1 + h1 / 2,center_y2 + h2 / 2) 41 | 42 | inter_diag = (center_x2 - center_x1)**2 + (center_y2 - center_y1)**2 43 | c_diag = torch.clamp((c_r - c_l),min=0)**2 + torch.clamp((c_b - c_t),min=0)**2 44 | 45 | union = area1+area2-inter_area 46 | u = (inter_diag) / c_diag 47 | iou = inter_area / union 48 | v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(w2 / h2) - torch.atan(w1 / h1)), 2) 49 | with torch.no_grad(): 50 | S = (iou>0.5).float() 51 | alpha= S*v/(1-iou+v) 52 | cious = iou - u - alpha * v 53 | cious = torch.clamp(cious,min=-1.0,max = 1.0) 54 | if exchange: 55 | cious = cious.T 56 | return 1-cious 57 | 58 | def diou(bboxes1, bboxes2): 59 | bboxes1 = torch.sigmoid(bboxes1) 60 | bboxes2 = torch.sigmoid(bboxes2) 61 | rows = bboxes1.shape[0] 62 | cols = bboxes2.shape[0] 63 | cious = torch.zeros((rows, cols)) 64 | if rows * cols == 0: 65 | return cious 66 | exchange = False 67 | if bboxes1.shape[0] > bboxes2.shape[0]: 68 | bboxes1, bboxes2 = bboxes2, bboxes1 69 | cious = torch.zeros((cols, rows)) 70 | exchange = True 71 | w1 = torch.exp(bboxes1[:, 2]) 72 | h1 = torch.exp(bboxes1[:, 3]) 73 | w2 = torch.exp(bboxes2[:, 2]) 74 | h2 = torch.exp(bboxes2[:, 3]) 75 | area1 = w1 * h1 76 | area2 = w2 * h2 77 | center_x1 = bboxes1[:, 0] 78 | center_y1 = bboxes1[:, 1] 79 | center_x2 = bboxes2[:, 0] 80 | center_y2 = bboxes2[:, 1] 81 | 82 | inter_l = torch.max(center_x1 - w1 / 2,center_x2 - w2 / 2) 83 | inter_r = torch.min(center_x1 + w1 / 2,center_x2 + w2 / 2) 84 | inter_t = torch.max(center_y1 - h1 / 2,center_y2 - h2 / 2) 85 | inter_b = torch.min(center_y1 + h1 / 2,center_y2 + h2 / 2) 86 | inter_area = torch.clamp((inter_r - inter_l),min=0) * torch.clamp((inter_b - inter_t),min=0) 87 | 88 | c_l = torch.min(center_x1 - w1 / 2,center_x2 - w2 / 2) 89 | c_r = torch.max(center_x1 + w1 / 2,center_x2 + w2 / 2) 90 | c_t = torch.min(center_y1 - h1 / 2,center_y2 - h2 / 2) 91 | c_b = torch.max(center_y1 + h1 / 
2,center_y2 + h2 / 2) 92 | 93 | inter_diag = (center_x2 - center_x1)**2 + (center_y2 - center_y1)**2 94 | c_diag = torch.clamp((c_r - c_l),min=0)**2 + torch.clamp((c_b - c_t),min=0)**2 95 | 96 | union = area1+area2-inter_area 97 | u = (inter_diag) / c_diag 98 | iou = inter_area / union 99 | dious = iou - u 100 | dious = torch.clamp(dious,min=-1.0,max = 1.0) 101 | if exchange: 102 | dious = dious.T 103 | return 1-dious 104 | 105 | 106 | if __name__ == "__main__": 107 | x = torch.rand(10, 4) 108 | y = torch.rand(10,4) 109 | import ipdb;ipdb.set_trace() 110 | cxy = ciou(x, y) 111 | dxy = diou(x, y) 112 | print(cxy.shape, dxy.shape) 113 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 4 | """ 5 | import torch, os 6 | from torchvision.ops.boxes import box_area 7 | 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2): 25 | area1 = box_area(boxes1) 26 | area2 = box_area(boxes2) 27 | 28 | 29 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 30 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 31 | 32 | wh = (rb - lt).clamp(min=0) # [N,M,2] 33 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 34 | 35 | union = area1[:, None] + area2 - inter 36 | 37 | iou = inter / (union + 1e-6) 38 | return iou, union 39 | 40 | 41 | def generalized_box_iou(boxes1, boxes2): 42 | """ 43 | Generalized IoU from https://giou.stanford.edu/ 44 | 45 | The boxes should be in [x0, y0, x1, y1] format 46 | 47 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 48 | and M = len(boxes2) 49 | """ 50 | # degenerate boxes gives inf / nan results 51 | # so do an early check 52 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 53 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 54 | 55 | iou, union = box_iou(boxes1, boxes2) 56 | 57 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 58 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 59 | 60 | wh = (rb - lt).clamp(min=0) # [N,M,2] 61 | area = wh[:, :, 0] * wh[:, :, 1] 62 | 63 | return iou - (area - union) / (area + 1e-6) 64 | 65 | 66 | 67 | # modified from torchvision to also return the union 68 | def box_iou_pairwise(boxes1, boxes2): 69 | area1 = box_area(boxes1) 70 | area2 = box_area(boxes2) 71 | 72 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 73 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 74 | 75 | wh = (rb - lt).clamp(min=0) # [N,2] 76 | inter = wh[:, 0] * wh[:, 1] # [N] 77 | 78 | union = area1 + area2 - inter 79 | 80 | iou = inter / union 81 | return iou, union 82 | 83 | 84 | def generalized_box_iou_pairwise(boxes1, boxes2): 85 | """ 86 | Generalized IoU from https://giou.stanford.edu/ 87 | 88 | Input: 89 | - boxes1, boxes2: N,4 90 | Output: 91 | - giou: N, 4 92 | """ 93 | # degenerate boxes gives inf / nan results 94 | # so do an early check 95 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 96 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 97 | assert 
boxes1.shape == boxes2.shape 98 | iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 99 | 100 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 101 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 102 | 103 | wh = (rb - lt).clamp(min=0) # [N,2] 104 | area = wh[:, 0] * wh[:, 1] 105 | 106 | return iou - (area - union) / area 107 | 108 | def masks_to_boxes(masks): 109 | """Compute the bounding boxes around the provided masks 110 | 111 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 112 | 113 | Returns a [N, 4] tensors, with the boxes in xyxy format 114 | """ 115 | if masks.numel() == 0: 116 | return torch.zeros((0, 4), device=masks.device) 117 | 118 | h, w = masks.shape[-2:] 119 | 120 | y = torch.arange(0, h, dtype=torch.float) 121 | x = torch.arange(0, w, dtype=torch.float) 122 | y, x = torch.meshgrid(y, x) 123 | 124 | x_mask = (masks * x.unsqueeze(0)) 125 | x_max = x_mask.flatten(1).max(-1)[0] 126 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 127 | 128 | y_mask = (masks * y.unsqueeze(0)) 129 | y_max = y_mask.flatten(1).max(-1)[0] 130 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 131 | 132 | return torch.stack([x_min, y_min, x_max, y_max], 1) 133 | 134 | if __name__ == '__main__': 135 | x = torch.rand(5, 4) 136 | y = torch.rand(3, 4) 137 | iou, union = box_iou(x, y) 138 | import ipdb; ipdb.set_trace() -------------------------------------------------------------------------------- /util/coco_id2name.json: -------------------------------------------------------------------------------- 1 | {"1": "person", "2": "bicycle", "3": "car", "4": "motorcycle", "5": "airplane", "6": "bus", "7": "train", "8": "truck", "9": "boat", "10": "traffic light", "11": "fire hydrant", "13": "stop sign", "14": "parking meter", "15": "bench", "16": "bird", "17": "cat", "18": "dog", "19": "horse", "20": "sheep", "21": "cow", "22": "elephant", "23": "bear", "24": "zebra", "25": "giraffe", "27": "backpack", "28": "umbrella", "31": "handbag", "32": "tie", "33": "suitcase", "34": "frisbee", "35": "skis", "36": "snowboard", "37": "sports ball", "38": "kite", "39": "baseball bat", "40": "baseball glove", "41": "skateboard", "42": "surfboard", "43": "tennis racket", "44": "bottle", "46": "wine glass", "47": "cup", "48": "fork", "49": "knife", "50": "spoon", "51": "bowl", "52": "banana", "53": "apple", "54": "sandwich", "55": "orange", "56": "broccoli", "57": "carrot", "58": "hot dog", "59": "pizza", "60": "donut", "61": "cake", "62": "chair", "63": "couch", "64": "potted plant", "65": "bed", "67": "dining table", "70": "toilet", "72": "tv", "73": "laptop", "74": "mouse", "75": "remote", "76": "keyboard", "77": "cell phone", "78": "microwave", "79": "oven", "80": "toaster", "81": "sink", "82": "refrigerator", "84": "book", "85": "clock", "86": "vase", "87": "scissors", "88": "teddy bear", "89": "hair drier", "90": "toothbrush"} -------------------------------------------------------------------------------- /util/get_param_dicts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | def match_name_keywords(n: str, name_keywords: list): 7 | out = False 8 | for b in name_keywords: 9 | if b in n: 10 | out = True 11 | break 12 | return out 13 | 14 | 15 | def get_param_dict(args, model_without_ddp: nn.Module): 16 | try: 17 | param_dict_type = args.param_dict_type 18 | except: 19 | param_dict_type = 'default' 20 | assert 
param_dict_type in ['default', 'ddetr_in_mmdet', 'large_wd'] 21 | 22 | # by default 23 | if param_dict_type == 'default': 24 | param_dicts = [ 25 | {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]}, 26 | { 27 | "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad], 28 | "lr": args.lr_backbone, 29 | } 30 | ] 31 | return param_dicts 32 | 33 | if param_dict_type == 'ddetr_in_mmdet': 34 | param_dicts = [ 35 | { 36 | "params": 37 | [p for n, p in model_without_ddp.named_parameters() 38 | if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad], 39 | "lr": args.lr, 40 | }, 41 | { 42 | "params": [p for n, p in model_without_ddp.named_parameters() 43 | if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad], 44 | "lr": args.lr_backbone, 45 | }, 46 | { 47 | "params": [p for n, p in model_without_ddp.named_parameters() 48 | if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad], 49 | "lr": args.lr * args.lr_linear_proj_mult, 50 | } 51 | ] 52 | return param_dicts 53 | 54 | if param_dict_type == 'large_wd': 55 | param_dicts = [ 56 | { 57 | "params": 58 | [p for n, p in model_without_ddp.named_parameters() 59 | if not match_name_keywords(n, ['backbone']) and not match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 60 | }, 61 | { 62 | "params": [p for n, p in model_without_ddp.named_parameters() 63 | if match_name_keywords(n, ['backbone']) and match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 64 | "lr": args.lr_backbone, 65 | "weight_decay": 0.0, 66 | }, 67 | { 68 | "params": [p for n, p in model_without_ddp.named_parameters() 69 | if match_name_keywords(n, ['backbone']) and not match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 70 | "lr": args.lr_backbone, 71 | "weight_decay": args.weight_decay, 72 | }, 73 | { 74 | "params": 75 | [p for n, p in model_without_ddp.named_parameters() 76 | if not match_name_keywords(n, ['backbone']) and match_name_keywords(n, ['norm', 'bias']) and p.requires_grad], 77 | "lr": args.lr, 78 | "weight_decay": 0.0, 79 | } 80 | ] 81 | 82 | # print("param_dicts: {}".format(param_dicts)) 83 | 84 | return param_dicts -------------------------------------------------------------------------------- /util/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import functools 3 | import logging 4 | import os 5 | import sys 6 | from termcolor import colored 7 | 8 | 9 | class _ColorfulFormatter(logging.Formatter): 10 | def __init__(self, *args, **kwargs): 11 | self._root_name = kwargs.pop("root_name") + "." 12 | self._abbrev_name = kwargs.pop("abbrev_name", "") 13 | if len(self._abbrev_name): 14 | self._abbrev_name = self._abbrev_name + "." 
15 | super(_ColorfulFormatter, self).__init__(*args, **kwargs) 16 | 17 | def formatMessage(self, record): 18 | record.name = record.name.replace(self._root_name, self._abbrev_name) 19 | log = super(_ColorfulFormatter, self).formatMessage(record) 20 | if record.levelno == logging.WARNING: 21 | prefix = colored("WARNING", "red", attrs=["blink"]) 22 | elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: 23 | prefix = colored("ERROR", "red", attrs=["blink", "underline"]) 24 | else: 25 | return log 26 | return prefix + " " + log 27 | 28 | 29 | # so that calling setup_logger multiple times won't add many handlers 30 | @functools.lru_cache() 31 | def setup_logger( 32 | output=None, distributed_rank=0, *, color=True, name="imagenet", abbrev_name=None 33 | ): 34 | """ 35 | Initialize the detectron2 logger and set its verbosity level to "INFO". 36 | 37 | Args: 38 | output (str): a file name or a directory to save log. If None, will not save log file. 39 | If ends with ".txt" or ".log", assumed to be a file name. 40 | Otherwise, logs will be saved to `output/log.txt`. 41 | name (str): the root module name of this logger 42 | 43 | Returns: 44 | logging.Logger: a logger 45 | """ 46 | logger = logging.getLogger(name) 47 | logger.setLevel(logging.DEBUG) 48 | logger.propagate = False 49 | 50 | if abbrev_name is None: 51 | abbrev_name = name 52 | 53 | plain_formatter = logging.Formatter( 54 | '[%(asctime)s.%(msecs)03d]: %(message)s', 55 | datefmt='%m/%d %H:%M:%S' 56 | ) 57 | # stdout logging: master only 58 | if distributed_rank == 0: 59 | ch = logging.StreamHandler(stream=sys.stdout) 60 | ch.setLevel(logging.DEBUG) 61 | if color: 62 | formatter = _ColorfulFormatter( 63 | colored("[%(asctime)s.%(msecs)03d]: ", "green") + "%(message)s", 64 | datefmt="%m/%d %H:%M:%S", 65 | root_name=name, 66 | abbrev_name=str(abbrev_name), 67 | ) 68 | else: 69 | formatter = plain_formatter 70 | ch.setFormatter(formatter) 71 | logger.addHandler(ch) 72 | 73 | # file logging: all workers 74 | if output is not None: 75 | if output.endswith(".txt") or output.endswith(".log"): 76 | filename = output 77 | else: 78 | filename = os.path.join(output, "log.txt") 79 | if distributed_rank > 0: 80 | filename = filename + f".rank{distributed_rank}" 81 | os.makedirs(os.path.dirname(filename), exist_ok=True) 82 | 83 | fh = logging.StreamHandler(_cached_log_stream(filename)) 84 | fh.setLevel(logging.DEBUG) 85 | fh.setFormatter(plain_formatter) 86 | logger.addHandler(fh) 87 | 88 | return logger 89 | 90 | 91 | # cache the opened file object, so that different calls to `setup_logger` 92 | # with the same file name can safely write to the same file. 93 | @functools.lru_cache(maxsize=None) 94 | def _cached_log_stream(filename): 95 | return open(filename, "a") 96 | -------------------------------------------------------------------------------- /util/plot_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Plotting utilities to visualize training logs. 3 | """ 4 | import torch 5 | import pandas as pd 6 | import numpy as np 7 | import seaborn as sns 8 | import matplotlib.pyplot as plt 9 | 10 | from pathlib import Path, PurePath 11 | 12 | 13 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): 14 | ''' 15 | Function to plot specific fields from training log(s). Plots both training and test results. 
16 | 17 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 18 | - fields = which results to plot from each log file - plots both training and test for each field. 19 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 20 | - log_name = optional, name of log file if different than default 'log.txt'. 21 | 22 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 23 | - solid lines are training results, dashed lines are test results. 24 | 25 | ''' 26 | func_name = "plot_utils.py::plot_logs" 27 | 28 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 29 | # convert single Path to list to avoid 'not iterable' error 30 | 31 | if not isinstance(logs, list): 32 | if isinstance(logs, PurePath): 33 | logs = [logs] 34 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 35 | else: 36 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 37 | Expect list[Path] or single Path obj, received {type(logs)}") 38 | 39 | # Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir 40 | for i, dir in enumerate(logs): 41 | if not isinstance(dir, PurePath): 42 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 43 | if not dir.exists(): 44 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 45 | # verify log_name exists 46 | fn = Path(dir / log_name) 47 | if not fn.exists(): 48 | print(f"-> missing {log_name}. Have you gotten to Epoch 1 in training?") 49 | print(f"--> full path of missing log file: {fn}") 50 | return 51 | 52 | # load log file(s) and plot 53 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 54 | 55 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 56 | 57 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 58 | for j, field in enumerate(fields): 59 | if field == 'mAP': 60 | coco_eval = pd.DataFrame( 61 | np.stack(df.test_coco_eval_bbox.dropna().values)[:, 1] 62 | ).ewm(com=ewm_col).mean() 63 | axs[j].plot(coco_eval, c=color) 64 | else: 65 | df.interpolate().ewm(com=ewm_col).mean().plot( 66 | y=[f'train_{field}', f'test_{field}'], 67 | ax=axs[j], 68 | color=[color] * 2, 69 | style=['-', '--'] 70 | ) 71 | for ax, field in zip(axs, fields): 72 | if field == 'mAP': 73 | ax.legend([Path(p).name for p in logs]) 74 | ax.set_title(field) 75 | else: 76 | ax.legend([f'train', f'test']) 77 | ax.set_title(field) 78 | 79 | return fig, axs 80 | 81 | def plot_precision_recall(files, naming_scheme='iter'): 82 | if naming_scheme == 'exp_id': 83 | # name becomes exp_id 84 | names = [f.parts[-3] for f in files] 85 | elif naming_scheme == 'iter': 86 | names = [f.stem for f in files] 87 | else: 88 | raise ValueError(f'not supported {naming_scheme}') 89 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) 90 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): 91 | data = torch.load(f) 92 | # precision is n_iou, n_points, n_cat, n_area, max_det 93 | precision = data['precision'] 94 | recall = data['params'].recThrs 95 | scores = data['scores'] 96 | # take precision for all classes, all areas and 100 detections 97 | precision = precision[0, :, :, 0, -1].mean(1) 98 | scores = scores[0, :, :, 0, -1].mean(1) 99 | prec = precision.mean() 100 | rec = data['recall'][0, :, 0, -1].mean() 101 | 
print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + 102 | f'score={scores.mean():0.3f}, ' + 103 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' 104 | ) 105 | axs[0].plot(recall, precision, c=color) 106 | axs[1].plot(recall, scores, c=color) 107 | 108 | axs[0].set_title('Precision / Recall') 109 | axs[0].legend(names) 110 | axs[1].set_title('Scores / Recall') 111 | axs[1].legend(names) 112 | return fig, axs 113 | -------------------------------------------------------------------------------- /util/slio.py: -------------------------------------------------------------------------------- 1 | # ========================================================== 2 | # Modified from mmcv 3 | # ========================================================== 4 | 5 | import json, pickle, yaml 6 | try: 7 | from yaml import CLoader as Loader, CDumper as Dumper 8 | except ImportError: 9 | from yaml import Loader, Dumper 10 | 11 | from pathlib import Path 12 | from abc import ABCMeta, abstractmethod 13 | 14 | # =========================== 15 | # Register handler 16 | # =========================== 17 | 18 | class BaseFileHandler(metaclass=ABCMeta): 19 | 20 | @abstractmethod 21 | def load_from_fileobj(self, file, **kwargs): 22 | pass 23 | 24 | @abstractmethod 25 | def dump_to_fileobj(self, obj, file, **kwargs): 26 | pass 27 | 28 | @abstractmethod 29 | def dump_to_str(self, obj, **kwargs): 30 | pass 31 | 32 | def load_from_path(self, filepath, mode='r', **kwargs): 33 | with open(filepath, mode) as f: 34 | return self.load_from_fileobj(f, **kwargs) 35 | 36 | def dump_to_path(self, obj, filepath, mode='w', **kwargs): 37 | with open(filepath, mode) as f: 38 | self.dump_to_fileobj(obj, f, **kwargs) 39 | 40 | class JsonHandler(BaseFileHandler): 41 | 42 | def load_from_fileobj(self, file): 43 | return json.load(file) 44 | 45 | def dump_to_fileobj(self, obj, file, **kwargs): 46 | json.dump(obj, file, **kwargs) 47 | 48 | def dump_to_str(self, obj, **kwargs): 49 | return json.dumps(obj, **kwargs) 50 | 51 | class PickleHandler(BaseFileHandler): 52 | 53 | def load_from_fileobj(self, file, **kwargs): 54 | return pickle.load(file, **kwargs) 55 | 56 | def load_from_path(self, filepath, **kwargs): 57 | return super(PickleHandler, self).load_from_path( 58 | filepath, mode='rb', **kwargs) 59 | 60 | def dump_to_str(self, obj, **kwargs): 61 | kwargs.setdefault('protocol', 2) 62 | return pickle.dumps(obj, **kwargs) 63 | 64 | def dump_to_fileobj(self, obj, file, **kwargs): 65 | kwargs.setdefault('protocol', 2) 66 | pickle.dump(obj, file, **kwargs) 67 | 68 | def dump_to_path(self, obj, filepath, **kwargs): 69 | super(PickleHandler, self).dump_to_path( 70 | obj, filepath, mode='wb', **kwargs) 71 | 72 | class YamlHandler(BaseFileHandler): 73 | 74 | def load_from_fileobj(self, file, **kwargs): 75 | kwargs.setdefault('Loader', Loader) 76 | return yaml.load(file, **kwargs) 77 | 78 | def dump_to_fileobj(self, obj, file, **kwargs): 79 | kwargs.setdefault('Dumper', Dumper) 80 | yaml.dump(obj, file, **kwargs) 81 | 82 | def dump_to_str(self, obj, **kwargs): 83 | kwargs.setdefault('Dumper', Dumper) 84 | return yaml.dump(obj, **kwargs) 85 | 86 | file_handlers = { 87 | 'json': JsonHandler(), 88 | 'yaml': YamlHandler(), 89 | 'yml': YamlHandler(), 90 | 'pickle': PickleHandler(), 91 | 'pkl': PickleHandler() 92 | } 93 | 94 | # =========================== 95 | # load and dump 96 | # =========================== 97 | 98 | def is_str(x): 99 | """Whether the input is a string instance.
100 | 101 | Note: This method is deprecated since python 2 is no longer supported. 102 | """ 103 | return isinstance(x, str) 104 | 105 | def slload(file, file_format=None, **kwargs): 106 | """Load data from json/yaml/pickle files. 107 | 108 | This method provides a unified api for loading data from serialized files. 109 | 110 | Args: 111 | file (str or :obj:`Path` or file-like object): Filename or a file-like 112 | object. 113 | file_format (str, optional): If not specified, the file format will be 114 | inferred from the file extension, otherwise use the specified one. 115 | Currently supported formats include "json", "yaml/yml" and 116 | "pickle/pkl". 117 | 118 | Returns: 119 | The content from the file. 120 | """ 121 | if isinstance(file, Path): 122 | file = str(file) 123 | if file_format is None and is_str(file): 124 | file_format = file.split('.')[-1] 125 | if file_format not in file_handlers: 126 | raise TypeError(f'Unsupported format: {file_format}') 127 | 128 | handler = file_handlers[file_format] 129 | if is_str(file): 130 | obj = handler.load_from_path(file, **kwargs) 131 | elif hasattr(file, 'read'): 132 | obj = handler.load_from_fileobj(file, **kwargs) 133 | else: 134 | raise TypeError('"file" must be a filepath str or a file-object') 135 | return obj 136 | 137 | 138 | def sldump(obj, file=None, file_format=None, **kwargs): 139 | """Dump data to json/yaml/pickle strings or files. 140 | 141 | This method provides a unified api for dumping data as strings or to files, 142 | and also supports custom arguments for each file format. 143 | 144 | Args: 145 | obj (any): The python object to be dumped. 146 | file (str or :obj:`Path` or file-like object, optional): If not 147 | specified, then the object is dump to a str, otherwise to a file 148 | specified by the filename or file-like object. 149 | file_format (str, optional): Same as :func:`load`. 150 | 151 | Returns: 152 | bool: True for success, False otherwise. 
153 | """ 154 | if isinstance(file, Path): 155 | file = str(file) 156 | if file_format is None: 157 | if is_str(file): 158 | file_format = file.split('.')[-1] 159 | elif file is None: 160 | raise ValueError( 161 | 'file_format must be specified since file is None') 162 | if file_format not in file_handlers: 163 | raise TypeError(f'Unsupported format: {file_format}') 164 | 165 | handler = file_handlers[file_format] 166 | if file is None: 167 | return handler.dump_to_str(obj, **kwargs) 168 | elif is_str(file): 169 | handler.dump_to_path(obj, file, **kwargs) 170 | elif hasattr(file, 'write'): 171 | handler.dump_to_fileobj(obj, file, **kwargs) 172 | else: 173 | raise TypeError('"file" must be a filename str or a file-object') 174 | -------------------------------------------------------------------------------- /util/static_data_path.py: -------------------------------------------------------------------------------- 1 | coco = dict( 2 | train = dict( 3 | img_folder = '/comp_robot/cv_public_dataset/COCO2017/train2017', 4 | ann_file = '/comp_robot/cv_public_dataset/COCO2017/annotations/instances_train2017.json' 5 | ), 6 | val = dict( 7 | img_folder = '/comp_robot/cv_public_dataset/COCO2017/val2017', 8 | ann_file = '/comp_robot/cv_public_dataset/COCO2017/annotations/instances_val2017.json' 9 | ) 10 | ) -------------------------------------------------------------------------------- /util/time_counter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | class TimeCounter: 5 | def __init__(self) -> None: 6 | pass 7 | 8 | def clear(self): 9 | self.timedict = {} 10 | self.basetime = time.perf_counter() 11 | 12 | def timeit(self, name): 13 | nowtime = time.perf_counter() - self.basetime 14 | self.timedict[name] = nowtime 15 | self.basetime = time.perf_counter() 16 | 17 | 18 | class TimeHolder: 19 | def __init__(self) -> None: 20 | self.timedict = {} 21 | 22 | def update(self, _timedict:dict): 23 | for k,v in _timedict.items(): 24 | if k not in self.timedict: 25 | self.timedict[k] = AverageMeter(name=k, val_only=True) 26 | self.timedict[k].update(val=v) 27 | 28 | def final_res(self): 29 | return {k:v.avg for k,v in self.timedict.items()} 30 | 31 | def __str__(self): 32 | return json.dumps(self.final_res(), indent=2) 33 | 34 | 35 | class AverageMeter(object): 36 | """Computes and stores the average and current value""" 37 | def __init__(self, name, fmt=':f', val_only=False): 38 | self.name = name 39 | self.fmt = fmt 40 | self.val_only = val_only 41 | self.reset() 42 | 43 | def reset(self): 44 | self.val = 0 45 | self.avg = 0 46 | self.sum = 0 47 | self.count = 0 48 | 49 | def update(self, val, n=1): 50 | self.val = val 51 | self.sum += val * n 52 | self.count += n 53 | self.avg = self.sum / self.count 54 | 55 | def __str__(self): 56 | if self.val_only: 57 | fmtstr = '{name} {val' + self.fmt + '}' 58 | else: 59 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 60 | return fmtstr.format(**self.__dict__) -------------------------------------------------------------------------------- /util/vis_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | from util.utils import renorm 5 | from util.misc import color_sys 6 | 7 | _color_getter = color_sys(100) 8 | 9 | # plot known and unknown box 10 | def add_box_to_img(img, boxes, colorlist, brands=None): 11 | """[summary] 12 | 13 | Args: 14 | img ([type]): np.array, H,W,3 15 | boxes ([type]): 
list of list(4) 16 | colorlist: list of colors. 17 | brands: text. 18 | 19 | Return: 20 | img: np.array. H,W,3. 21 | """ 22 | H, W = img.shape[:2] 23 | for _i, (box, color) in enumerate(zip(boxes, colorlist)): 24 | x, y, w, h = box[0] * W, box[1] * H, box[2] * W, box[3] * H 25 | img = cv2.rectangle(img.copy(), (int(x-w/2), int(y-h/2)), (int(x+w/2), int(y+h/2)), color, 2) 26 | if brands is not None: 27 | brand = brands[_i] 28 | org = (int(x-w/2), int(y+h/2)) 29 | font = cv2.FONT_HERSHEY_SIMPLEX 30 | fontScale = 0.5 31 | thickness = 1 32 | img = cv2.putText(img.copy(), str(brand), org, font, 33 | fontScale, color, thickness, cv2.LINE_AA) 34 | return img 35 | 36 | def plot_dual_img(img, boxes, labels, idxs, probs=None): 37 | """[summary] 38 | 39 | Args: 40 | img ([type]): 3,H,W. tensor. 41 | boxes (): tensor(Kx4) or list of tensor(1x4). 42 | labels ([type]): list of ints. 43 | idxs ([type]): list of ints. 44 | probs (optional): listof floats. 45 | 46 | Returns: 47 | img_classcolor: np.array. H,W,3. img with class-wise label. 48 | img_seqcolor: np.array. H,W,3. img with seq-wise label. 49 | """ 50 | 51 | boxes = [i.cpu().tolist() for i in boxes] 52 | img = (renorm(img.cpu()).permute(1,2,0).numpy() * 255).astype(np.uint8) 53 | # plot with class 54 | class_colors = [_color_getter(i) for i in labels] 55 | if probs is not None: 56 | brands = ["{},{:.2f}".format(j,k) for j,k in zip(labels, probs)] 57 | else: 58 | brands = labels 59 | img_classcolor = add_box_to_img(img, boxes, class_colors, brands=brands) 60 | # plot with seq 61 | seq_colors = [_color_getter((i * 11) % 100) for i in idxs] 62 | img_seqcolor = add_box_to_img(img, boxes, seq_colors, brands=idxs) 63 | return img_classcolor, img_seqcolor 64 | 65 | 66 | def plot_raw_img(img, boxes, labels): 67 | """[summary] 68 | 69 | Args: 70 | img ([type]): 3,H,W. tensor. 71 | boxes ([type]): Kx4. tensor 72 | labels ([type]): K. tensor. 73 | 74 | return: 75 | img: np.array. H,W,3. img with bbox annos. 
76 | 77 | """ 78 | img = (renorm(img.cpu()).permute(1,2,0).numpy() * 255).astype(np.uint8) 79 | H, W = img.shape[:2] 80 | for box, label in zip(boxes.tolist(), labels.tolist()): 81 | x, y, w, h = box[0] * W, box[1] * H, box[2] * W, box[3] * H 82 | 83 | img = cv2.rectangle(img.copy(), (int(x-w/2), int(y-h/2)), (int(x+w/2), int(y+h/2)), _color_getter(label), 2) 84 | # add text 85 | org = (int(x-w/2), int(y+h/2)) 86 | font = cv2.FONT_HERSHEY_SIMPLEX 87 | fontScale = 1 88 | thickness = 1 89 | img = cv2.putText(img.copy(), str(label), org, font, 90 | fontScale, _color_getter(label), thickness, cv2.LINE_AA) 91 | 92 | return img -------------------------------------------------------------------------------- /util/visualizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @File : visualizer.py 4 | @Time : 2022/04/05 11:39:33 5 | @Author : Shilong Liu 6 | @Contact : liusl20@mail.tsinghua.edu.cn; slongliu86@gmail.com 7 | Modified from COCO evaluator 8 | ''' 9 | 10 | import os, sys 11 | from textwrap import wrap 12 | import torch 13 | import numpy as np 14 | import cv2 15 | import datetime 16 | 17 | import matplotlib.pyplot as plt 18 | from matplotlib.collections import PatchCollection 19 | from matplotlib.patches import Polygon 20 | from pycocotools import mask as maskUtils 21 | from matplotlib import transforms 22 | 23 | def renorm(img: torch.FloatTensor, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) \ 24 | -> torch.FloatTensor: 25 | # img: tensor(3,H,W) or tensor(B,3,H,W) 26 | # return: same as img 27 | assert img.dim() == 3 or img.dim() == 4, "img.dim() should be 3 or 4 but %d" % img.dim() 28 | if img.dim() == 3: 29 | assert img.size(0) == 3, 'img.size(0) shoule be 3 but "%d". (%s)' % (img.size(0), str(img.size())) 30 | img_perm = img.permute(1,2,0) 31 | mean = torch.Tensor(mean) 32 | std = torch.Tensor(std) 33 | img_res = img_perm * std + mean 34 | return img_res.permute(2,0,1) 35 | else: # img.dim() == 4 36 | assert img.size(1) == 3, 'img.size(1) shoule be 3 but "%d". (%s)' % (img.size(1), str(img.size())) 37 | img_perm = img.permute(0,2,3,1) 38 | mean = torch.Tensor(mean) 39 | std = torch.Tensor(std) 40 | img_res = img_perm * std + mean 41 | return img_res.permute(0,3,1,2) 42 | 43 | class ColorMap(): 44 | def __init__(self, basergb=[255,255,0]): 45 | self.basergb = np.array(basergb) 46 | def __call__(self, attnmap): 47 | # attnmap: h, w. np.uint8. 48 | # return: h, w, 4. np.uint8. 49 | assert attnmap.dtype == np.uint8 50 | h, w = attnmap.shape 51 | res = self.basergb.copy() 52 | res = res[None][None].repeat(h, 0).repeat(w, 1) # h, w, 3 53 | attn1 = attnmap.copy()[..., None] # h, w, 1 54 | res = np.concatenate((res, attn1), axis=-1).astype(np.uint8) 55 | return res 56 | 57 | 58 | class COCOVisualizer(): 59 | def __init__(self) -> None: 60 | pass 61 | 62 | def visualize(self, img, tgt, caption=None, dpi=120, savedir=None, show_in_console=True): 63 | """ 64 | img: tensor(3, H, W) 65 | tgt: make sure they are all on cpu. 
66 | must have items: 'image_id', 'boxes', 'size' 67 | """ 68 | plt.figure(dpi=dpi) 69 | plt.rcParams['font.size'] = '5' 70 | ax = plt.gca() 71 | img = renorm(img).permute(1, 2, 0) 72 | ax.imshow(img) 73 | 74 | self.addtgt(tgt) 75 | if show_in_console: 76 | plt.show() 77 | 78 | if savedir is not None: 79 | if caption is None: 80 | savename = '{}/{}-{}.png'.format(savedir, int(tgt['image_id']), str(datetime.datetime.now()).replace(' ', '-')) 81 | else: 82 | savename = '{}/{}-{}-{}.png'.format(savedir, caption, int(tgt['image_id']), str(datetime.datetime.now()).replace(' ', '-')) 83 | print("savename: {}".format(savename)) 84 | os.makedirs(os.path.dirname(savename), exist_ok=True) 85 | plt.savefig(savename) 86 | plt.close() 87 | 88 | def addtgt(self, tgt): 89 | """ 90 | - tgt: dict. args: 91 | - boxes: num_boxes, 4. xywh, [0,1]. 92 | - box_label: num_boxes. 93 | """ 94 | assert 'boxes' in tgt 95 | ax = plt.gca() 96 | H, W = tgt['size'].tolist() 97 | numbox = tgt['boxes'].shape[0] 98 | 99 | color = [] 100 | polygons = [] 101 | boxes = [] 102 | for box in tgt['boxes'].cpu(): 103 | unnormbbox = box * torch.Tensor([W, H, W, H]) 104 | unnormbbox[:2] -= unnormbbox[2:] / 2 105 | [bbox_x, bbox_y, bbox_w, bbox_h] = unnormbbox.tolist() 106 | boxes.append([bbox_x, bbox_y, bbox_w, bbox_h]) 107 | poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]] 108 | np_poly = np.array(poly).reshape((4,2)) 109 | polygons.append(Polygon(np_poly)) 110 | c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] 111 | color.append(c) 112 | 113 | p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.1) 114 | ax.add_collection(p) 115 | p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) 116 | ax.add_collection(p) 117 | 118 | 119 | if 'box_label' in tgt: 120 | assert len(tgt['box_label']) == numbox, f"{len(tgt['box_label'])} = {numbox}, " 121 | for idx, bl in enumerate(tgt['box_label']): 122 | _string = str(bl) 123 | bbox_x, bbox_y, bbox_w, bbox_h = boxes[idx] 124 | # ax.text(bbox_x, bbox_y, _string, color='black', bbox={'facecolor': 'yellow', 'alpha': 1.0, 'pad': 1}) 125 | ax.text(bbox_x, bbox_y, _string, color='black', bbox={'facecolor': color[idx], 'alpha': 0.6, 'pad': 1}) 126 | 127 | if 'caption' in tgt: 128 | ax.set_title(tgt['caption'], wrap=True) 129 | 130 | 131 | --------------------------------------------------------------------------------
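A minimal usage sketch for the COCOVisualizer above, closing out the utilities. The image tensor, boxes, and labels below are made-up placeholders; the sketch assumes an ImageNet-normalized tensor(3, H, W) and boxes in normalized cxcywh format, which is what renorm and addtgt expect.

```python
import torch

from util.visualizer import COCOVisualizer

# Dummy, ImageNet-normalized image and two hypothetical boxes in normalized cxcywh format.
img = torch.rand(3, 480, 640)
tgt = {
    'image_id': 42,                                    # only used to build the save name
    'boxes': torch.tensor([[0.50, 0.50, 0.30, 0.40],
                           [0.25, 0.30, 0.10, 0.20]]),
    'size': torch.tensor([480, 640]),                  # (H, W), used to unnormalize the boxes
    'box_label': ['dog', 'person'],                    # optional text drawn next to each box
}

vis = COCOVisualizer()
# With show_in_console=False the figure is only written to vis/ as a PNG.
vis.visualize(img, tgt, caption='demo', savedir='vis', show_in_console=False)
```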