├── .gitignore
├── LICENSE
├── README.md
├── config
    └── DINO
    │   ├── DINO_4scale.py
    │   ├── DINO_4scale_convnext.py
    │   ├── DINO_4scale_swin.py
    │   ├── DINO_5scale.py
    │   └── coco_transformer.py
├── datasets
    ├── __init__.py
    ├── coco.py
    ├── coco_eval.py
    ├── coco_panoptic.py
    ├── data_util.py
    ├── dataset.py
    ├── panoptic_eval.py
    ├── random_crop.py
    ├── sltransform.py
    └── transforms.py
├── engine.py
├── figs
    ├── 12ep.png
    ├── 50ep.png
    ├── curve.png
    ├── dinosaur.png
    ├── framework.png
    ├── idea.jpg
    ├── sota.jpg
    └── sota_table.png
├── inference_and_visualization.ipynb
├── main.py
├── models
    ├── __init__.py
    ├── dino
    │   ├── __init__.py
    │   ├── attention.py
    │   ├── backbone.py
    │   ├── convnext.py
    │   ├── deformable_transformer.py
    │   ├── dino.py
    │   ├── dn_components.py
    │   ├── matcher.py
    │   ├── ops
    │   │   ├── functions
    │   │   │   ├── __init__.py
    │   │   │   └── ms_deform_attn_func.py
    │   │   ├── make.sh
    │   │   ├── modules
    │   │   │   ├── __init__.py
    │   │   │   └── ms_deform_attn.py
    │   │   ├── setup.py
    │   │   ├── src
    │   │   │   ├── cpu
    │   │   │   │   ├── ms_deform_attn_cpu.cpp
    │   │   │   │   └── ms_deform_attn_cpu.h
    │   │   │   ├── cuda
    │   │   │   │   ├── ms_deform_attn_cuda.cu
    │   │   │   │   ├── ms_deform_attn_cuda.h
    │   │   │   │   └── ms_deform_im2col_cuda.cuh
    │   │   │   ├── ms_deform_attn.h
    │   │   │   └── vision.cpp
    │   │   └── test.py
    │   ├── position_encoding.py
    │   ├── segmentation.py
    │   ├── swin_transformer.py
    │   ├── transformer_deformable.py
    │   └── utils.py
    └── registry.py
├── requirements.txt
├── run_with_submitit.py
├── scripts
    ├── DINO_eval.sh
    ├── DINO_eval_dist.sh
    ├── DINO_eval_submitit.sh
    ├── DINO_eval_submitit_5scale.sh
    ├── DINO_train.sh
    ├── DINO_train_convnext.sh
    ├── DINO_train_dist.sh
    ├── DINO_train_submitit.sh
    ├── DINO_train_submitit_5scale.sh
    ├── DINO_train_submitit_convnext.sh
    ├── DINO_train_submitit_swin.sh
    └── DINO_train_swin.sh
├── tools
    ├── README.md
    └── benchmark.py
└── util
    ├── __init__.py
    ├── box_loss.py
    ├── box_ops.py
    ├── coco_id2name.json
    ├── get_param_dicts.py
    ├── logger.py
    ├── misc.py
    ├── plot_utils.py
    ├── slconfig.py
    ├── slio.py
    ├── static_data_path.py
    ├── time_counter.py
    ├── utils.py
    ├── vis_utils.py
    └── visualizer.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | .nfs*
 2 | *.ipynb
 3 | *.pyc
 4 | .dumbo.json
 5 | .DS_Store
 6 | .*.swp
 7 | *.pth
 8 | **/__pycache__/**
 9 | .ipynb_checkpoints/
10 | datasets/data/
11 | experiment-*
12 | *.tmp
13 | *.pkl
14 | **/.mypy_cache/*
15 | .mypy_cache/*
16 | not_tracked_dir/
17 | .vscode
18 | logs
19 | jobs
20 | subs
21 | tmp
22 | *.sub
23 | vis/
24 | model_zoo/
25 | model_zoo_old/
26 | scripts/
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 | Copyright 2022 IDEA
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 |       http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
203 | DAB-DETR(https://github.com/IDEA-Research/DAB-DETR)
204 | 
205 | Copyright 2022 IDEA
206 | 
207 | Licensed under the Apache License, Version 2.0 (the "License");
208 | you may not use this file except in compliance with the License.
209 | You may obtain a copy of the License at
210 | 
211 |       http://www.apache.org/licenses/LICENSE-2.0
212 | 
213 | Unless required by applicable law or agreed to in writing, software
214 | distributed under the License is distributed on an "AS IS" BASIS,
215 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
216 | See the License for the specific language governing permissions and
217 | limitations under the License.
218 | 
219 | Conditional DETR(https://github.com/Atten4Vis/ConditionalDETR)
220 | 
221 | Copyright 2021 Microsoft.
222 | 
223 | Licensed under the Apache License, Version 2.0 (the "License");
224 | you may not use this file except in compliance with the License.
225 | You may obtain a copy of the License at
226 | 
227 |       http://www.apache.org/licenses/LICENSE-2.0
228 | 
229 | Unless required by applicable law or agreed to in writing, software
230 | distributed under the License is distributed on an "AS IS" BASIS,
231 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
232 | See the License for the specific language governing permissions and
233 | limitations under the License.
234 | 
235 | 
236 | Deformable DETR(https://github.com/fundamentalvision/Deformable-DETR)
237 | 
238 | Copyright 2020 SenseTime
239 | 
240 | Licensed under the Apache License, Version 2.0 (the "License");
241 | you may not use this file except in compliance with the License.
242 | You may obtain a copy of the License at
243 | 
244 |       http://www.apache.org/licenses/LICENSE-2.0
245 | 
246 | Unless required by applicable law or agreed to in writing, software
247 | distributed under the License is distributed on an "AS IS" BASIS,
248 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
249 | See the License for the specific language governing permissions and
250 | limitations under the License.
251 | 
252 | 
253 | DETR(https://github.com/facebookresearch/detr)
254 | 
255 | Copyright 2020 - present, Facebook, Inc
256 | 
257 | Licensed under the Apache License, Version 2.0 (the "License");
258 | you may not use this file except in compliance with the License.
259 | You may obtain a copy of the License at
260 | 
261 |    http://www.apache.org/licenses/LICENSE-2.0
262 | 
263 | Unless required by applicable law or agreed to in writing, software
264 | distributed under the License is distributed on an "AS IS" BASIS,
265 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
266 | See the License for the specific language governing permissions and
267 | limitations under the License.
268 | 


--------------------------------------------------------------------------------
/config/DINO/DINO_4scale.py:
--------------------------------------------------------------------------------
  1 | _base_ = ['coco_transformer.py']
  2 | 
  3 | num_classes=91
  4 | 
  5 | lr = 0.0001
  6 | param_dict_type = 'default'
  7 | lr_backbone = 1e-05
  8 | lr_backbone_names = ['backbone.0']
  9 | lr_linear_proj_names = ['reference_points', 'sampling_offsets']
 10 | lr_linear_proj_mult = 0.1
 11 | ddetr_lr_param = False
 12 | batch_size = 2
 13 | weight_decay = 0.0001
 14 | epochs = 12
 15 | lr_drop = 11
 16 | save_checkpoint_interval = 1
 17 | clip_max_norm = 0.1
 18 | onecyclelr = False
 19 | multi_step_lr = False
 20 | lr_drop_list = [33, 45]
 21 | 
 22 | 
 23 | modelname = 'dino'
 24 | frozen_weights = None
 25 | backbone = 'resnet50'
 26 | use_checkpoint = False
 27 | 
 28 | dilation = False
 29 | position_embedding = 'sine'
 30 | pe_temperatureH = 20
 31 | pe_temperatureW = 20
 32 | return_interm_indices = [1, 2, 3]
 33 | backbone_freeze_keywords = None
 34 | enc_layers = 6
 35 | dec_layers = 6
 36 | unic_layers = 0
 37 | pre_norm = False
 38 | dim_feedforward = 2048
 39 | hidden_dim = 256
 40 | dropout = 0.0
 41 | nheads = 8
 42 | num_queries = 900
 43 | query_dim = 4
 44 | num_patterns = 0
 45 | pdetr3_bbox_embed_diff_each_layer = False
 46 | pdetr3_refHW = -1
 47 | random_refpoints_xy = False
 48 | fix_refpoints_hw = -1
 49 | dabdetr_yolo_like_anchor_update = False
 50 | dabdetr_deformable_encoder = False
 51 | dabdetr_deformable_decoder = False
 52 | use_deformable_box_attn = False
 53 | box_attn_type = 'roi_align'
 54 | dec_layer_number = None
 55 | num_feature_levels = 4
 56 | enc_n_points = 4
 57 | dec_n_points = 4
 58 | decoder_layer_noise = False
 59 | dln_xy_noise = 0.2
 60 | dln_hw_noise = 0.2
 61 | add_channel_attention = False
 62 | add_pos_value = False
 63 | two_stage_type = 'standard'
 64 | two_stage_pat_embed = 0
 65 | two_stage_add_query_num = 0
 66 | two_stage_bbox_embed_share = False
 67 | two_stage_class_embed_share = False
 68 | two_stage_learn_wh = False
 69 | two_stage_default_hw = 0.05
 70 | two_stage_keep_all_tokens = False
 71 | num_select = 300
 72 | transformer_activation = 'relu'
 73 | batch_norm_type = 'FrozenBatchNorm2d'
 74 | masks = False
 75 | aux_loss = True
 76 | set_cost_class = 2.0
 77 | set_cost_bbox = 5.0
 78 | set_cost_giou = 2.0
 79 | cls_loss_coef = 1.0
 80 | mask_loss_coef = 1.0
 81 | dice_loss_coef = 1.0
 82 | bbox_loss_coef = 5.0
 83 | giou_loss_coef = 2.0
 84 | enc_loss_coef = 1.0
 85 | interm_loss_coef = 1.0
 86 | no_interm_box_loss = False
 87 | focal_alpha = 0.25
 88 | 
 89 | decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content']
 90 | matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher
 91 | decoder_module_seq = ['sa', 'ca', 'ffn']
 92 | nms_iou_threshold = -1
 93 | 
 94 | dec_pred_bbox_embed_share = True
 95 | dec_pred_class_embed_share = True
 96 | 
 97 | # for dn
 98 | use_dn = True
 99 | dn_number = 100
100 | dn_box_noise_scale = 0.4
101 | dn_label_noise_ratio = 0.5
102 | embed_init_tgt = True
103 | dn_labelbook_size = 91
104 | 
105 | match_unstable_error = True
106 | 
107 | # for ema
108 | use_ema = False
109 | ema_decay = 0.9997
110 | ema_epoch = 0
111 | 
112 | use_detached_boxes_dec_out = False
113 | 
114 | 


--------------------------------------------------------------------------------
/config/DINO/DINO_4scale_convnext.py:
--------------------------------------------------------------------------------
  1 | _base_ = ['coco_transformer.py']
  2 | 
  3 | num_classes=91
  4 | 
  5 | lr = 0.0001
  6 | param_dict_type = 'default'
  7 | lr_backbone = 1e-05
  8 | lr_backbone_names = ['backbone.0']
  9 | lr_linear_proj_names = ['reference_points', 'sampling_offsets']
 10 | lr_linear_proj_mult = 0.1
 11 | ddetr_lr_param = False
 12 | batch_size = 2
 13 | weight_decay = 0.0001
 14 | epochs = 12
 15 | lr_drop = 11
 16 | save_checkpoint_interval = 1
 17 | clip_max_norm = 0.1
 18 | onecyclelr = False
 19 | multi_step_lr = False
 20 | lr_drop_list = [33, 45]
 21 | 
 22 | 
 23 | modelname = 'dino'
 24 | frozen_weights = None
 25 | backbone = 'convnext_xlarge_22k'
 26 | use_checkpoint = False
 27 | 
 28 | dilation = False
 29 | position_embedding = 'sine'
 30 | pe_temperatureH = 20
 31 | pe_temperatureW = 20
 32 | return_interm_indices = [1, 2, 3]
 33 | backbone_freeze_keywords = None
 34 | enc_layers = 6
 35 | dec_layers = 6
 36 | unic_layers = 0
 37 | pre_norm = False
 38 | dim_feedforward = 2048
 39 | hidden_dim = 256
 40 | dropout = 0.0
 41 | nheads = 8
 42 | num_queries = 900
 43 | query_dim = 4
 44 | num_patterns = 0
 45 | pdetr3_bbox_embed_diff_each_layer = False
 46 | pdetr3_refHW = -1
 47 | random_refpoints_xy = False
 48 | fix_refpoints_hw = -1
 49 | dabdetr_yolo_like_anchor_update = False
 50 | dabdetr_deformable_encoder = False
 51 | dabdetr_deformable_decoder = False
 52 | use_deformable_box_attn = False
 53 | box_attn_type = 'roi_align'
 54 | dec_layer_number = None
 55 | num_feature_levels = 4
 56 | enc_n_points = 4
 57 | dec_n_points = 4
 58 | decoder_layer_noise = False
 59 | dln_xy_noise = 0.2
 60 | dln_hw_noise = 0.2
 61 | add_channel_attention = False
 62 | add_pos_value = False
 63 | two_stage_type = 'standard'
 64 | two_stage_pat_embed = 0
 65 | two_stage_add_query_num = 0
 66 | two_stage_bbox_embed_share = False
 67 | two_stage_class_embed_share = False
 68 | two_stage_learn_wh = False
 69 | two_stage_default_hw = 0.05
 70 | two_stage_keep_all_tokens = False
 71 | num_select = 300
 72 | transformer_activation = 'relu'
 73 | batch_norm_type = 'FrozenBatchNorm2d'
 74 | masks = False
 75 | aux_loss = True
 76 | set_cost_class = 2.0
 77 | set_cost_bbox = 5.0
 78 | set_cost_giou = 2.0
 79 | cls_loss_coef = 1.0
 80 | mask_loss_coef = 1.0
 81 | dice_loss_coef = 1.0
 82 | bbox_loss_coef = 5.0
 83 | giou_loss_coef = 2.0
 84 | enc_loss_coef = 1.0
 85 | interm_loss_coef = 1.0
 86 | no_interm_box_loss = False
 87 | focal_alpha = 0.25
 88 | 
 89 | decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content']
 90 | matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher
 91 | decoder_module_seq = ['sa', 'ca', 'ffn']
 92 | nms_iou_threshold = -1
 93 | 
 94 | dec_pred_bbox_embed_share = True
 95 | dec_pred_class_embed_share = True
 96 | 
 97 | # for dn
 98 | use_dn = True
 99 | dn_number = 100
100 | dn_box_noise_scale = 0.4
101 | dn_label_noise_ratio = 0.5
102 | embed_init_tgt = True
103 | dn_labelbook_size = 91
104 | 
105 | match_unstable_error = True
106 | 
107 | # for ema
108 | use_ema = False
109 | ema_decay = 0.9997
110 | ema_epoch = 0
111 | 
112 | use_detached_boxes_dec_out = False
113 | 
114 | 


--------------------------------------------------------------------------------
/config/DINO/DINO_4scale_swin.py:
--------------------------------------------------------------------------------
  1 | _base_ = ['coco_transformer.py']
  2 | 
  3 | num_classes=91
  4 | 
  5 | lr = 0.0001
  6 | param_dict_type = 'default'
  7 | lr_backbone = 1e-05
  8 | lr_backbone_names = ['backbone.0']
  9 | lr_linear_proj_names = ['reference_points', 'sampling_offsets']
 10 | lr_linear_proj_mult = 0.1
 11 | ddetr_lr_param = False
 12 | batch_size = 2
 13 | weight_decay = 0.0001
 14 | epochs = 12
 15 | lr_drop = 11
 16 | save_checkpoint_interval = 1
 17 | clip_max_norm = 0.1
 18 | onecyclelr = False
 19 | multi_step_lr = False
 20 | lr_drop_list = [33, 45]
 21 | 
 22 | 
 23 | modelname = 'dino'
 24 | frozen_weights = None
 25 | backbone = 'swin_L_384_22k'
 26 | use_checkpoint = True
 27 | 
 28 | dilation = False
 29 | position_embedding = 'sine'
 30 | pe_temperatureH = 20
 31 | pe_temperatureW = 20
 32 | return_interm_indices = [1, 2, 3]
 33 | backbone_freeze_keywords = None
 34 | enc_layers = 6
 35 | dec_layers = 6
 36 | unic_layers = 0
 37 | pre_norm = False
 38 | dim_feedforward = 2048
 39 | hidden_dim = 256
 40 | dropout = 0.0
 41 | nheads = 8
 42 | num_queries = 900
 43 | query_dim = 4
 44 | num_patterns = 0
 45 | pdetr3_bbox_embed_diff_each_layer = False
 46 | pdetr3_refHW = -1
 47 | random_refpoints_xy = False
 48 | fix_refpoints_hw = -1
 49 | dabdetr_yolo_like_anchor_update = False
 50 | dabdetr_deformable_encoder = False
 51 | dabdetr_deformable_decoder = False
 52 | use_deformable_box_attn = False
 53 | box_attn_type = 'roi_align'
 54 | dec_layer_number = None
 55 | num_feature_levels = 4
 56 | enc_n_points = 4
 57 | dec_n_points = 4
 58 | decoder_layer_noise = False
 59 | dln_xy_noise = 0.2
 60 | dln_hw_noise = 0.2
 61 | add_channel_attention = False
 62 | add_pos_value = False
 63 | two_stage_type = 'standard'
 64 | two_stage_pat_embed = 0
 65 | two_stage_add_query_num = 0
 66 | two_stage_bbox_embed_share = False
 67 | two_stage_class_embed_share = False
 68 | two_stage_learn_wh = False
 69 | two_stage_default_hw = 0.05
 70 | two_stage_keep_all_tokens = False
 71 | num_select = 300
 72 | transformer_activation = 'relu'
 73 | batch_norm_type = 'FrozenBatchNorm2d'
 74 | masks = False
 75 | aux_loss = True
 76 | set_cost_class = 2.0
 77 | set_cost_bbox = 5.0
 78 | set_cost_giou = 2.0
 79 | cls_loss_coef = 1.0
 80 | mask_loss_coef = 1.0
 81 | dice_loss_coef = 1.0
 82 | bbox_loss_coef = 5.0
 83 | giou_loss_coef = 2.0
 84 | enc_loss_coef = 1.0
 85 | interm_loss_coef = 1.0
 86 | no_interm_box_loss = False
 87 | focal_alpha = 0.25
 88 | 
 89 | decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content']
 90 | matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher
 91 | decoder_module_seq = ['sa', 'ca', 'ffn']
 92 | nms_iou_threshold = -1
 93 | 
 94 | dec_pred_bbox_embed_share = True
 95 | dec_pred_class_embed_share = True
 96 | 
 97 | # for dn
 98 | use_dn = True
 99 | dn_number = 100
100 | dn_box_noise_scale = 0.4
101 | dn_label_noise_ratio = 0.5
102 | embed_init_tgt = True
103 | dn_labelbook_size = 91
104 | 
105 | match_unstable_error = True
106 | 
107 | # for ema
108 | use_ema = False
109 | ema_decay = 0.9997
110 | ema_epoch = 0
111 | 
112 | use_detached_boxes_dec_out = False
113 | 
114 | 


--------------------------------------------------------------------------------
/config/DINO/DINO_5scale.py:
--------------------------------------------------------------------------------
  1 | _base_ = ['coco_transformer.py']
  2 | 
  3 | num_classes=91
  4 | 
  5 | lr = 0.0001
  6 | param_dict_type = 'default'
  7 | lr_backbone = 1e-05
  8 | lr_backbone_names = ['backbone.0']
  9 | lr_linear_proj_names = ['reference_points', 'sampling_offsets']
 10 | lr_linear_proj_mult = 0.1
 11 | ddetr_lr_param = False
 12 | batch_size = 1
 13 | weight_decay = 0.0001
 14 | epochs = 12
 15 | lr_drop = 11
 16 | save_checkpoint_interval = 1
 17 | clip_max_norm = 0.1
 18 | onecyclelr = False
 19 | multi_step_lr = False
 20 | lr_drop_list = [33, 45]
 21 | 
 22 | 
 23 | modelname = 'dino'
 24 | frozen_weights = None
 25 | backbone = 'resnet50'
 26 | use_checkpoint = False
 27 | 
 28 | dilation = False
 29 | position_embedding = 'sine'
 30 | pe_temperatureH = 20
 31 | pe_temperatureW = 20
 32 | return_interm_indices = [0, 1, 2, 3]
 33 | backbone_freeze_keywords = None
 34 | enc_layers = 6
 35 | dec_layers = 6
 36 | unic_layers = 0
 37 | pre_norm = False
 38 | dim_feedforward = 2048
 39 | hidden_dim = 256
 40 | dropout = 0.0
 41 | nheads = 8
 42 | num_queries = 900
 43 | query_dim = 4
 44 | num_patterns = 0
 45 | pdetr3_bbox_embed_diff_each_layer = False
 46 | pdetr3_refHW = -1
 47 | random_refpoints_xy = False
 48 | fix_refpoints_hw = -1
 49 | dabdetr_yolo_like_anchor_update = False
 50 | dabdetr_deformable_encoder = False
 51 | dabdetr_deformable_decoder = False
 52 | use_deformable_box_attn = False
 53 | box_attn_type = 'roi_align'
 54 | dec_layer_number = None
 55 | num_feature_levels = 5
 56 | enc_n_points = 4
 57 | dec_n_points = 4
 58 | decoder_layer_noise = False
 59 | dln_xy_noise = 0.2
 60 | dln_hw_noise = 0.2
 61 | add_channel_attention = False
 62 | add_pos_value = False
 63 | two_stage_type = 'standard'
 64 | two_stage_pat_embed = 0
 65 | two_stage_add_query_num = 0
 66 | two_stage_bbox_embed_share = False
 67 | two_stage_class_embed_share = False
 68 | two_stage_learn_wh = False
 69 | two_stage_default_hw = 0.05
 70 | two_stage_keep_all_tokens = False
 71 | num_select = 300
 72 | transformer_activation = 'relu'
 73 | batch_norm_type = 'FrozenBatchNorm2d'
 74 | masks = False
 75 | aux_loss = True
 76 | set_cost_class = 2.0
 77 | set_cost_bbox = 5.0
 78 | set_cost_giou = 2.0
 79 | cls_loss_coef = 1.0
 80 | mask_loss_coef = 1.0
 81 | dice_loss_coef = 1.0
 82 | bbox_loss_coef = 5.0
 83 | giou_loss_coef = 2.0
 84 | enc_loss_coef = 1.0
 85 | interm_loss_coef = 1.0
 86 | no_interm_box_loss = False
 87 | focal_alpha = 0.25
 88 | 
 89 | decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content']
 90 | matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher
 91 | decoder_module_seq = ['sa', 'ca', 'ffn']
 92 | nms_iou_threshold = -1
 93 | 
 94 | dec_pred_bbox_embed_share = True
 95 | dec_pred_class_embed_share = True
 96 | 
 97 | # for dn
 98 | use_dn = True
 99 | dn_number = 100
100 | dn_box_noise_scale = 0.4
101 | dn_label_noise_ratio = 0.5
102 | embed_init_tgt = True
103 | dn_labelbook_size = 91
104 | 
105 | match_unstable_error = True
106 | 
107 | # for ema
108 | use_ema = False
109 | ema_decay = 0.9997
110 | ema_epoch = 0
111 | 
112 | use_detached_boxes_dec_out = False
113 | 
114 | 


--------------------------------------------------------------------------------
/config/DINO/coco_transformer.py:
--------------------------------------------------------------------------------
1 | data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
2 | data_aug_max_size = 1333
3 | data_aug_scales2_resize = [400, 500, 600]
4 | data_aug_scales2_crop = [384, 600]
5 | 
6 | 
7 | data_aug_scale_overlap = None
8 | 
9 | 


--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 2 | import torch.utils.data
 3 | import torchvision
 4 | 
 5 | from .coco import build as build_coco
 6 | 
 7 | 
 8 | def get_coco_api_from_dataset(dataset):
 9 |     for _ in range(10):
10 |         # if isinstance(dataset, torchvision.datasets.CocoDetection):
11 |         #     break
12 |         if isinstance(dataset, torch.utils.data.Subset):
13 |             dataset = dataset.dataset
14 |     if isinstance(dataset, torchvision.datasets.CocoDetection):
15 |         return dataset.coco
16 | 
17 | 
18 | def build_dataset(image_set, args):
19 |     if args.dataset_file == 'coco':
20 |         return build_coco(image_set, args)
21 |     if args.dataset_file == 'coco_panoptic':
22 |         # to avoid making panopticapi required for coco
23 |         from .coco_panoptic import build as build_coco_panoptic
24 |         return build_coco_panoptic(image_set, args)
25 |     if args.dataset_file == 'o365':
26 |         from .o365 import build_o365_combine
27 |         return build_o365_combine(image_set, args)
28 |     if args.dataset_file == 'vanke':
29 |         from .vanke import build_vanke
30 |         return build_vanke(image_set, args)
31 |     raise ValueError(f'dataset {args.dataset_file} not supported')
32 | 


--------------------------------------------------------------------------------
/datasets/coco_eval.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | """
  3 | COCO evaluator that works in distributed mode.
  4 | 
  5 | Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
  6 | The difference is that there is less copy-pasting from pycocotools
  7 | in the end of the file, as python3 can suppress prints with contextlib
  8 | """
  9 | import os
 10 | import contextlib
 11 | import copy
 12 | import numpy as np
 13 | import torch
 14 | 
 15 | from pycocotools.cocoeval import COCOeval
 16 | from pycocotools.coco import COCO
 17 | import pycocotools.mask as mask_util
 18 | 
 19 | from util.misc import all_gather
 20 | 
 21 | 
 22 | class CocoEvaluator(object):
 23 |     def __init__(self, coco_gt, iou_types, useCats=True):
 24 |         assert isinstance(iou_types, (list, tuple))
 25 |         coco_gt = copy.deepcopy(coco_gt)
 26 |         self.coco_gt = coco_gt
 27 | 
 28 |         self.iou_types = iou_types
 29 |         self.coco_eval = {}
 30 |         for iou_type in iou_types:
 31 |             self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
 32 |             self.coco_eval[iou_type].useCats = useCats
 33 | 
 34 |         self.img_ids = []
 35 |         self.eval_imgs = {k: [] for k in iou_types}
 36 |         self.useCats = useCats
 37 | 
 38 |     def update(self, predictions):
 39 |         img_ids = list(np.unique(list(predictions.keys())))
 40 |         self.img_ids.extend(img_ids)
 41 | 
 42 |         for iou_type in self.iou_types:
 43 |             results = self.prepare(predictions, iou_type)
 44 | 
 45 |             # suppress pycocotools prints
 46 |             with open(os.devnull, 'w') as devnull:
 47 |                 with contextlib.redirect_stdout(devnull):
 48 |                     coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
 49 |             coco_eval = self.coco_eval[iou_type]
 50 | 
 51 |             coco_eval.cocoDt = coco_dt
 52 |             coco_eval.params.imgIds = list(img_ids)
 53 |             coco_eval.params.useCats = self.useCats
 54 |             img_ids, eval_imgs = evaluate(coco_eval)
 55 | 
 56 |             self.eval_imgs[iou_type].append(eval_imgs)
 57 | 
 58 |     def synchronize_between_processes(self):
 59 |         for iou_type in self.iou_types:
 60 |             self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
 61 |             create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
 62 | 
 63 |     def accumulate(self):
 64 |         for coco_eval in self.coco_eval.values():
 65 |             coco_eval.accumulate()
 66 | 
 67 |     def summarize(self):
 68 |         for iou_type, coco_eval in self.coco_eval.items():
 69 |             print("IoU metric: {}".format(iou_type))
 70 |             coco_eval.summarize()
 71 | 
 72 |     def prepare(self, predictions, iou_type):
 73 |         if iou_type == "bbox":
 74 |             return self.prepare_for_coco_detection(predictions)
 75 |         elif iou_type == "segm":
 76 |             return self.prepare_for_coco_segmentation(predictions)
 77 |         elif iou_type == "keypoints":
 78 |             return self.prepare_for_coco_keypoint(predictions)
 79 |         else:
 80 |             raise ValueError("Unknown iou type {}".format(iou_type))
 81 | 
 82 |     def prepare_for_coco_detection(self, predictions):
 83 |         coco_results = []
 84 |         for original_id, prediction in predictions.items():
 85 |             if len(prediction) == 0:
 86 |                 continue
 87 | 
 88 |             boxes = prediction["boxes"]
 89 |             boxes = convert_to_xywh(boxes).tolist()
 90 |             if not isinstance(prediction["scores"], list):
 91 |                 scores = prediction["scores"].tolist()
 92 |             else:
 93 |                 scores = prediction["scores"]
 94 |             if not isinstance(prediction["labels"], list):
 95 |                 labels = prediction["labels"].tolist()
 96 |             else:
 97 |                 labels = prediction["labels"]
 98 | 
 99 |         
100 |             try:
101 |                 coco_results.extend(
102 |                     [
103 |                         {
104 |                             "image_id": original_id,
105 |                             "category_id": labels[k],
106 |                             "bbox": box,
107 |                             "score": scores[k],
108 |                         }
109 |                         for k, box in enumerate(boxes)
110 |                     ]
111 |                 )
112 |             except:
113 |                 import ipdb; ipdb.set_trace()
114 |         return coco_results
115 | 
116 |     def prepare_for_coco_segmentation(self, predictions):
117 |         coco_results = []
118 |         for original_id, prediction in predictions.items():
119 |             if len(prediction) == 0:
120 |                 continue
121 | 
122 |             scores = prediction["scores"]
123 |             labels = prediction["labels"]
124 |             masks = prediction["masks"]
125 | 
126 |             masks = masks > 0.5
127 | 
128 |             scores = prediction["scores"].tolist()
129 |             labels = prediction["labels"].tolist()
130 | 
131 |             rles = [
132 |                 mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
133 |                 for mask in masks
134 |             ]
135 |             for rle in rles:
136 |                 rle["counts"] = rle["counts"].decode("utf-8")
137 | 
138 |             coco_results.extend(
139 |                 [
140 |                     {
141 |                         "image_id": original_id,
142 |                         "category_id": labels[k],
143 |                         "segmentation": rle,
144 |                         "score": scores[k],
145 |                     }
146 |                     for k, rle in enumerate(rles)
147 |                 ]
148 |             )
149 |         return coco_results
150 | 
151 |     def prepare_for_coco_keypoint(self, predictions):
152 |         coco_results = []
153 |         for original_id, prediction in predictions.items():
154 |             if len(prediction) == 0:
155 |                 continue
156 | 
157 |             boxes = prediction["boxes"]
158 |             boxes = convert_to_xywh(boxes).tolist()
159 |             scores = prediction["scores"].tolist()
160 |             labels = prediction["labels"].tolist()
161 |             keypoints = prediction["keypoints"]
162 |             keypoints = keypoints.flatten(start_dim=1).tolist()
163 | 
164 |             coco_results.extend(
165 |                 [
166 |                     {
167 |                         "image_id": original_id,
168 |                         "category_id": labels[k],
169 |                         'keypoints': keypoint,
170 |                         "score": scores[k],
171 |                     }
172 |                     for k, keypoint in enumerate(keypoints)
173 |                 ]
174 |             )
175 |         return coco_results
176 | 
177 | 
178 | def convert_to_xywh(boxes):
179 |     xmin, ymin, xmax, ymax = boxes.unbind(1)
180 |     return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
181 | 
182 | 
183 | def merge(img_ids, eval_imgs):
184 |     all_img_ids = all_gather(img_ids)
185 |     all_eval_imgs = all_gather(eval_imgs)
186 | 
187 |     merged_img_ids = []
188 |     for p in all_img_ids:
189 |         merged_img_ids.extend(p)
190 | 
191 |     merged_eval_imgs = []
192 |     for p in all_eval_imgs:
193 |         merged_eval_imgs.append(p)
194 | 
195 |     merged_img_ids = np.array(merged_img_ids)
196 |     merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
197 | 
198 |     # keep only unique (and in sorted order) images
199 |     merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
200 |     merged_eval_imgs = merged_eval_imgs[..., idx]
201 | 
202 |     return merged_img_ids, merged_eval_imgs
203 | 
204 | 
205 | def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
206 |     img_ids, eval_imgs = merge(img_ids, eval_imgs)
207 |     img_ids = list(img_ids)
208 |     eval_imgs = list(eval_imgs.flatten())
209 | 
210 |     coco_eval.evalImgs = eval_imgs
211 |     coco_eval.params.imgIds = img_ids
212 |     coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
213 | 
214 | 
215 | #################################################################
216 | # From pycocotools, just removed the prints and fixed
217 | # a Python3 bug about unicode not defined
218 | #################################################################
219 | 
220 | 
221 | def evaluate(self):
222 |     '''
223 |     Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
224 |     :return: None
225 |     '''
226 |     p = self.params
227 |     # add backward compatibility if useSegm is specified in params
228 |     if p.useSegm is not None:
229 |         p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
230 |         print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
231 |     p.imgIds = list(np.unique(p.imgIds))
232 |     if p.useCats:
233 |         p.catIds = list(np.unique(p.catIds))
234 |     p.maxDets = sorted(p.maxDets)
235 |     self.params = p
236 | 
237 |     self._prepare()
238 |     # loop through images, area range, max detection number
239 |     catIds = p.catIds if p.useCats else [-1]
240 | 
241 |     if p.iouType == 'segm' or p.iouType == 'bbox':
242 |         computeIoU = self.computeIoU
243 |     elif p.iouType == 'keypoints':
244 |         computeIoU = self.computeOks
245 |     self.ious = {
246 |         (imgId, catId): computeIoU(imgId, catId)
247 |         for imgId in p.imgIds
248 |         for catId in catIds}
249 | 
250 |     evaluateImg = self.evaluateImg
251 |     maxDet = p.maxDets[-1]
252 |     evalImgs = [
253 |         evaluateImg(imgId, catId, areaRng, maxDet)
254 |         for catId in catIds
255 |         for areaRng in p.areaRng
256 |         for imgId in p.imgIds
257 |     ]
258 |     # this is NOT in the pycocotools code, but could be done outside
259 |     evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
260 |     self._paramsEval = copy.deepcopy(self.params)
261 | 
262 |     return p.imgIds, evalImgs
263 | 
264 | #################################################################
265 | # end of straight copy from pycocotools, just removing the prints
266 | #################################################################
267 | 


--------------------------------------------------------------------------------
/datasets/coco_panoptic.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | import json
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | from PIL import Image
  8 | 
  9 | from panopticapi.utils import rgb2id
 10 | from util.box_ops import masks_to_boxes
 11 | 
 12 | from .coco import make_coco_transforms
 13 | 
 14 | 
 15 | class CocoPanoptic:
 16 |     def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True):
 17 |         with open(ann_file, 'r') as f:
 18 |             self.coco = json.load(f)
 19 | 
 20 |         # sort 'images' field so that they are aligned with 'annotations'
 21 |         # i.e., in alphabetical order
 22 |         self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id'])
 23 |         # sanity check
 24 |         if "annotations" in self.coco:
 25 |             for img, ann in zip(self.coco['images'], self.coco['annotations']):
 26 |                 assert img['file_name'][:-4] == ann['file_name'][:-4]
 27 | 
 28 |         self.img_folder = img_folder
 29 |         self.ann_folder = ann_folder
 30 |         self.ann_file = ann_file
 31 |         self.transforms = transforms
 32 |         self.return_masks = return_masks
 33 | 
 34 |     def __getitem__(self, idx):
 35 |         ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx]
 36 |         img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg')
 37 |         ann_path = Path(self.ann_folder) / ann_info['file_name']
 38 | 
 39 |         img = Image.open(img_path).convert('RGB')
 40 |         w, h = img.size
 41 |         if "segments_info" in ann_info:
 42 |             masks = np.asarray(Image.open(ann_path), dtype=np.uint32)
 43 |             masks = rgb2id(masks)
 44 | 
 45 |             ids = np.array([ann['id'] for ann in ann_info['segments_info']])
 46 |             masks = masks == ids[:, None, None]
 47 | 
 48 |             masks = torch.as_tensor(masks, dtype=torch.uint8)
 49 |             labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64)
 50 | 
 51 |         target = {}
 52 |         target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]])
 53 |         if self.return_masks:
 54 |             target['masks'] = masks
 55 |         target['labels'] = labels
 56 | 
 57 |         target["boxes"] = masks_to_boxes(masks)
 58 | 
 59 |         target['size'] = torch.as_tensor([int(h), int(w)])
 60 |         target['orig_size'] = torch.as_tensor([int(h), int(w)])
 61 |         if "segments_info" in ann_info:
 62 |             for name in ['iscrowd', 'area']:
 63 |                 target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']])
 64 | 
 65 |         if self.transforms is not None:
 66 |             img, target = self.transforms(img, target)
 67 | 
 68 |         return img, target
 69 | 
 70 |     def __len__(self):
 71 |         return len(self.coco['images'])
 72 | 
 73 |     def get_height_and_width(self, idx):
 74 |         img_info = self.coco['images'][idx]
 75 |         height = img_info['height']
 76 |         width = img_info['width']
 77 |         return height, width
 78 | 
 79 | 
 80 | def build(image_set, args):
 81 |     img_folder_root = Path(args.coco_path)
 82 |     ann_folder_root = Path(args.coco_panoptic_path)
 83 |     assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist'
 84 |     assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist'
 85 |     mode = 'panoptic'
 86 |     PATHS = {
 87 |         "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'),
 88 |         "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'),
 89 |     }
 90 | 
 91 |     img_folder, ann_file = PATHS[image_set]
 92 |     img_folder_path = img_folder_root / img_folder
 93 |     ann_folder = ann_folder_root / f'{mode}_{img_folder}'
 94 |     ann_file = ann_folder_root / ann_file
 95 | 
 96 |     dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file,
 97 |                            transforms=make_coco_transforms(image_set), return_masks=args.masks)
 98 | 
 99 |     return dataset
100 | 


--------------------------------------------------------------------------------
/datasets/data_util.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import os.path as osp
  3 | import shutil
  4 | import time
  5 | import datetime
  6 | 
  7 | import torch
  8 | 
  9 | from util.slconfig import SLConfig
 10 | 
 11 | class Error(OSError):
 12 |     pass
 13 | 
 14 | def slcopytree(src, dst, symlinks=False, ignore=None, copy_function=shutil.copyfile,
 15 |              ignore_dangling_symlinks=False):
 16 |     """
 17 |     modified from shutil.copytree without copystat.
 18 |     
 19 |     Recursively copy a directory tree.
 20 | 
 21 |     The destination directory must not already exist.
 22 |     If exception(s) occur, an Error is raised with a list of reasons.
 23 | 
 24 |     If the optional symlinks flag is true, symbolic links in the
 25 |     source tree result in symbolic links in the destination tree; if
 26 |     it is false, the contents of the files pointed to by symbolic
 27 |     links are copied. If the file pointed by the symlink doesn't
 28 |     exist, an exception will be added in the list of errors raised in
 29 |     an Error exception at the end of the copy process.
 30 | 
 31 |     You can set the optional ignore_dangling_symlinks flag to true if you
 32 |     want to silence this exception. Notice that this has no effect on
 33 |     platforms that don't support os.symlink.
 34 | 
 35 |     The optional ignore argument is a callable. If given, it
 36 |     is called with the `src` parameter, which is the directory
 37 |     being visited by copytree(), and `names` which is the list of
 38 |     `src` contents, as returned by os.listdir():
 39 | 
 40 |         callable(src, names) -> ignored_names
 41 | 
 42 |     Since copytree() is called recursively, the callable will be
 43 |     called once for each directory that is copied. It returns a
 44 |     list of names relative to the `src` directory that should
 45 |     not be copied.
 46 | 
 47 |     The optional copy_function argument is a callable that will be used
 48 |     to copy each file. It will be called with the source path and the
 49 |     destination path as arguments. By default, copy2() is used, but any
 50 |     function that supports the same signature (like copy()) can be used.
 51 | 
 52 |     """
 53 |     errors = []
 54 |     if os.path.isdir(src):
 55 |         names = os.listdir(src)
 56 |         if ignore is not None:
 57 |             ignored_names = ignore(src, names)
 58 |         else:
 59 |             ignored_names = set()
 60 | 
 61 |         os.makedirs(dst)
 62 |         for name in names:
 63 |             if name in ignored_names:
 64 |                 continue
 65 |             srcname = os.path.join(src, name)
 66 |             dstname = os.path.join(dst, name)
 67 |             try:
 68 |                 if os.path.islink(srcname):
 69 |                     linkto = os.readlink(srcname)
 70 |                     if symlinks:
 71 |                         # We can't just leave it to `copy_function` because legacy
 72 |                         # code with a custom `copy_function` may rely on copytree
 73 |                         # doing the right thing.
 74 |                         os.symlink(linkto, dstname)
 75 |                     else:
 76 |                         # ignore dangling symlink if the flag is on
 77 |                         if not os.path.exists(linkto) and ignore_dangling_symlinks:
 78 |                             continue
 79 |                         # otherwise let the copy occurs. copy2 will raise an error
 80 |                         if os.path.isdir(srcname):
 81 |                             slcopytree(srcname, dstname, symlinks, ignore,
 82 |                                     copy_function)
 83 |                         else:
 84 |                             copy_function(srcname, dstname)
 85 |                 elif os.path.isdir(srcname):
 86 |                     slcopytree(srcname, dstname, symlinks, ignore, copy_function)
 87 |                 else:
 88 |                     # Will raise a SpecialFileError for unsupported file types
 89 |                     copy_function(srcname, dstname)
 90 |             # catch the Error from the recursive copytree so that we can
 91 |             # continue with other files
 92 |             except Error as err:
 93 |                 errors.extend(err.args[0])
 94 |             except OSError as why:
 95 |                 errors.append((srcname, dstname, str(why)))
 96 |     else:
 97 |         copy_function(src, dst)
 98 | 
 99 |     if errors:
100 |         raise Error(errors)
101 |     return dst
102 | 
103 | def check_and_copy(src_path, tgt_path):
104 |     if os.path.exists(tgt_path):
105 |         return None
106 | 
107 |     return slcopytree(src_path, tgt_path)
108 | 
109 | 
110 | def remove(srcpath):
111 |     if os.path.isdir(srcpath):
112 |         return shutil.rmtree(srcpath)
113 |     else:
114 |         return os.remove(srcpath)  
115 | 
116 | 
117 | def preparing_dataset(pathdict, image_set, args):
118 |     start_time = time.time()
119 |     dataset_file = args.dataset_file
120 |     data_static_info = SLConfig.fromfile('util/static_data_path.py')
121 |     static_dict = data_static_info[dataset_file][image_set]
122 | 
123 |     copyfilelist = []
124 |     for k,tgt_v in pathdict.items():
125 |         if os.path.exists(tgt_v):
126 |             if args.local_rank == 0:
127 |                 print("path <{}> exist. remove it!".format(tgt_v))
128 |                 remove(tgt_v)
129 |             # continue
130 |         
131 |         if args.local_rank == 0:
132 |             src_v = static_dict[k]
133 |             assert isinstance(src_v, str)
134 |             if src_v.endswith('.zip'):
135 |                 # copy
136 |                 cp_tgt_dir = os.path.dirname(tgt_v)
137 |                 filename = os.path.basename(src_v)
138 |                 cp_tgt_path = os.path.join(cp_tgt_dir, filename)
139 |                 print('Copy from <{}> to <{}>.'.format(src_v, cp_tgt_path))
140 |                 os.makedirs(cp_tgt_dir, exist_ok=True)
141 |                 check_and_copy(src_v, cp_tgt_path)          
142 | 
143 |                 # unzip
144 |                 import zipfile
145 |                 print("Starting unzip <{}>".format(cp_tgt_path))
146 |                 with zipfile.ZipFile(cp_tgt_path, 'r') as zip_ref:
147 |                     zip_ref.extractall(os.path.dirname(cp_tgt_path))      
148 | 
149 |                 copyfilelist.append(cp_tgt_path)
150 |                 copyfilelist.append(tgt_v)
151 |             else:
152 |                 print('Copy from <{}> to <{}>.'.format(src_v, tgt_v))
153 |                 os.makedirs(os.path.dirname(tgt_v), exist_ok=True)
154 |                 check_and_copy(src_v, tgt_v)
155 |                 copyfilelist.append(tgt_v)
156 |     
157 |     if len(copyfilelist) == 0:
158 |         copyfilelist = None
159 |     args.copyfilelist = copyfilelist
160 |         
161 |     if args.distributed:
162 |         torch.distributed.barrier()
163 |     total_time = time.time() - start_time
164 |     if copyfilelist:
165 |         total_time_str = str(datetime.timedelta(seconds=int(total_time)))
166 |         print('Data copy time {}'.format(total_time_str))
167 |     return copyfilelist
168 | 
169 | 
170 |     


--------------------------------------------------------------------------------
/datasets/dataset.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import torch
 4 | import torchvision.datasets as datasets
 5 | from torch.utils.data import Dataset
 6 | from PIL import Image
 7 | from .tsv_io import TSVFile
 8 | import numpy as np
 9 | import base64
10 | import io
11 | 
12 | 
13 | class TSVDataset(Dataset):
14 |     """ TSV dataset for ImageNet 1K training
15 |     """    
16 |     def __init__(self, tsv_file, transform=None, target_transform=None):
17 |         self.tsv = TSVFile(tsv_file)
18 |         self.transform = transform
19 |         self.target_transform = target_transform
20 | 
21 |     def __getitem__(self, index):
22 |         """
23 |         Args:
24 |             index (int): Index
25 |         Returns:
26 |             tuple: (image, target) where target is class_index of the target class.
27 |         """
28 |         row = self.tsv.seek(index)
29 |         image_data = base64.b64decode(row[-1])
30 |         image = Image.open(io.BytesIO(image_data))
31 |         image = image.convert('RGB')
32 |         target = int(row[1])
33 | 
34 |         if self.transform is not None:
35 |             img = self.transform(image)
36 |         else:
37 |             img = image
38 |         if self.target_transform is not None:
39 |             target = self.target_transform(target)
40 | 
41 |         return img, target
42 | 
43 |     def __len__(self):
44 |         return self.tsv.num_rows()
45 | 


--------------------------------------------------------------------------------
/datasets/panoptic_eval.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 2 | import json
 3 | import os
 4 | 
 5 | import util.misc as utils
 6 | 
 7 | try:
 8 |     from panopticapi.evaluation import pq_compute
 9 | except ImportError:
10 |     pass
11 | 
12 | 
13 | class PanopticEvaluator(object):
14 |     def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
15 |         self.gt_json = ann_file
16 |         self.gt_folder = ann_folder
17 |         if utils.is_main_process():
18 |             if not os.path.exists(output_dir):
19 |                 os.mkdir(output_dir)
20 |         self.output_dir = output_dir
21 |         self.predictions = []
22 | 
23 |     def update(self, predictions):
24 |         for p in predictions:
25 |             with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
26 |                 f.write(p.pop("png_string"))
27 | 
28 |         self.predictions += predictions
29 | 
30 |     def synchronize_between_processes(self):
31 |         all_predictions = utils.all_gather(self.predictions)
32 |         merged_predictions = []
33 |         for p in all_predictions:
34 |             merged_predictions += p
35 |         self.predictions = merged_predictions
36 | 
37 |     def summarize(self):
38 |         if utils.is_main_process():
39 |             json_data = {"annotations": self.predictions}
40 |             predictions_json = os.path.join(self.output_dir, "predictions.json")
41 |             with open(predictions_json, "w") as f:
42 |                 f.write(json.dumps(json_data))
43 |             return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir)
44 |         return None
45 | 


--------------------------------------------------------------------------------
/datasets/random_crop.py:
--------------------------------------------------------------------------------
  1 | import PIL #version 1.2.0
  2 | import torch
  3 | import os
  4 | import torchvision.transforms.functional as F
  5 | import numpy as np
  6 | import random
  7 | 
  8 | 
  9 | def intersect(boxes1, boxes2):
 10 |     '''
 11 |         Find intersection of every box combination between two sets of box
 12 |         boxes1: bounding boxes 1, a tensor of dimensions (n1, 4)
 13 |         boxes2: bounding boxes 2, a tensor of dimensions (n2, 4)
 14 |         
 15 |         Out: Intersection each of boxes1 with respect to each of boxes2, 
 16 |              a tensor of dimensions (n1, n2)
 17 |     '''
 18 |     n1 = boxes1.size(0)
 19 |     n2 = boxes2.size(0)
 20 |     max_xy =  torch.min(boxes1[:, 2:].unsqueeze(1).expand(n1, n2, 2),
 21 |                         boxes2[:, 2:].unsqueeze(0).expand(n1, n2, 2))
 22 |     
 23 |     min_xy = torch.max(boxes1[:, :2].unsqueeze(1).expand(n1, n2, 2),
 24 |                        boxes2[:, :2].unsqueeze(0).expand(n1, n2, 2))
 25 |     inter = torch.clamp(max_xy - min_xy , min=0)  # (n1, n2, 2)
 26 |     return inter[:, :, 0] * inter[:, :, 1]  #(n1, n2)
 27 | def find_IoU(boxes1, boxes2):
 28 |     '''
 29 |         Find IoU between every boxes set of boxes 
 30 |         boxes1: a tensor of dimensions (n1, 4) (left, top, right , bottom)
 31 |         boxes2: a tensor of dimensions (n2, 4)
 32 |         
 33 |         Out: IoU each of boxes1 with respect to each of boxes2, a tensor of 
 34 |              dimensions (n1, n2)
 35 |         
 36 |         Formula: 
 37 |         (box1 ∩ box2) / (box1 u box2) = (box1 ∩ box2) / (area(box1) + area(box2) - (box1 ∩ box2 ))
 38 |     '''
 39 |     inter = intersect(boxes1, boxes2)
 40 |     area_boxes1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
 41 |     area_boxes2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
 42 |     
 43 |     area_boxes1 = area_boxes1.unsqueeze(1).expand_as(inter) #(n1, n2)
 44 |     area_boxes2 = area_boxes2.unsqueeze(0).expand_as(inter)  #(n1, n2)
 45 |     union = (area_boxes1 + area_boxes2 - inter)
 46 |     return inter / union
 47 | 
 48 | 
 49 | def random_crop(image, boxes, labels, difficulties=None):
 50 |     '''
 51 |         image: A PIL image
 52 |         boxes: Bounding boxes, a tensor of dimensions (#objects, 4)
 53 |         labels: labels of object, a tensor of dimensions (#objects)
 54 |         difficulties: difficulties of detect object, a tensor of dimensions (#objects)
 55 |         
 56 |         Out: cropped image , new boxes, new labels, new difficulties
 57 |     '''
 58 |     if type(image) == PIL.Image.Image:
 59 |         image = F.to_tensor(image)
 60 |     original_h = image.size(1)
 61 |     original_w = image.size(2)
 62 |     
 63 |     while True:
 64 |         mode = random.choice([0.1, 0.3, 0.5, 0.9, None])
 65 |         
 66 |         if mode is None:
 67 |             return F.to_pil_image(image), boxes, labels, difficulties
 68 |         
 69 |         new_image = image
 70 |         new_boxes = boxes
 71 |         new_difficulties = difficulties
 72 |         new_labels = labels
 73 |         for _ in range(50):
 74 |             # Crop dimensions: [0.3, 1] of original dimensions
 75 |             new_h = random.uniform(0.3*original_h, original_h)
 76 |             new_w = random.uniform(0.3*original_w, original_w)
 77 |             
 78 |             # Aspect ratio constraint b/t .5 & 2
 79 |             if new_h/new_w < 0.5 or new_h/new_w > 2:
 80 |                 continue
 81 |             
 82 |             #Crop coordinate
 83 |             left = random.uniform(0, original_w - new_w)
 84 |             right = left + new_w
 85 |             top = random.uniform(0, original_h - new_h)
 86 |             bottom = top + new_h
 87 |             crop = torch.FloatTensor([int(left), int(top), int(right), int(bottom)])
 88 |             
 89 |             # Calculate IoU  between the crop and the bounding boxes
 90 |             overlap = find_IoU(crop.unsqueeze(0), boxes) #(1, #objects)
 91 |             overlap = overlap.squeeze(0)
 92 | 
 93 |             # If not a single bounding box has a IoU of greater than the minimum, try again
 94 |             if overlap.shape[0] == 0:
 95 |                 continue
 96 |             if overlap.max().item() < mode:
 97 |                 continue
 98 |             
 99 |             #Crop
100 |             new_image = image[:, int(top):int(bottom), int(left):int(right)] #(3, new_h, new_w)
101 |             
102 |             #Center of bounding boxes
103 |             center_bb = (boxes[:, :2] + boxes[:, 2:])/2.0
104 |             
105 |             #Find bounding box has been had center in crop
106 |             center_in_crop = (center_bb[:, 0] >left) * (center_bb[:, 0] < right
107 |                              ) *(center_bb[:, 1] > top) * (center_bb[:, 1] < bottom)    #( #objects)
108 |             
109 |             if not center_in_crop.any():
110 |                 continue
111 |             
112 |             #take matching bounding box
113 |             new_boxes = boxes[center_in_crop, :]
114 |             
115 |             #take matching labels
116 |             new_labels = labels[center_in_crop]
117 |             
118 |             #take matching difficulities
119 |             if difficulties is not None:
120 |                 new_difficulties = difficulties[center_in_crop]
121 |             else:
122 |                 new_difficulties = None
123 |             
124 |             #Use the box left and top corner or the crop's
125 |             new_boxes[:, :2] = torch.max(new_boxes[:, :2], crop[:2])
126 |             
127 |             #adjust to crop
128 |             new_boxes[:, :2] -= crop[:2]
129 |             
130 |             new_boxes[:, 2:] = torch.min(new_boxes[:, 2:],crop[2:])
131 |             
132 |             #adjust to crop
133 |             new_boxes[:, 2:] -= crop[:2]
134 |             
135 |             return F.to_pil_image(new_image), new_boxes, new_labels, new_difficulties


--------------------------------------------------------------------------------
/datasets/sltransform.py:
--------------------------------------------------------------------------------
  1 | # modified from https://github.com/anhtuan85/Data-Augmentation-for-Object-Detection/blob/master/augmentation.ipynb
  2 | 
  3 | import PIL #version 1.2.0
  4 | from PIL import Image #version 6.1.0
  5 | import torch
  6 | import os
  7 | import torchvision.transforms.functional as F
  8 | import numpy as np
  9 | import random
 10 | 
 11 | from .random_crop import random_crop
 12 | from util.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
 13 | 
 14 | class AdjustContrast:
 15 |     def __init__(self, contrast_factor):
 16 |         self.contrast_factor = contrast_factor
 17 | 
 18 |     def __call__(self, img, target):
 19 |         """
 20 |         img (PIL Image or Tensor): Image to be adjusted.
 21 |         """
 22 |         _contrast_factor = ((random.random() + 1.0) / 2.0) * self.contrast_factor
 23 |         img = F.adjust_contrast(img, _contrast_factor)
 24 |         return img, target
 25 | 
 26 | class AdjustBrightness:
 27 |     def __init__(self, brightness_factor):
 28 |         self.brightness_factor = brightness_factor
 29 | 
 30 |     def __call__(self, img, target):
 31 |         """
 32 |         img (PIL Image or Tensor): Image to be adjusted.
 33 |         """
 34 |         _brightness_factor = ((random.random() + 1.0) / 2.0) * self.brightness_factor
 35 |         img = F.adjust_brightness(img, _brightness_factor)
 36 |         return img, target
 37 | 
 38 | def lighting_noise(image):
 39 |     '''
 40 |         color channel swap in image
 41 |         image: A PIL image
 42 |     '''
 43 |     new_image = image
 44 |     perms = ((0, 1, 2), (0, 2, 1), (1, 0, 2), 
 45 |              (1, 2, 0), (2, 0, 1), (2, 1, 0))
 46 |     swap = perms[random.randint(0, len(perms)- 1)]
 47 |     new_image = F.to_tensor(new_image)
 48 |     new_image = new_image[swap, :, :]
 49 |     new_image = F.to_pil_image(new_image)
 50 |     return new_image
 51 | 
 52 | class LightingNoise:
 53 |     def __init__(self) -> None:
 54 |         pass
 55 | 
 56 |     def __call__(self, img, target):
 57 |         return lighting_noise(img), target
 58 | 
 59 | 
 60 | def rotate(image, boxes, angle):
 61 |     '''
 62 |         Rotate image and bounding box
 63 |         image: A Pil image (w, h)
 64 |         boxes: A tensors of dimensions (#objects, 4)
 65 |         
 66 |         Out: rotated image (w, h), rotated boxes
 67 |     '''
 68 |     new_image = image.copy()
 69 |     new_boxes = boxes.clone()
 70 |     
 71 |     #Rotate image, expand = True
 72 |     w = image.width
 73 |     h = image.height
 74 |     cx = w/2
 75 |     cy = h/2
 76 |     new_image = new_image.rotate(angle, expand=True)
 77 |     angle = np.radians(angle)
 78 |     alpha = np.cos(angle)
 79 |     beta = np.sin(angle)
 80 |     #Get affine matrix
 81 |     AffineMatrix = torch.tensor([[alpha, beta, (1-alpha)*cx - beta*cy],
 82 |                                  [-beta, alpha, beta*cx + (1-alpha)*cy]])
 83 |     
 84 |     #Rotation boxes
 85 |     box_width = (boxes[:,2] - boxes[:,0]).reshape(-1,1)
 86 |     box_height = (boxes[:,3] - boxes[:,1]).reshape(-1,1)
 87 |     
 88 |     #Get corners for boxes
 89 |     x1 = boxes[:,0].reshape(-1,1)
 90 |     y1 = boxes[:,1].reshape(-1,1)
 91 |     
 92 |     x2 = x1 + box_width
 93 |     y2 = y1 
 94 |     
 95 |     x3 = x1
 96 |     y3 = y1 + box_height
 97 |     
 98 |     x4 = boxes[:,2].reshape(-1,1)
 99 |     y4 = boxes[:,3].reshape(-1,1)
100 |     
101 |     corners = torch.stack((x1,y1,x2,y2,x3,y3,x4,y4), dim= 1)
102 |     # corners.reshape(-1, 8)    #Tensors of dimensions (#objects, 8)
103 |     corners = corners.reshape(-1,2) #Tensors of dimension (4* #objects, 2)
104 |     corners = torch.cat((corners, torch.ones(corners.shape[0], 1)), dim= 1) #(Tensors of dimension (4* #objects, 3))
105 |     
106 |     cos = np.abs(AffineMatrix[0, 0])
107 |     sin = np.abs(AffineMatrix[0, 1])
108 |     
109 |     nW = int((h * sin) + (w * cos))
110 |     nH = int((h * cos) + (w * sin))
111 |     AffineMatrix[0, 2] += (nW / 2) - cx
112 |     AffineMatrix[1, 2] += (nH / 2) - cy
113 |     
114 | 
115 |     #Apply affine transform
116 |     rotate_corners = torch.mm(AffineMatrix, corners.t().to(torch.float64)).t()
117 |     rotate_corners = rotate_corners.reshape(-1,8)
118 |     
119 |     x_corners = rotate_corners[:,[0,2,4,6]]
120 |     y_corners = rotate_corners[:,[1,3,5,7]]
121 |     
122 |     #Get (x_min, y_min, x_max, y_max)
123 |     x_min, _ = torch.min(x_corners, dim= 1)
124 |     x_min = x_min.reshape(-1, 1)
125 |     y_min, _ = torch.min(y_corners, dim= 1)
126 |     y_min = y_min.reshape(-1, 1)
127 |     x_max, _ = torch.max(x_corners, dim= 1)
128 |     x_max = x_max.reshape(-1, 1)
129 |     y_max, _ = torch.max(y_corners, dim= 1)
130 |     y_max = y_max.reshape(-1, 1)
131 |     
132 |     new_boxes = torch.cat((x_min, y_min, x_max, y_max), dim= 1)
133 |     
134 |     scale_x = new_image.width / w
135 |     scale_y = new_image.height / h
136 |     
137 |     #Resize new image to (w, h)
138 | 
139 |     new_image = new_image.resize((w, h))
140 |     
141 |     #Resize boxes
142 |     new_boxes /= torch.Tensor([scale_x, scale_y, scale_x, scale_y])
143 |     new_boxes[:, 0] = torch.clamp(new_boxes[:, 0], 0, w)
144 |     new_boxes[:, 1] = torch.clamp(new_boxes[:, 1], 0, h)
145 |     new_boxes[:, 2] = torch.clamp(new_boxes[:, 2], 0, w)
146 |     new_boxes[:, 3] = torch.clamp(new_boxes[:, 3], 0, h)
147 |     return new_image, new_boxes
148 | 
149 | # def convert_xywh_to_xyxy(boxes: torch.Tensor):
150 | #     _boxes = boxes.clone()
151 | #     box_xy = _boxes[:, :2]
152 | #     box_wh = _boxes[:, 2:]
153 | #     box_x1y1 = box_xy - box_wh/2 
154 | #     box_x2y2 = box_xy + box_wh/2
155 | #     box_xyxy = torch.cat((box_x1y1, box_x2y2), dim=-1)
156 | #     return box_xyxy
157 | 
158 | class Rotate:
159 |     def __init__(self, angle=10) -> None:
160 |         self.angle = angle
161 | 
162 |     def __call__(self, img, target):
163 |         w,h = img.size
164 |         whwh = torch.Tensor([w, h, w, h])
165 |         boxes_xyxy = box_cxcywh_to_xyxy(target['boxes']) * whwh
166 |         img, boxes_new = rotate(img, boxes_xyxy, self.angle)
167 |         target['boxes'] = box_xyxy_to_cxcywh(boxes_new).to(boxes_xyxy.dtype) / (whwh + 1e-3)
168 |         return img, target
169 | 
170 | 
171 | class RandomCrop:
172 |     def __init__(self) -> None:
173 |         pass
174 | 
175 |     def __call__(self, img, target):
176 |         w,h = img.size
177 |         try:
178 |             boxes_xyxy = target['boxes']
179 |             labels = target['labels']
180 |             img, new_boxes, new_labels, _ = random_crop(img, boxes_xyxy, labels)
181 |             target['boxes'] = new_boxes
182 |             target['labels'] = new_labels
183 |         except Exception as e:
184 |             pass
185 |         return img, target
186 | 
187 | 
188 | class RandomCropDebug:
189 |     def __init__(self) -> None:
190 |         pass
191 | 
192 |     def __call__(self, img, target):
193 |         boxes_xyxy = target['boxes'].clone()
194 |         labels = target['labels'].clone()
195 |         img, new_boxes, new_labels, _ = random_crop(img, boxes_xyxy, labels)
196 |         target['boxes'] = new_boxes
197 |         target['labels'] = new_labels
198 | 
199 | 
200 |         return img, target
201 |         
202 | class RandomSelectMulti(object):
203 |     """
204 |     Randomly selects between transforms1 and transforms2,
205 |     """
206 |     def __init__(self, transformslist, p=-1):
207 |         self.transformslist = transformslist
208 |         self.p = p
209 |         assert p == -1
210 | 
211 |     def __call__(self, img, target):
212 |         if self.p == -1:
213 |             return random.choice(self.transformslist)(img, target)
214 | 
215 | 
216 | class Albumentations:
217 |     def __init__(self):
218 |         import albumentations as A
219 |         self.transform = A.Compose([
220 |             A.Blur(p=0.01),
221 |             A.MedianBlur(p=0.01),
222 |             A.ToGray(p=0.01),
223 |             A.CLAHE(p=0.01),
224 |             A.RandomBrightnessContrast(p=0.005),
225 |             A.RandomGamma(p=0.005),
226 |             A.ImageCompression(quality_lower=75, p=0.005)],
227 |             bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels']))
228 | 
229 |     def __call__(self, img, target, p=1.0):
230 |         """
231 |         Input:
232 |             target['boxes']: xyxy, unnormalized data.
233 |         
234 |         """
235 |         boxes_raw = target['boxes']
236 |         labels_raw = target['labels']
237 |         img_np = np.array(img)
238 |         if self.transform and random.random() < p:
239 |             new_res = self.transform(image=img_np, bboxes=boxes_raw, class_labels=labels_raw)  # transformed
240 |             boxes_new = torch.Tensor(new_res['bboxes']).to(boxes_raw.dtype).reshape_as(boxes_raw)
241 |             img_np = new_res['image']
242 |             labels_new = torch.Tensor(new_res['class_labels']).to(labels_raw.dtype)
243 |         img_new = Image.fromarray(img_np)
244 |         target['boxes'] = boxes_new
245 |         target['labels'] = labels_new
246 |         
247 |         return img_new, target


--------------------------------------------------------------------------------
/datasets/transforms.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | """
  3 | Transforms and data augmentation for both image + bbox.
  4 | """
  5 | import random
  6 | 
  7 | import PIL
  8 | import torch
  9 | import torchvision.transforms as T
 10 | import torchvision.transforms.functional as F
 11 | 
 12 | from util.box_ops import box_xyxy_to_cxcywh
 13 | from util.misc import interpolate
 14 | 
 15 | 
 16 | def crop(image, target, region):
 17 |     cropped_image = F.crop(image, *region)
 18 | 
 19 |     target = target.copy()
 20 |     i, j, h, w = region
 21 | 
 22 |     # should we do something wrt the original size?
 23 |     target["size"] = torch.tensor([h, w])
 24 | 
 25 |     fields = ["labels", "area", "iscrowd"]
 26 | 
 27 |     if "boxes" in target:
 28 |         boxes = target["boxes"]
 29 |         max_size = torch.as_tensor([w, h], dtype=torch.float32)
 30 |         cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
 31 |         cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
 32 |         cropped_boxes = cropped_boxes.clamp(min=0)
 33 |         area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
 34 |         target["boxes"] = cropped_boxes.reshape(-1, 4)
 35 |         target["area"] = area
 36 |         fields.append("boxes")
 37 | 
 38 |     if "masks" in target:
 39 |         # FIXME should we update the area here if there are no boxes?
 40 |         target['masks'] = target['masks'][:, i:i + h, j:j + w]
 41 |         fields.append("masks")
 42 | 
 43 | 
 44 |     # remove elements for which the boxes or masks that have zero area
 45 |     if "boxes" in target or "masks" in target:
 46 |         # favor boxes selection when defining which elements to keep
 47 |         # this is compatible with previous implementation
 48 |         if "boxes" in target:
 49 |             cropped_boxes = target['boxes'].reshape(-1, 2, 2)
 50 |             keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
 51 |         else:
 52 |             keep = target['masks'].flatten(1).any(1)
 53 | 
 54 |         for field in fields:
 55 |             target[field] = target[field][keep]
 56 | 
 57 |     return cropped_image, target
 58 | 
 59 | 
 60 | def hflip(image, target):
 61 |     flipped_image = F.hflip(image)
 62 | 
 63 |     w, h = image.size
 64 | 
 65 |     target = target.copy()
 66 |     if "boxes" in target:
 67 |         boxes = target["boxes"]
 68 |         boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
 69 |         target["boxes"] = boxes
 70 | 
 71 |     if "masks" in target:
 72 |         target['masks'] = target['masks'].flip(-1)
 73 | 
 74 |     return flipped_image, target
 75 | 
 76 | 
 77 | def resize(image, target, size, max_size=None):
 78 |     # size can be min_size (scalar) or (w, h) tuple
 79 | 
 80 |     def get_size_with_aspect_ratio(image_size, size, max_size=None):
 81 |         w, h = image_size
 82 |         if max_size is not None:
 83 |             min_original_size = float(min((w, h)))
 84 |             max_original_size = float(max((w, h)))
 85 |             if max_original_size / min_original_size * size > max_size:
 86 |                 size = int(round(max_size * min_original_size / max_original_size))
 87 | 
 88 |         if (w <= h and w == size) or (h <= w and h == size):
 89 |             return (h, w)
 90 | 
 91 |         if w < h:
 92 |             ow = size
 93 |             oh = int(size * h / w)
 94 |         else:
 95 |             oh = size
 96 |             ow = int(size * w / h)
 97 | 
 98 |         return (oh, ow)
 99 | 
100 |     def get_size(image_size, size, max_size=None):
101 |         if isinstance(size, (list, tuple)):
102 |             return size[::-1]
103 |         else:
104 |             return get_size_with_aspect_ratio(image_size, size, max_size)
105 | 
106 |     size = get_size(image.size, size, max_size)
107 |     rescaled_image = F.resize(image, size)
108 | 
109 |     if target is None:
110 |         return rescaled_image, None
111 | 
112 |     ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
113 |     ratio_width, ratio_height = ratios
114 | 
115 |     target = target.copy()
116 |     if "boxes" in target:
117 |         boxes = target["boxes"]
118 |         scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
119 |         target["boxes"] = scaled_boxes
120 | 
121 |     if "area" in target:
122 |         area = target["area"]
123 |         scaled_area = area * (ratio_width * ratio_height)
124 |         target["area"] = scaled_area
125 | 
126 |     h, w = size
127 |     target["size"] = torch.tensor([h, w])
128 | 
129 |     if "masks" in target:
130 |         target['masks'] = interpolate(
131 |             target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
132 | 
133 |     return rescaled_image, target
134 | 
135 | 
136 | def pad(image, target, padding):
137 |     # assumes that we only pad on the bottom right corners
138 |     padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
139 |     if target is None:
140 |         return padded_image, None
141 |     target = target.copy()
142 |     # should we do something wrt the original size?
143 |     target["size"] = torch.tensor(padded_image.size[::-1])
144 |     if "masks" in target:
145 |         target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
146 |     return padded_image, target
147 | 
148 | 
149 | class ResizeDebug(object):
150 |     def __init__(self, size):
151 |         self.size = size
152 | 
153 |     def __call__(self, img, target):
154 |         return resize(img, target, self.size)
155 | 
156 | 
157 | class RandomCrop(object):
158 |     def __init__(self, size):
159 |         self.size = size
160 | 
161 |     def __call__(self, img, target):
162 |         region = T.RandomCrop.get_params(img, self.size)
163 |         return crop(img, target, region)
164 | 
165 | 
166 | class RandomSizeCrop(object):
167 |     def __init__(self, min_size: int, max_size: int):
168 |         self.min_size = min_size
169 |         self.max_size = max_size
170 | 
171 |     def __call__(self, img: PIL.Image.Image, target: dict):
172 |         w = random.randint(self.min_size, min(img.width, self.max_size))
173 |         h = random.randint(self.min_size, min(img.height, self.max_size))
174 |         region = T.RandomCrop.get_params(img, [h, w])
175 |         return crop(img, target, region)
176 | 
177 | 
178 | class CenterCrop(object):
179 |     def __init__(self, size):
180 |         self.size = size
181 | 
182 |     def __call__(self, img, target):
183 |         image_width, image_height = img.size
184 |         crop_height, crop_width = self.size
185 |         crop_top = int(round((image_height - crop_height) / 2.))
186 |         crop_left = int(round((image_width - crop_width) / 2.))
187 |         return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
188 | 
189 | 
190 | class RandomHorizontalFlip(object):
191 |     def __init__(self, p=0.5):
192 |         self.p = p
193 | 
194 |     def __call__(self, img, target):
195 |         if random.random() < self.p:
196 |             return hflip(img, target)
197 |         return img, target
198 | 
199 | 
200 | class RandomResize(object):
201 |     def __init__(self, sizes, max_size=None):
202 |         assert isinstance(sizes, (list, tuple))
203 |         self.sizes = sizes
204 |         self.max_size = max_size
205 | 
206 |     def __call__(self, img, target=None):
207 |         size = random.choice(self.sizes)
208 |         return resize(img, target, size, self.max_size)
209 | 
210 | 
211 | class RandomPad(object):
212 |     def __init__(self, max_pad):
213 |         self.max_pad = max_pad
214 | 
215 |     def __call__(self, img, target):
216 |         pad_x = random.randint(0, self.max_pad)
217 |         pad_y = random.randint(0, self.max_pad)
218 |         return pad(img, target, (pad_x, pad_y))
219 | 
220 | 
221 | class RandomSelect(object):
222 |     """
223 |     Randomly selects between transforms1 and transforms2,
224 |     with probability p for transforms1 and (1 - p) for transforms2
225 |     """
226 |     def __init__(self, transforms1, transforms2, p=0.5):
227 |         self.transforms1 = transforms1
228 |         self.transforms2 = transforms2
229 |         self.p = p
230 | 
231 |     def __call__(self, img, target):
232 |         if random.random() < self.p:
233 |             return self.transforms1(img, target)
234 |         return self.transforms2(img, target)
235 | 
236 | 
237 | class ToTensor(object):
238 |     def __call__(self, img, target):
239 |         return F.to_tensor(img), target
240 | 
241 | 
242 | class RandomErasing(object):
243 | 
244 |     def __init__(self, *args, **kwargs):
245 |         self.eraser = T.RandomErasing(*args, **kwargs)
246 | 
247 |     def __call__(self, img, target):
248 |         return self.eraser(img), target
249 | 
250 | 
251 | class Normalize(object):
252 |     def __init__(self, mean, std):
253 |         self.mean = mean
254 |         self.std = std
255 | 
256 |     def __call__(self, image, target=None):
257 |         image = F.normalize(image, mean=self.mean, std=self.std)
258 |         if target is None:
259 |             return image, None
260 |         target = target.copy()
261 |         h, w = image.shape[-2:]
262 |         if "boxes" in target:
263 |             boxes = target["boxes"]
264 |             boxes = box_xyxy_to_cxcywh(boxes)
265 |             boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
266 |             target["boxes"] = boxes
267 |         return image, target
268 | 
269 | 
270 | class Compose(object):
271 |     def __init__(self, transforms):
272 |         self.transforms = transforms
273 | 
274 |     def __call__(self, image, target):
275 |         for t in self.transforms:
276 |             image, target = t(image, target)
277 |         return image, target
278 | 
279 |     def __repr__(self):
280 |         format_string = self.__class__.__name__ + "("
281 |         for t in self.transforms:
282 |             format_string += "\n"
283 |             format_string += "    {0}".format(t)
284 |         format_string += "\n)"
285 |         return format_string
286 | 


--------------------------------------------------------------------------------
/figs/12ep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/12ep.png


--------------------------------------------------------------------------------
/figs/50ep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/50ep.png


--------------------------------------------------------------------------------
/figs/curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/curve.png


--------------------------------------------------------------------------------
/figs/dinosaur.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/dinosaur.png


--------------------------------------------------------------------------------
/figs/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/framework.png


--------------------------------------------------------------------------------
/figs/idea.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/idea.jpg


--------------------------------------------------------------------------------
/figs/sota.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/sota.jpg


--------------------------------------------------------------------------------
/figs/sota_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEA-Research/DINO/d84a491d41898b3befd8294d1cf2614661fc0953/figs/sota_table.png


--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------
 2 | # DINO
 3 | # Copyright (c) 2022 IDEA. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------
 6 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 7 | from .dino import build_dino
 8 | 
 9 | def build_model(args):
10 |     return build(args)
11 | 


--------------------------------------------------------------------------------
/models/dino/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------
 2 | # Conditional DETR
 3 | # Copyright (c) 2021 Microsoft. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------
 6 | # Copied from DETR (https://github.com/facebookresearch/detr)
 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
 8 | # ------------------------------------------------------------------------
 9 | 
10 | from .dino import build_dino
11 | 


--------------------------------------------------------------------------------
/models/dino/backbone.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------
  2 | # DINO
  3 | # Copyright (c) 2022 IDEA. All Rights Reserved.
  4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  5 | # ------------------------------------------------------------------------
  6 | # Conditional DETR
  7 | # Copyright (c) 2021 Microsoft. All Rights Reserved.
  8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  9 | # ------------------------------------------------------------------------
 10 | # Copied from DETR (https://github.com/facebookresearch/detr)
 11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
 12 | # ------------------------------------------------------------------------
 13 | 
 14 | """
 15 | Backbone modules.
 16 | """
 17 | from collections import OrderedDict
 18 | import os
 19 | 
 20 | import torch
 21 | import torch.nn.functional as F
 22 | import torchvision
 23 | from torch import nn
 24 | from torchvision.models._utils import IntermediateLayerGetter
 25 | from typing import Dict, List
 26 | 
 27 | 
 28 | from util.misc import NestedTensor, clean_state_dict, is_main_process
 29 | 
 30 | from .position_encoding import build_position_encoding
 31 | from .convnext import build_convnext
 32 | from .swin_transformer import build_swin_transformer
 33 | 
 34 | 
 35 | 
 36 | class FrozenBatchNorm2d(torch.nn.Module):
 37 |     """
 38 |     BatchNorm2d where the batch statistics and the affine parameters are fixed.
 39 | 
 40 |     Copy-paste from torchvision.misc.ops with added eps before rqsrt,
 41 |     without which any other models than torchvision.models.resnet[18,34,50,101]
 42 |     produce nans.
 43 |     """
 44 | 
 45 |     def __init__(self, n):
 46 |         super(FrozenBatchNorm2d, self).__init__()
 47 |         self.register_buffer("weight", torch.ones(n))
 48 |         self.register_buffer("bias", torch.zeros(n))
 49 |         self.register_buffer("running_mean", torch.zeros(n))
 50 |         self.register_buffer("running_var", torch.ones(n))
 51 | 
 52 |     def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
 53 |                               missing_keys, unexpected_keys, error_msgs):
 54 |         num_batches_tracked_key = prefix + 'num_batches_tracked'
 55 |         if num_batches_tracked_key in state_dict:
 56 |             del state_dict[num_batches_tracked_key]
 57 | 
 58 |         super(FrozenBatchNorm2d, self)._load_from_state_dict(
 59 |             state_dict, prefix, local_metadata, strict,
 60 |             missing_keys, unexpected_keys, error_msgs)
 61 | 
 62 |     def forward(self, x):
 63 |         # move reshapes to the beginning
 64 |         # to make it fuser-friendly
 65 |         w = self.weight.reshape(1, -1, 1, 1)
 66 |         b = self.bias.reshape(1, -1, 1, 1)
 67 |         rv = self.running_var.reshape(1, -1, 1, 1)
 68 |         rm = self.running_mean.reshape(1, -1, 1, 1)
 69 |         eps = 1e-5
 70 |         scale = w * (rv + eps).rsqrt()
 71 |         bias = b - rm * scale
 72 |         return x * scale + bias
 73 | 
 74 | 
 75 | class BackboneBase(nn.Module):
 76 | 
 77 |     def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_indices: list):
 78 |         super().__init__()
 79 |         for name, parameter in backbone.named_parameters():
 80 |             if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
 81 |                 parameter.requires_grad_(False)
 82 | 
 83 |         return_layers = {}
 84 |         for idx, layer_index in enumerate(return_interm_indices):
 85 |             return_layers.update({"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)})
 86 | 
 87 |         # if len:
 88 |         #     if use_stage1_feature:
 89 |         #         return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
 90 |         #     else:
 91 |         #         return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"}
 92 |         # else:
 93 |         #     return_layers = {'layer4': "0"}
 94 |         self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
 95 |         self.num_channels = num_channels
 96 | 
 97 |     def forward(self, tensor_list: NestedTensor):
 98 |         xs = self.body(tensor_list.tensors)
 99 |         out: Dict[str, NestedTensor] = {}
100 |         for name, x in xs.items():
101 |             m = tensor_list.mask
102 |             assert m is not None
103 |             mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
104 |             out[name] = NestedTensor(x, mask)
105 | 
106 |         return out
107 | 
108 | 
109 | class Backbone(BackboneBase):
110 |     """ResNet backbone with frozen BatchNorm."""
111 |     def __init__(self, name: str,
112 |                  train_backbone: bool,
113 |                  dilation: bool,
114 |                  return_interm_indices:list,
115 |                  batch_norm=FrozenBatchNorm2d,
116 |                  ):
117 |         if name in ['resnet18', 'resnet34', 'resnet50', 'resnet101']:
118 |             backbone = getattr(torchvision.models, name)(
119 |                 replace_stride_with_dilation=[False, False, dilation],
120 |                 pretrained=is_main_process(), norm_layer=batch_norm)
121 |         else:
122 |             raise NotImplementedError("Why you can get here with name {}".format(name))
123 |         # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
124 |         assert name not in ('resnet18', 'resnet34'), "Only resnet50 and resnet101 are available."
125 |         assert return_interm_indices in [[0,1,2,3], [1,2,3], [3]]
126 |         num_channels_all = [256, 512, 1024, 2048]
127 |         num_channels = num_channels_all[4-len(return_interm_indices):]
128 |         super().__init__(backbone, train_backbone, num_channels, return_interm_indices)
129 | 
130 | 
131 | class Joiner(nn.Sequential):
132 |     def __init__(self, backbone, position_embedding):
133 |         super().__init__(backbone, position_embedding)
134 | 
135 |     def forward(self, tensor_list: NestedTensor):
136 |         xs = self[0](tensor_list)
137 |         out: List[NestedTensor] = []
138 |         pos = []
139 |         for name, x in xs.items():
140 |             out.append(x)
141 |             # position encoding
142 |             pos.append(self[1](x).to(x.tensors.dtype))
143 | 
144 |         return out, pos
145 | 
146 | 
147 | def build_backbone(args):
148 |     """
149 |     Useful args:
150 |         - backbone: backbone name
151 |         - lr_backbone: 
152 |         - dilation
153 |         - return_interm_indices: available: [0,1,2,3], [1,2,3], [3]
154 |         - backbone_freeze_keywords: 
155 |         - use_checkpoint: for swin only for now
156 | 
157 |     """
158 |     position_embedding = build_position_encoding(args)
159 |     train_backbone = args.lr_backbone > 0
160 |     if not train_backbone:
161 |         raise ValueError("Please set lr_backbone > 0")
162 |     return_interm_indices = args.return_interm_indices
163 |     assert return_interm_indices in [[0,1,2,3], [1,2,3], [3]]
164 |     backbone_freeze_keywords = args.backbone_freeze_keywords
165 |     use_checkpoint = getattr(args, 'use_checkpoint', False)
166 | 
167 |     if args.backbone in ['resnet50', 'resnet101']:
168 |         backbone = Backbone(args.backbone, train_backbone, args.dilation,   
169 |                                 return_interm_indices,   
170 |                                 batch_norm=FrozenBatchNorm2d)
171 |         bb_num_channels = backbone.num_channels
172 |     elif args.backbone in ['swin_T_224_1k', 'swin_B_224_22k', 'swin_B_384_22k', 'swin_L_224_22k', 'swin_L_384_22k']:
173 |         pretrain_img_size = int(args.backbone.split('_')[-2])
174 |         backbone = build_swin_transformer(args.backbone, \
175 |                     pretrain_img_size=pretrain_img_size, \
176 |                     out_indices=tuple(return_interm_indices), \
177 |                 dilation=args.dilation, use_checkpoint=use_checkpoint)
178 | 
179 |         # freeze some layers
180 |         if backbone_freeze_keywords is not None:
181 |             for name, parameter in backbone.named_parameters():
182 |                 for keyword in backbone_freeze_keywords:
183 |                     if keyword in name:
184 |                         parameter.requires_grad_(False)
185 |                         break
186 |         if "backbone_dir" in args:
187 |             pretrained_dir = args.backbone_dir
188 |             PTDICT = {
189 |                 'swin_T_224_1k': 'swin_tiny_patch4_window7_224.pth',
190 |                 'swin_B_384_22k': 'swin_base_patch4_window12_384.pth',
191 |                 'swin_L_384_22k': 'swin_large_patch4_window12_384_22k.pth',
192 |             }
193 |             pretrainedpath = os.path.join(pretrained_dir, PTDICT[args.backbone])
194 |             checkpoint = torch.load(pretrainedpath, map_location='cpu')['model']
195 |             from collections import OrderedDict
196 |             def key_select_function(keyname):
197 |                 if 'head' in keyname:
198 |                     return False
199 |                 if args.dilation and 'layers.3' in keyname:
200 |                     return False
201 |                 return True
202 |             _tmp_st = OrderedDict({k:v for k, v in clean_state_dict(checkpoint).items() if key_select_function(k)})
203 |             _tmp_st_output = backbone.load_state_dict(_tmp_st, strict=False)
204 |             print(str(_tmp_st_output))
205 |         bb_num_channels = backbone.num_features[4 - len(return_interm_indices):]
206 |     elif args.backbone in ['convnext_xlarge_22k']:
207 |         backbone = build_convnext(modelname=args.backbone, pretrained=True, out_indices=tuple(return_interm_indices),backbone_dir=args.backbone_dir)
208 |         bb_num_channels = backbone.dims[4 - len(return_interm_indices):]
209 |     else:
210 |         raise NotImplementedError("Unknown backbone {}".format(args.backbone))
211 |     
212 | 
213 |     assert len(bb_num_channels) == len(return_interm_indices), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}"
214 | 
215 | 
216 |     model = Joiner(backbone, position_embedding)
217 |     model.num_channels = bb_num_channels 
218 |     assert isinstance(bb_num_channels, List), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels))
219 |     return model
220 | 


--------------------------------------------------------------------------------
/models/dino/convnext.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | 
  3 | # All rights reserved.
  4 | 
  5 | # This source code is licensed under the license found in the
  6 | # LICENSE file in the root directory of this source tree.
  7 | 
  8 | 
  9 | from functools import partial
 10 | import torch
 11 | import torch.nn as nn
 12 | import torch.nn.functional as F
 13 | from timm.models.layers import trunc_normal_, DropPath
 14 | 
 15 | from util.misc import NestedTensor
 16 | # from timm.models.registry import register_model
 17 | 
 18 | class Block(nn.Module):
 19 |     r""" ConvNeXt Block. There are two equivalent implementations:
 20 |     (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
 21 |     (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
 22 |     We use (2) as we find it slightly faster in PyTorch
 23 |     
 24 |     Args:
 25 |         dim (int): Number of input channels.
 26 |         drop_path (float): Stochastic depth rate. Default: 0.0
 27 |         layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
 28 |     """
 29 |     def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
 30 |         super().__init__()
 31 |         self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
 32 |         self.norm = LayerNorm(dim, eps=1e-6)
 33 |         self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
 34 |         self.act = nn.GELU()
 35 |         self.pwconv2 = nn.Linear(4 * dim, dim)
 36 |         self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), 
 37 |                                     requires_grad=True) if layer_scale_init_value > 0 else None
 38 |         self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
 39 | 
 40 |     def forward(self, x):
 41 |         input = x
 42 |         x = self.dwconv(x)
 43 |         x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
 44 |         x = self.norm(x)
 45 |         x = self.pwconv1(x)
 46 |         x = self.act(x)
 47 |         x = self.pwconv2(x)
 48 |         if self.gamma is not None:
 49 |             x = self.gamma * x
 50 |         x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
 51 | 
 52 |         x = input + self.drop_path(x)
 53 |         return x
 54 | 
 55 | class ConvNeXt(nn.Module):
 56 |     r""" ConvNeXt
 57 |         A PyTorch impl of : `A ConvNet for the 2020s`  -
 58 |           https://arxiv.org/pdf/2201.03545.pdf
 59 | 
 60 |     Args:
 61 |         in_chans (int): Number of input image channels. Default: 3
 62 |         num_classes (int): Number of classes for classification head. Default: 1000
 63 |         depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
 64 |         dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
 65 |         drop_path_rate (float): Stochastic depth rate. Default: 0.
 66 |         layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
 67 |         head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
 68 |     """
 69 |     def __init__(self, in_chans=3, num_classes=1000, 
 70 |                  depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0., 
 71 |                  layer_scale_init_value=1e-6, head_init_scale=1.,
 72 |                  out_indices=[0, 1, 2, 3]
 73 |                  ):
 74 |         super().__init__()
 75 |         self.dims = dims
 76 | 
 77 |         self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
 78 |         stem = nn.Sequential(
 79 |             nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
 80 |             LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
 81 |         )
 82 |         self.downsample_layers.append(stem)
 83 |         for i in range(3):
 84 |             downsample_layer = nn.Sequential(
 85 |                     LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
 86 |                     nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
 87 |             )
 88 |             self.downsample_layers.append(downsample_layer)
 89 | 
 90 |         self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
 91 |         dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] 
 92 |         cur = 0
 93 |         for i in range(4):
 94 |             stage = nn.Sequential(
 95 |                 *[Block(dim=dims[i], drop_path=dp_rates[cur + j], 
 96 |                 layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])]
 97 |             )
 98 |             self.stages.append(stage)
 99 |             cur += depths[i]
100 | 
101 |         self.out_indices = out_indices
102 | 
103 |         norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first")
104 |         for i_layer in range(4):
105 |             layer = norm_layer(dims[i_layer])
106 |             layer_name = f'norm{i_layer}'
107 |             self.add_module(layer_name, layer)
108 | 
109 |         # self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer
110 |         # self.head = nn.Linear(dims[-1], num_classes)
111 | 
112 |         # self.apply(self._init_weights)
113 |         # self.head.weight.data.mul_(head_init_scale)
114 |         # self.head.bias.data.mul_(head_init_scale)
115 | 
116 |     def _init_weights(self, m):
117 |         if isinstance(m, (nn.Conv2d, nn.Linear)):
118 |             trunc_normal_(m.weight, std=.02)
119 |             nn.init.constant_(m.bias, 0)
120 | 
121 |     def forward_features(self, x):
122 |         outs = []
123 |         for i in range(4):
124 |             x = self.downsample_layers[i](x)
125 |             x = self.stages[i](x)
126 |             if i in self.out_indices:
127 |                 norm_layer = getattr(self, f'norm{i}')
128 |                 x_out = norm_layer(x)
129 |                 outs.append(x_out)
130 |         # return self.norm(x.mean([-2, -1])) # global average pooling, (N, C, H, W) -> (N, C)
131 |         return tuple(outs)
132 | 
133 |     # def forward(self, x):
134 |     #     x = self.forward_features(x)
135 |     #     return x
136 | 
137 | 
138 |     def forward(self, tensor_list: NestedTensor):
139 |         x = tensor_list.tensors
140 |         outs = self.forward_features(x)
141 | 
142 |         # collect for nesttensors        
143 |         outs_dict = {}
144 |         for idx, out_i in enumerate(outs):
145 |             m = tensor_list.mask
146 |             assert m is not None
147 |             mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0]
148 |             outs_dict[idx] = NestedTensor(out_i, mask)
149 | 
150 |         return outs_dict
151 | 
152 | class LayerNorm(nn.Module):
153 |     r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. 
154 |     The ordering of the dimensions in the inputs. channels_last corresponds to inputs with 
155 |     shape (batch_size, height, width, channels) while channels_first corresponds to inputs 
156 |     with shape (batch_size, channels, height, width).
157 |     """
158 |     def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
159 |         super().__init__()
160 |         self.weight = nn.Parameter(torch.ones(normalized_shape))
161 |         self.bias = nn.Parameter(torch.zeros(normalized_shape))
162 |         self.eps = eps
163 |         self.data_format = data_format
164 |         if self.data_format not in ["channels_last", "channels_first"]:
165 |             raise NotImplementedError 
166 |         self.normalized_shape = (normalized_shape, )
167 |     
168 |     def forward(self, x):
169 |         if self.data_format == "channels_last":
170 |             return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
171 |         elif self.data_format == "channels_first":
172 |             u = x.mean(1, keepdim=True)
173 |             s = (x - u).pow(2).mean(1, keepdim=True)
174 |             x = (x - u) / torch.sqrt(s + self.eps)
175 |             x = self.weight[:, None, None] * x + self.bias[:, None, None]
176 |             return x
177 | 
178 | 
179 | model_urls = {
180 |     "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
181 |     "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
182 |     "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
183 |     "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
184 |     "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth",
185 |     "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth",
186 |     "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth",
187 | }
188 | 
189 | # @register_model
190 | # def convnext_tiny(pretrained=False, **kwargs):
191 | #     model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
192 | #     if pretrained:
193 | #         url = model_urls['convnext_tiny_1k']
194 | #         checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
195 | #         model.load_state_dict(checkpoint["model"])
196 | #     return model
197 | 
198 | # @register_model
199 | # def convnext_small(pretrained=False, **kwargs):
200 | #     model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
201 | #     if pretrained:
202 | #         url = model_urls['convnext_small_1k']
203 | #         checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
204 | #         model.load_state_dict(checkpoint["model"])
205 | #     return model
206 | 
207 | # @register_model
208 | # def convnext_base(pretrained=False, in_22k=False, **kwargs):
209 | #     model = ConvNeXt(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
210 | #     if pretrained:
211 | #         url = model_urls['convnext_base_22k'] if in_22k else model_urls['convnext_base_1k']
212 | #         checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
213 | #         model.load_state_dict(checkpoint["model"])
214 | #     return model
215 | 
216 | # @register_model
217 | # def convnext_large(pretrained=False, in_22k=False, **kwargs):
218 | #     model = ConvNeXt(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
219 | #     if pretrained:
220 | #         url = model_urls['convnext_large_22k'] if in_22k else model_urls['convnext_large_1k']
221 | #         checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
222 | #         model.load_state_dict(checkpoint["model"])
223 | #     return model
224 | 
225 | # @register_model
226 | # def convnext_xlarge(pretrained=False, in_22k=False, **kwargs):
227 | #     model = ConvNeXt(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
228 | #     if pretrained:
229 | #         url = model_urls['convnext_xlarge_22k'] if in_22k else model_urls['convnext_xlarge_1k']
230 | #         checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
231 | #         model.load_state_dict(checkpoint["model"])
232 | #     return model
233 | 
234 | def build_convnext(modelname, pretrained,backbone_dir=None, **kw):
235 |     assert modelname in ['convnext_xlarge_22k']
236 | 
237 |     model_para_dict = {
238 |         'convnext_xlarge_22k': dict(
239 |             depths=[3, 3, 27, 3],
240 |             dims=[256, 512, 1024, 2048],
241 |         ),
242 |     }
243 |     kw_cgf = model_para_dict[modelname]
244 |     kw_cgf.update(kw)
245 |     model = ConvNeXt(**kw_cgf)
246 |     if pretrained:
247 |         url = model_urls[modelname]
248 |         checkpoint = torch.hub.load_state_dict_from_url(url=url, model_dir=backbone_dir, map_location="cpu", check_hash=True)
249 |         _tmp_st_output = model.load_state_dict(checkpoint["model"], strict=False)
250 |         print(str(_tmp_st_output))
251 | 
252 |     return model


--------------------------------------------------------------------------------
/models/dino/dn_components.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------
  2 | # DINO
  3 | # Copyright (c) 2022 IDEA. All Rights Reserved.
  4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  5 | # ------------------------------------------------------------------------
  6 | # DN-DETR
  7 | # Copyright (c) 2022 IDEA. All Rights Reserved.
  8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  9 | 
 10 | 
 11 | import torch
 12 | from util.misc import (NestedTensor, nested_tensor_from_tensor_list,
 13 |                        accuracy, get_world_size, interpolate,
 14 |                        is_dist_avail_and_initialized, inverse_sigmoid)
 15 | # from .DABDETR import sigmoid_focal_loss
 16 | from util import box_ops
 17 | import torch.nn.functional as F
 18 | 
 19 | 
 20 | def prepare_for_cdn(dn_args, training, num_queries, num_classes, hidden_dim, label_enc):
 21 |     """
 22 |         A major difference of DINO from DN-DETR is that the author process pattern embedding pattern embedding in its detector
 23 |         forward function and use learnable tgt embedding, so we change this function a little bit.
 24 |         :param dn_args: targets, dn_number, label_noise_ratio, box_noise_scale
 25 |         :param training: if it is training or inference
 26 |         :param num_queries: number of queires
 27 |         :param num_classes: number of classes
 28 |         :param hidden_dim: transformer hidden dim
 29 |         :param label_enc: encode labels in dn
 30 |         :return:
 31 |         """
 32 |     if training:
 33 |         targets, dn_number, label_noise_ratio, box_noise_scale = dn_args
 34 |         # positive and negative dn queries
 35 |         dn_number = dn_number * 2
 36 |         known = [(torch.ones_like(t['labels'])).cuda() for t in targets]
 37 |         batch_size = len(known)
 38 |         known_num = [sum(k) for k in known]
 39 |         if int(max(known_num)) == 0:
 40 |             dn_number = 1
 41 |         else:
 42 |             if dn_number >= 100:
 43 |                 dn_number = dn_number // (int(max(known_num) * 2))
 44 |             elif dn_number < 1:
 45 |                 dn_number = 1
 46 |         if dn_number == 0:
 47 |             dn_number = 1
 48 |         unmask_bbox = unmask_label = torch.cat(known)
 49 |         labels = torch.cat([t['labels'] for t in targets])
 50 |         boxes = torch.cat([t['boxes'] for t in targets])
 51 |         batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)])
 52 | 
 53 |         known_indice = torch.nonzero(unmask_label + unmask_bbox)
 54 |         known_indice = known_indice.view(-1)
 55 | 
 56 |         known_indice = known_indice.repeat(2 * dn_number, 1).view(-1)
 57 |         known_labels = labels.repeat(2 * dn_number, 1).view(-1)
 58 |         known_bid = batch_idx.repeat(2 * dn_number, 1).view(-1)
 59 |         known_bboxs = boxes.repeat(2 * dn_number, 1)
 60 |         known_labels_expaned = known_labels.clone()
 61 |         known_bbox_expand = known_bboxs.clone()
 62 | 
 63 |         if label_noise_ratio > 0:
 64 |             p = torch.rand_like(known_labels_expaned.float())
 65 |             chosen_indice = torch.nonzero(p < (label_noise_ratio * 0.5)).view(-1)  # half of bbox prob
 66 |             new_label = torch.randint_like(chosen_indice, 0, num_classes)  # randomly put a new one here
 67 |             known_labels_expaned.scatter_(0, chosen_indice, new_label)
 68 |         single_pad = int(max(known_num))
 69 | 
 70 |         pad_size = int(single_pad * 2 * dn_number)
 71 |         positive_idx = torch.tensor(range(len(boxes))).long().cuda().unsqueeze(0).repeat(dn_number, 1)
 72 |         positive_idx += (torch.tensor(range(dn_number)) * len(boxes) * 2).long().cuda().unsqueeze(1)
 73 |         positive_idx = positive_idx.flatten()
 74 |         negative_idx = positive_idx + len(boxes)
 75 |         if box_noise_scale > 0:
 76 |             known_bbox_ = torch.zeros_like(known_bboxs)
 77 |             known_bbox_[:, :2] = known_bboxs[:, :2] - known_bboxs[:, 2:] / 2
 78 |             known_bbox_[:, 2:] = known_bboxs[:, :2] + known_bboxs[:, 2:] / 2
 79 | 
 80 |             diff = torch.zeros_like(known_bboxs)
 81 |             diff[:, :2] = known_bboxs[:, 2:] / 2
 82 |             diff[:, 2:] = known_bboxs[:, 2:] / 2
 83 | 
 84 |             rand_sign = torch.randint_like(known_bboxs, low=0, high=2, dtype=torch.float32) * 2.0 - 1.0
 85 |             rand_part = torch.rand_like(known_bboxs)
 86 |             rand_part[negative_idx] += 1.0
 87 |             rand_part *= rand_sign
 88 |             known_bbox_ = known_bbox_ + torch.mul(rand_part,
 89 |                                                   diff).cuda() * box_noise_scale
 90 |             known_bbox_ = known_bbox_.clamp(min=0.0, max=1.0)
 91 |             known_bbox_expand[:, :2] = (known_bbox_[:, :2] + known_bbox_[:, 2:]) / 2
 92 |             known_bbox_expand[:, 2:] = known_bbox_[:, 2:] - known_bbox_[:, :2]
 93 | 
 94 |         m = known_labels_expaned.long().to('cuda')
 95 |         input_label_embed = label_enc(m)
 96 |         input_bbox_embed = inverse_sigmoid(known_bbox_expand)
 97 | 
 98 |         padding_label = torch.zeros(pad_size, hidden_dim).cuda()
 99 |         padding_bbox = torch.zeros(pad_size, 4).cuda()
100 | 
101 |         input_query_label = padding_label.repeat(batch_size, 1, 1)
102 |         input_query_bbox = padding_bbox.repeat(batch_size, 1, 1)
103 | 
104 |         map_known_indice = torch.tensor([]).to('cuda')
105 |         if len(known_num):
106 |             map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num])  # [1,2, 1,2,3]
107 |             map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(2 * dn_number)]).long()
108 |         if len(known_bid):
109 |             input_query_label[(known_bid.long(), map_known_indice)] = input_label_embed
110 |             input_query_bbox[(known_bid.long(), map_known_indice)] = input_bbox_embed
111 | 
112 |         tgt_size = pad_size + num_queries
113 |         attn_mask = torch.ones(tgt_size, tgt_size).to('cuda') < 0
114 |         # match query cannot see the reconstruct
115 |         attn_mask[pad_size:, :pad_size] = True
116 |         # reconstruct cannot see each other
117 |         for i in range(dn_number):
118 |             if i == 0:
119 |                 attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True
120 |             if i == dn_number - 1:
121 |                 attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * i * 2] = True
122 |             else:
123 |                 attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True
124 |                 attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * 2 * i] = True
125 | 
126 |         dn_meta = {
127 |             'pad_size': pad_size,
128 |             'num_dn_group': dn_number,
129 |         }
130 |     else:
131 | 
132 |         input_query_label = None
133 |         input_query_bbox = None
134 |         attn_mask = None
135 |         dn_meta = None
136 | 
137 |     return input_query_label, input_query_bbox, attn_mask, dn_meta
138 | 
139 | 
140 | def dn_post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss):
141 |     """
142 |         post process of dn after output from the transformer
143 |         put the dn part in the dn_meta
144 |     """
145 |     if dn_meta and dn_meta['pad_size'] > 0:
146 |         output_known_class = outputs_class[:, :, :dn_meta['pad_size'], :]
147 |         output_known_coord = outputs_coord[:, :, :dn_meta['pad_size'], :]
148 |         outputs_class = outputs_class[:, :, dn_meta['pad_size']:, :]
149 |         outputs_coord = outputs_coord[:, :, dn_meta['pad_size']:, :]
150 |         out = {'pred_logits': output_known_class[-1], 'pred_boxes': output_known_coord[-1]}
151 |         if aux_loss:
152 |             out['aux_outputs'] = _set_aux_loss(output_known_class, output_known_coord)
153 |         dn_meta['output_known_lbs_bboxes'] = out
154 |     return outputs_class, outputs_coord
155 | 
156 | 
157 | 


--------------------------------------------------------------------------------
/models/dino/matcher.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------
  2 | # DINO
  3 | # Copyright (c) 2022 IDEA. All Rights Reserved.
  4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  5 | # ------------------------------------------------------------------------
  6 | # Modules to compute the matching cost and solve the corresponding LSAP.
  7 | # Copyright (c) 2021 Microsoft. All Rights Reserved.
  8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  9 | # ------------------------------------------------------------------------
 10 | # Modified from DETR (https://github.com/facebookresearch/detr)
 11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
 12 | # ------------------------------------------------------------------------
 13 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
 14 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 15 | # ------------------------------------------------------------------------
 16 | 
 17 | 
 18 | import torch, os
 19 | from torch import nn
 20 | from scipy.optimize import linear_sum_assignment
 21 | 
 22 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
 23 | 
 24 | 
 25 | class HungarianMatcher(nn.Module):
 26 |     """This class computes an assignment between the targets and the predictions of the network
 27 |     For efficiency reasons, the targets don't include the no_object. Because of this, in general,
 28 |     there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
 29 |     while the others are un-matched (and thus treated as non-objects).
 30 |     """
 31 | 
 32 |     def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1, focal_alpha = 0.25):
 33 |         """Creates the matcher
 34 |         Params:
 35 |             cost_class: This is the relative weight of the classification error in the matching cost
 36 |             cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
 37 |             cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
 38 |         """
 39 |         super().__init__()
 40 |         self.cost_class = cost_class
 41 |         self.cost_bbox = cost_bbox
 42 |         self.cost_giou = cost_giou
 43 |         assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
 44 | 
 45 |         self.focal_alpha = focal_alpha
 46 | 
 47 |     @torch.no_grad()
 48 |     def forward(self, outputs, targets):
 49 |         """ Performs the matching
 50 |         Params:
 51 |             outputs: This is a dict that contains at least these entries:
 52 |                  "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
 53 |                  "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
 54 |             targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
 55 |                  "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
 56 |                            objects in the target) containing the class labels
 57 |                  "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
 58 |         Returns:
 59 |             A list of size batch_size, containing tuples of (index_i, index_j) where:
 60 |                 - index_i is the indices of the selected predictions (in order)
 61 |                 - index_j is the indices of the corresponding selected targets (in order)
 62 |             For each batch element, it holds:
 63 |                 len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
 64 |         """
 65 | 
 66 |         bs, num_queries = outputs["pred_logits"].shape[:2]
 67 | 
 68 |         # We flatten to compute the cost matrices in a batch
 69 |         out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
 70 |         out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
 71 | 
 72 |         # Also concat the target labels and boxes
 73 |         tgt_ids = torch.cat([v["labels"] for v in targets])
 74 |         tgt_bbox = torch.cat([v["boxes"] for v in targets])
 75 | 
 76 |         # Compute the classification cost.
 77 |         alpha = self.focal_alpha
 78 |         gamma = 2.0
 79 |         neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
 80 |         pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
 81 |         cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
 82 | 
 83 |         # Compute the L1 cost between boxes
 84 |         cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
 85 |             
 86 |         # Compute the giou cost betwen boxes            
 87 |         cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
 88 | 
 89 |         # Final cost matrix
 90 |         C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
 91 |         C = C.view(bs, num_queries, -1).cpu()
 92 | 
 93 |         sizes = [len(v["boxes"]) for v in targets]
 94 |         indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
 95 |         return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
 96 | 
 97 | 
 98 | class SimpleMinsumMatcher(nn.Module):
 99 |     """This class computes an assignment between the targets and the predictions of the network
100 |     For efficiency reasons, the targets don't include the no_object. Because of this, in general,
101 |     there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
102 |     while the others are un-matched (and thus treated as non-objects).
103 |     """
104 | 
105 |     def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1, focal_alpha = 0.25):
106 |         """Creates the matcher
107 |         Params:
108 |             cost_class: This is the relative weight of the classification error in the matching cost
109 |             cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
110 |             cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
111 |         """
112 |         super().__init__()
113 |         self.cost_class = cost_class
114 |         self.cost_bbox = cost_bbox
115 |         self.cost_giou = cost_giou
116 |         assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
117 | 
118 |         self.focal_alpha = focal_alpha
119 | 
120 |     @torch.no_grad()
121 |     def forward(self, outputs, targets):
122 |         """ Performs the matching
123 |         Params:
124 |             outputs: This is a dict that contains at least these entries:
125 |                  "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
126 |                  "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
127 |             targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
128 |                  "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
129 |                            objects in the target) containing the class labels
130 |                  "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
131 |         Returns:
132 |             A list of size batch_size, containing tuples of (index_i, index_j) where:
133 |                 - index_i is the indices of the selected predictions (in order)
134 |                 - index_j is the indices of the corresponding selected targets (in order)
135 |             For each batch element, it holds:
136 |                 len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
137 |         """
138 | 
139 |         bs, num_queries = outputs["pred_logits"].shape[:2]
140 | 
141 |         # We flatten to compute the cost matrices in a batch
142 |         out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
143 |         out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
144 | 
145 |         # Also concat the target labels and boxes
146 |         tgt_ids = torch.cat([v["labels"] for v in targets])
147 |         tgt_bbox = torch.cat([v["boxes"] for v in targets])
148 | 
149 |         # Compute the classification cost.
150 |         alpha = self.focal_alpha
151 |         gamma = 2.0
152 |         neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
153 |         pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
154 |         cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
155 | 
156 |         # Compute the L1 cost between boxes
157 |         cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
158 |             
159 |         # Compute the giou cost betwen boxes            
160 |         cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
161 | 
162 |         # Final cost matrix
163 |         C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
164 |         C = C.view(bs, num_queries, -1)
165 | 
166 |         sizes = [len(v["boxes"]) for v in targets]
167 |         indices = []
168 |         device = C.device
169 |         for i, (c, _size) in enumerate(zip(C.split(sizes, -1), sizes)):
170 |             weight_mat = c[i]
171 |             idx_i = weight_mat.min(0)[1]
172 |             idx_j = torch.arange(_size).to(device)
173 |             indices.append((idx_i, idx_j))
174 | 
175 |         return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
176 | 
177 | 
178 | def build_matcher(args):
179 |     assert args.matcher_type in ['HungarianMatcher', 'SimpleMinsumMatcher'], "Unknown args.matcher_type: {}".format(args.matcher_type)
180 |     if args.matcher_type == 'HungarianMatcher':
181 |         return HungarianMatcher(
182 |             cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou,
183 |             focal_alpha=args.focal_alpha
184 |         )
185 |     elif args.matcher_type == 'SimpleMinsumMatcher':
186 |         return SimpleMinsumMatcher(
187 |             cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou,
188 |             focal_alpha=args.focal_alpha
189 |         )    
190 |     else:
191 |         raise NotImplementedError("Unknown args.matcher_type: {}".format(args.matcher_type))


--------------------------------------------------------------------------------
/models/dino/ops/functions/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | from .ms_deform_attn_func import MSDeformAttnFunction
10 | 
11 | 


--------------------------------------------------------------------------------
/models/dino/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | from __future__ import absolute_import
10 | from __future__ import print_function
11 | from __future__ import division
12 | 
13 | import torch
14 | import torch.nn.functional as F
15 | from torch.autograd import Function
16 | from torch.autograd.function import once_differentiable
17 | 
18 | import MultiScaleDeformableAttention as MSDA
19 | 
20 | 
21 | class MSDeformAttnFunction(Function):
22 |     @staticmethod
23 |     def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
24 |         ctx.im2col_step = im2col_step
25 |         output = MSDA.ms_deform_attn_forward(
26 |             value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
27 |         ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
28 |         return output
29 | 
30 |     @staticmethod
31 |     @once_differentiable
32 |     def backward(ctx, grad_output):
33 |         value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
34 |         grad_value, grad_sampling_loc, grad_attn_weight = \
35 |             MSDA.ms_deform_attn_backward(
36 |                 value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
37 | 
38 |         return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
39 | 
40 | 
41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
42 |     # for debug and test only,
43 |     # need to use cuda version instead
44 |     N_, S_, M_, D_ = value.shape
45 |     _, Lq_, M_, L_, P_, _ = sampling_locations.shape
46 |     value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
47 |     sampling_grids = 2 * sampling_locations - 1
48 |     sampling_value_list = []
49 |     for lid_, (H_, W_) in enumerate(value_spatial_shapes):
50 |         # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
51 |         value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
52 |         # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
53 |         sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
54 |         # N_*M_, D_, Lq_, P_
55 |         sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
56 |                                           mode='bilinear', padding_mode='zeros', align_corners=False)
57 |         sampling_value_list.append(sampling_value_l_)
58 |     # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
59 |     attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
60 |     output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
61 |     return output.transpose(1, 2).contiguous()
62 | 


--------------------------------------------------------------------------------
/models/dino/ops/make.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # ------------------------------------------------------------------------------------------------
 3 | # Deformable DETR
 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | # ------------------------------------------------------------------------------------------------
 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | # ------------------------------------------------------------------------------------------------
 9 | 
10 | 
11 | # TORCH_CUDA_ARCH_LIST="8.0" CUDA_HOME='/path/to/your/cuda/dir'  
12 | python setup.py build install
13 | 


--------------------------------------------------------------------------------
/models/dino/ops/modules/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | from .ms_deform_attn import MSDeformAttn
10 | 


--------------------------------------------------------------------------------
/models/dino/ops/modules/ms_deform_attn.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------------------------------
  2 | # Deformable DETR
  3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
  4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  5 | # ------------------------------------------------------------------------------------------------
  6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
  7 | # ------------------------------------------------------------------------------------------------
  8 | 
  9 | from __future__ import absolute_import
 10 | from __future__ import print_function
 11 | from __future__ import division
 12 | 
 13 | import warnings
 14 | import math
 15 | 
 16 | import torch
 17 | from torch import nn
 18 | import torch.nn.functional as F
 19 | from torch.nn.init import xavier_uniform_, constant_
 20 | 
 21 | from ..functions import MSDeformAttnFunction
 22 | 
 23 | 
 24 | def _is_power_of_2(n):
 25 |     if (not isinstance(n, int)) or (n < 0):
 26 |         raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
 27 |     return (n & (n-1) == 0) and n != 0
 28 | 
 29 | 
 30 | class MSDeformAttn(nn.Module):
 31 |     def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
 32 |         """
 33 |         Multi-Scale Deformable Attention Module
 34 |         :param d_model      hidden dimension
 35 |         :param n_levels     number of feature levels
 36 |         :param n_heads      number of attention heads
 37 |         :param n_points     number of sampling points per attention head per feature level
 38 |         """
 39 |         super().__init__()
 40 |         if d_model % n_heads != 0:
 41 |             raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
 42 |         _d_per_head = d_model // n_heads
 43 |         # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
 44 |         if not _is_power_of_2(_d_per_head):
 45 |             warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
 46 |                           "which is more efficient in our CUDA implementation.")
 47 | 
 48 |         self.im2col_step = 64
 49 | 
 50 |         self.d_model = d_model
 51 |         self.n_levels = n_levels
 52 |         self.n_heads = n_heads
 53 |         self.n_points = n_points
 54 | 
 55 |         self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
 56 |         self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
 57 |         self.value_proj = nn.Linear(d_model, d_model)
 58 |         self.output_proj = nn.Linear(d_model, d_model)
 59 | 
 60 |         self._reset_parameters()
 61 | 
 62 |     def _reset_parameters(self):
 63 |         constant_(self.sampling_offsets.weight.data, 0.)
 64 |         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
 65 |         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
 66 |         grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
 67 |         for i in range(self.n_points):
 68 |             grid_init[:, :, i, :] *= i + 1
 69 |         with torch.no_grad():
 70 |             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
 71 |         constant_(self.attention_weights.weight.data, 0.)
 72 |         constant_(self.attention_weights.bias.data, 0.)
 73 |         xavier_uniform_(self.value_proj.weight.data)
 74 |         constant_(self.value_proj.bias.data, 0.)
 75 |         xavier_uniform_(self.output_proj.weight.data)
 76 |         constant_(self.output_proj.bias.data, 0.)
 77 | 
 78 |     def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
 79 |         """
 80 |         :param query                       (N, Length_{query}, C)
 81 |         :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
 82 |                                         or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
 83 |         :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
 84 |         :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
 85 |         :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
 86 |         :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
 87 | 
 88 |         :return output                     (N, Length_{query}, C)
 89 |         """
 90 |         N, Len_q, _ = query.shape
 91 |         N, Len_in, _ = input_flatten.shape
 92 |         assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
 93 | 
 94 |         value = self.value_proj(input_flatten)
 95 |         if input_padding_mask is not None:
 96 |             value = value.masked_fill(input_padding_mask[..., None], float(0))
 97 |         value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
 98 |         sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
 99 |         attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
100 |         attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
101 |         # N, Len_q, n_heads, n_levels, n_points, 2
102 |         if reference_points.shape[-1] == 2:
103 |             offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
104 |             sampling_locations = reference_points[:, :, None, :, None, :] \
105 |                                  + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
106 |         elif reference_points.shape[-1] == 4:
107 |             sampling_locations = reference_points[:, :, None, :, None, :2] \
108 |                                  + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
109 |         else:
110 |             raise ValueError(
111 |                 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
112 | 
113 |         # for amp
114 |         if value.dtype == torch.float16:
115 |             # for mixed precision
116 |             output = MSDeformAttnFunction.apply(
117 |             value.to(torch.float32), input_spatial_shapes, input_level_start_index, sampling_locations.to(torch.float32), attention_weights, self.im2col_step)
118 |             output = output.to(torch.float16)
119 |             output = self.output_proj(output)
120 |             return output
121 | 
122 | 
123 |         output = MSDeformAttnFunction.apply(
124 |             value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
125 |         output = self.output_proj(output)
126 |         return output
127 | 


--------------------------------------------------------------------------------
/models/dino/ops/setup.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | import os
10 | import glob
11 | 
12 | import torch
13 | 
14 | from torch.utils.cpp_extension import CUDA_HOME
15 | from torch.utils.cpp_extension import CppExtension
16 | from torch.utils.cpp_extension import CUDAExtension
17 | 
18 | from setuptools import find_packages
19 | from setuptools import setup
20 | 
21 | requirements = ["torch", "torchvision"]
22 | 
23 | def get_extensions():
24 |     this_dir = os.path.dirname(os.path.abspath(__file__))
25 |     extensions_dir = os.path.join(this_dir, "src")
26 | 
27 |     main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
28 |     source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
29 |     source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
30 | 
31 |     sources = main_file + source_cpu
32 |     extension = CppExtension
33 |     extra_compile_args = {"cxx": []}
34 |     define_macros = []
35 | 
36 | 
37 | 
38 |     if torch.cuda.is_available() and CUDA_HOME is not None:
39 |         extension = CUDAExtension
40 |         sources += source_cuda
41 |         define_macros += [("WITH_CUDA", None)]
42 |         extra_compile_args["nvcc"] = [
43 |             "-DCUDA_HAS_FP16=1",
44 |             "-D__CUDA_NO_HALF_OPERATORS__",
45 |             "-D__CUDA_NO_HALF_CONVERSIONS__",
46 |             "-D__CUDA_NO_HALF2_OPERATORS__",
47 |         ]
48 |     else:
49 |         raise NotImplementedError('Cuda is not availabel')
50 | 
51 |     sources = [os.path.join(extensions_dir, s) for s in sources]
52 |     include_dirs = [extensions_dir]
53 |     ext_modules = [
54 |         extension(
55 |             "MultiScaleDeformableAttention",
56 |             sources,
57 |             include_dirs=include_dirs,
58 |             define_macros=define_macros,
59 |             extra_compile_args=extra_compile_args,
60 |         )
61 |     ]
62 |     return ext_modules
63 | 
64 | setup(
65 |     name="MultiScaleDeformableAttention",
66 |     version="1.0",
67 |     author="Weijie Su",
68 |     url="https://github.com/fundamentalvision/Deformable-DETR",
69 |     description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
70 |     packages=find_packages(exclude=("configs", "tests",)),
71 |     ext_modules=get_extensions(),
72 |     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
73 | )
74 | 


--------------------------------------------------------------------------------
/models/dino/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | #include <vector>
12 | 
13 | #include <ATen/ATen.h>
14 | #include <ATen/cuda/CUDAContext.h>
15 | 
16 | 
17 | at::Tensor
18 | ms_deform_attn_cpu_forward(
19 |     const at::Tensor &value, 
20 |     const at::Tensor &spatial_shapes,
21 |     const at::Tensor &level_start_index,
22 |     const at::Tensor &sampling_loc,
23 |     const at::Tensor &attn_weight,
24 |     const int im2col_step)
25 | {
26 |     AT_ERROR("Not implement on cpu");
27 | }
28 | 
29 | std::vector<at::Tensor>
30 | ms_deform_attn_cpu_backward(
31 |     const at::Tensor &value, 
32 |     const at::Tensor &spatial_shapes,
33 |     const at::Tensor &level_start_index,
34 |     const at::Tensor &sampling_loc,
35 |     const at::Tensor &attn_weight,
36 |     const at::Tensor &grad_output,
37 |     const int im2col_step)
38 | {
39 |     AT_ERROR("Not implement on cpu");
40 | }
41 | 
42 | 


--------------------------------------------------------------------------------
/models/dino/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | #pragma once
12 | #include <torch/extension.h>
13 | 
14 | at::Tensor
15 | ms_deform_attn_cpu_forward(
16 |     const at::Tensor &value, 
17 |     const at::Tensor &spatial_shapes,
18 |     const at::Tensor &level_start_index,
19 |     const at::Tensor &sampling_loc,
20 |     const at::Tensor &attn_weight,
21 |     const int im2col_step);
22 | 
23 | std::vector<at::Tensor>
24 | ms_deform_attn_cpu_backward(
25 |     const at::Tensor &value, 
26 |     const at::Tensor &spatial_shapes,
27 |     const at::Tensor &level_start_index,
28 |     const at::Tensor &sampling_loc,
29 |     const at::Tensor &attn_weight,
30 |     const at::Tensor &grad_output,
31 |     const int im2col_step);
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/models/dino/ops/src/cuda/ms_deform_attn_cuda.cu:
--------------------------------------------------------------------------------
  1 | /*!
  2 | **************************************************************************************************
  3 | * Deformable DETR
  4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
  5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  6 | **************************************************************************************************
  7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
  8 | **************************************************************************************************
  9 | */
 10 | 
 11 | #include <vector>
 12 | #include "cuda/ms_deform_im2col_cuda.cuh"
 13 | 
 14 | #include <ATen/ATen.h>
 15 | #include <ATen/cuda/CUDAContext.h>
 16 | #include <cuda.h>
 17 | #include <cuda_runtime.h>
 18 | 
 19 | 
 20 | at::Tensor ms_deform_attn_cuda_forward(
 21 |     const at::Tensor &value, 
 22 |     const at::Tensor &spatial_shapes,
 23 |     const at::Tensor &level_start_index,
 24 |     const at::Tensor &sampling_loc,
 25 |     const at::Tensor &attn_weight,
 26 |     const int im2col_step)
 27 | {
 28 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
 29 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
 30 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
 31 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
 32 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
 33 | 
 34 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
 35 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
 36 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
 37 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
 38 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
 39 | 
 40 |     const int batch = value.size(0);
 41 |     const int spatial_size = value.size(1);
 42 |     const int num_heads = value.size(2);
 43 |     const int channels = value.size(3);
 44 | 
 45 |     const int num_levels = spatial_shapes.size(0);
 46 | 
 47 |     const int num_query = sampling_loc.size(1);
 48 |     const int num_point = sampling_loc.size(4);
 49 | 
 50 |     const int im2col_step_ = std::min(batch, im2col_step);
 51 | 
 52 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
 53 |     
 54 |     auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
 55 | 
 56 |     const int batch_n = im2col_step_;
 57 |     auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
 58 |     auto per_value_size = spatial_size * num_heads * channels;
 59 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
 60 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
 61 |     for (int n = 0; n < batch/im2col_step_; ++n)
 62 |     {
 63 |         auto columns = output_n.select(0, n);
 64 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
 65 |             ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
 66 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
 67 |                 spatial_shapes.data<int64_t>(),
 68 |                 level_start_index.data<int64_t>(),
 69 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
 70 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
 71 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
 72 |                 columns.data<scalar_t>());
 73 | 
 74 |         }));
 75 |     }
 76 | 
 77 |     output = output.view({batch, num_query, num_heads*channels});
 78 | 
 79 |     return output;
 80 | }
 81 | 
 82 | 
 83 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
 84 |     const at::Tensor &value, 
 85 |     const at::Tensor &spatial_shapes,
 86 |     const at::Tensor &level_start_index,
 87 |     const at::Tensor &sampling_loc,
 88 |     const at::Tensor &attn_weight,
 89 |     const at::Tensor &grad_output,
 90 |     const int im2col_step)
 91 | {
 92 | 
 93 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
 94 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
 95 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
 96 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
 97 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
 98 |     AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
 99 | 
100 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
101 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
102 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
103 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
104 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
105 |     AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
106 | 
107 |     const int batch = value.size(0);
108 |     const int spatial_size = value.size(1);
109 |     const int num_heads = value.size(2);
110 |     const int channels = value.size(3);
111 | 
112 |     const int num_levels = spatial_shapes.size(0);
113 | 
114 |     const int num_query = sampling_loc.size(1);
115 |     const int num_point = sampling_loc.size(4);
116 | 
117 |     const int im2col_step_ = std::min(batch, im2col_step);
118 | 
119 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
120 | 
121 |     auto grad_value = at::zeros_like(value);
122 |     auto grad_sampling_loc = at::zeros_like(sampling_loc);
123 |     auto grad_attn_weight = at::zeros_like(attn_weight);
124 | 
125 |     const int batch_n = im2col_step_;
126 |     auto per_value_size = spatial_size * num_heads * channels;
127 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
128 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
129 |     auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
130 |     
131 |     for (int n = 0; n < batch/im2col_step_; ++n)
132 |     {
133 |         auto grad_output_g = grad_output_n.select(0, n);
134 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
135 |             ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
136 |                                     grad_output_g.data<scalar_t>(),
137 |                                     value.data<scalar_t>() + n * im2col_step_ * per_value_size,
138 |                                     spatial_shapes.data<int64_t>(),
139 |                                     level_start_index.data<int64_t>(),
140 |                                     sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
141 |                                     attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
142 |                                     batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
143 |                                     grad_value.data<scalar_t>() +  n * im2col_step_ * per_value_size,
144 |                                     grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
145 |                                     grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
146 | 
147 |         }));
148 |     }
149 | 
150 |     return {
151 |         grad_value, grad_sampling_loc, grad_attn_weight
152 |     };
153 | }


--------------------------------------------------------------------------------
/models/dino/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | #pragma once
12 | #include <torch/extension.h>
13 | 
14 | at::Tensor ms_deform_attn_cuda_forward(
15 |     const at::Tensor &value, 
16 |     const at::Tensor &spatial_shapes,
17 |     const at::Tensor &level_start_index,
18 |     const at::Tensor &sampling_loc,
19 |     const at::Tensor &attn_weight,
20 |     const int im2col_step);
21 | 
22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
23 |     const at::Tensor &value, 
24 |     const at::Tensor &spatial_shapes,
25 |     const at::Tensor &level_start_index,
26 |     const at::Tensor &sampling_loc,
27 |     const at::Tensor &attn_weight,
28 |     const at::Tensor &grad_output,
29 |     const int im2col_step);
30 | 
31 | 


--------------------------------------------------------------------------------
/models/dino/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | #pragma once
12 | 
13 | #include "cpu/ms_deform_attn_cpu.h"
14 | 
15 | #ifdef WITH_CUDA
16 | #include "cuda/ms_deform_attn_cuda.h"
17 | #endif
18 | 
19 | 
20 | at::Tensor
21 | ms_deform_attn_forward(
22 |     const at::Tensor &value, 
23 |     const at::Tensor &spatial_shapes,
24 |     const at::Tensor &level_start_index,
25 |     const at::Tensor &sampling_loc,
26 |     const at::Tensor &attn_weight,
27 |     const int im2col_step)
28 | {
29 |     if (value.type().is_cuda())
30 |     {
31 | #ifdef WITH_CUDA
32 |         return ms_deform_attn_cuda_forward(
33 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34 | #else
35 |         AT_ERROR("Not compiled with GPU support");
36 | #endif
37 |     }
38 |     AT_ERROR("Not implemented on the CPU");
39 | }
40 | 
41 | std::vector<at::Tensor>
42 | ms_deform_attn_backward(
43 |     const at::Tensor &value, 
44 |     const at::Tensor &spatial_shapes,
45 |     const at::Tensor &level_start_index,
46 |     const at::Tensor &sampling_loc,
47 |     const at::Tensor &attn_weight,
48 |     const at::Tensor &grad_output,
49 |     const int im2col_step)
50 | {
51 |     if (value.type().is_cuda())
52 |     {
53 | #ifdef WITH_CUDA
54 |         return ms_deform_attn_cuda_backward(
55 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56 | #else
57 |         AT_ERROR("Not compiled with GPU support");
58 | #endif
59 |     }
60 |     AT_ERROR("Not implemented on the CPU");
61 | }
62 | 
63 | 


--------------------------------------------------------------------------------
/models/dino/ops/src/vision.cpp:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | #include "ms_deform_attn.h"
12 | 
13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
14 |   m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
15 |   m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
16 | }
17 | 


--------------------------------------------------------------------------------
/models/dino/ops/test.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | from __future__ import absolute_import
10 | from __future__ import print_function
11 | from __future__ import division
12 | 
13 | import time
14 | import torch
15 | import torch.nn as nn
16 | from torch.autograd import gradcheck
17 | 
18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
19 | 
20 | 
21 | N, M, D = 1, 2, 2
22 | Lq, L, P = 2, 2, 2
23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
25 | S = sum([(H*W).item() for H, W in shapes])
26 | 
27 | 
28 | torch.manual_seed(3)
29 | 
30 | 
31 | @torch.no_grad()
32 | def check_forward_equal_with_pytorch_double():
33 |     value = torch.rand(N, S, M, D).cuda() * 0.01
34 |     sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
35 |     attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
36 |     attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
37 |     im2col_step = 2
38 |     output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
39 |     output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
40 |     fwdok = torch.allclose(output_cuda, output_pytorch)
41 |     max_abs_err = (output_cuda - output_pytorch).abs().max()
42 |     max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
43 | 
44 |     print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
45 | 
46 | 
47 | @torch.no_grad()
48 | def check_forward_equal_with_pytorch_float():
49 |     value = torch.rand(N, S, M, D).cuda() * 0.01
50 |     sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
51 |     attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
52 |     attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
53 |     im2col_step = 2
54 |     output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
55 |     output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
56 |     fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
57 |     max_abs_err = (output_cuda - output_pytorch).abs().max()
58 |     max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
59 | 
60 |     print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
61 | 
62 | 
63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
64 | 
65 |     value = torch.rand(N, S, M, channels).cuda() * 0.01
66 |     sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
67 |     attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
68 |     attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
69 |     im2col_step = 2
70 |     func = MSDeformAttnFunction.apply
71 | 
72 |     value.requires_grad = grad_value
73 |     sampling_locations.requires_grad = grad_sampling_loc
74 |     attention_weights.requires_grad = grad_attn_weight
75 | 
76 |     gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
77 | 
78 |     print(f'* {gradok} check_gradient_numerical(D={channels})')
79 | 
80 | 
81 | if __name__ == '__main__':
82 |     check_forward_equal_with_pytorch_double()
83 |     check_forward_equal_with_pytorch_float()
84 | 
85 |     for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
86 |         check_gradient_numerical(channels, True, True, True)
87 | 
88 | 
89 | 
90 | 


--------------------------------------------------------------------------------
/models/dino/position_encoding.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------
  2 | # DINO
  3 | # Copyright (c) 2022 IDEA. All Rights Reserved.
  4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  5 | # ------------------------------------------------------------------------
  6 | # Conditional DETR
  7 | # Copyright (c) 2021 Microsoft. All Rights Reserved.
  8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  9 | # ------------------------------------------------------------------------
 10 | # Copied from DETR (https://github.com/facebookresearch/detr)
 11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
 12 | # ------------------------------------------------------------------------
 13 | 
 14 | """
 15 | Various positional encodings for the transformer.
 16 | """
 17 | import math
 18 | import torch
 19 | from torch import nn
 20 | 
 21 | from util.misc import NestedTensor
 22 | 
 23 | 
 24 | class PositionEmbeddingSine(nn.Module):
 25 |     """
 26 |     This is a more standard version of the position embedding, very similar to the one
 27 |     used by the Attention is all you need paper, generalized to work on images.
 28 |     """
 29 |     def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
 30 |         super().__init__()
 31 |         self.num_pos_feats = num_pos_feats
 32 |         self.temperature = temperature
 33 |         self.normalize = normalize
 34 |         if scale is not None and normalize is False:
 35 |             raise ValueError("normalize should be True if scale is passed")
 36 |         if scale is None:
 37 |             scale = 2 * math.pi
 38 |         self.scale = scale
 39 | 
 40 |     def forward(self, tensor_list: NestedTensor):
 41 |         x = tensor_list.tensors
 42 |         mask = tensor_list.mask
 43 |         assert mask is not None
 44 |         not_mask = ~mask
 45 |         y_embed = not_mask.cumsum(1, dtype=torch.float32)
 46 |         x_embed = not_mask.cumsum(2, dtype=torch.float32)
 47 |         if self.normalize:
 48 |             eps = 1e-6
 49 |             y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
 50 |             x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
 51 | 
 52 |         dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
 53 |         dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
 54 | 
 55 |         pos_x = x_embed[:, :, :, None] / dim_t
 56 |         pos_y = y_embed[:, :, :, None] / dim_t
 57 |         pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
 58 |         pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
 59 |         pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
 60 |         return pos
 61 | 
 62 | class PositionEmbeddingSineHW(nn.Module):
 63 |     """
 64 |     This is a more standard version of the position embedding, very similar to the one
 65 |     used by the Attention is all you need paper, generalized to work on images.
 66 |     """
 67 |     def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None):
 68 |         super().__init__()
 69 |         self.num_pos_feats = num_pos_feats
 70 |         self.temperatureH = temperatureH
 71 |         self.temperatureW = temperatureW
 72 |         self.normalize = normalize
 73 |         if scale is not None and normalize is False:
 74 |             raise ValueError("normalize should be True if scale is passed")
 75 |         if scale is None:
 76 |             scale = 2 * math.pi
 77 |         self.scale = scale
 78 | 
 79 |     def forward(self, tensor_list: NestedTensor):
 80 |         x = tensor_list.tensors
 81 |         mask = tensor_list.mask
 82 |         assert mask is not None
 83 |         not_mask = ~mask
 84 |         y_embed = not_mask.cumsum(1, dtype=torch.float32)
 85 |         x_embed = not_mask.cumsum(2, dtype=torch.float32)
 86 | 
 87 | 
 88 | 
 89 |         if self.normalize:
 90 |             eps = 1e-6
 91 |             y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
 92 |             x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
 93 | 
 94 |         dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
 95 |         dim_tx = self.temperatureW ** (2 * (dim_tx // 2) / self.num_pos_feats)
 96 |         pos_x = x_embed[:, :, :, None] / dim_tx
 97 | 
 98 |         dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
 99 |         dim_ty = self.temperatureH ** (2 * (dim_ty // 2) / self.num_pos_feats)
100 |         pos_y = y_embed[:, :, :, None] / dim_ty
101 | 
102 |         pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
103 |         pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
104 |         pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
105 | 
106 | 
107 | 
108 |         return pos
109 | 
110 | class PositionEmbeddingLearned(nn.Module):
111 |     """
112 |     Absolute pos embedding, learned.
113 |     """
114 |     def __init__(self, num_pos_feats=256):
115 |         super().__init__()
116 |         self.row_embed = nn.Embedding(50, num_pos_feats)
117 |         self.col_embed = nn.Embedding(50, num_pos_feats)
118 |         self.reset_parameters()
119 | 
120 |     def reset_parameters(self):
121 |         nn.init.uniform_(self.row_embed.weight)
122 |         nn.init.uniform_(self.col_embed.weight)
123 | 
124 |     def forward(self, tensor_list: NestedTensor):
125 |         x = tensor_list.tensors
126 |         h, w = x.shape[-2:]
127 |         i = torch.arange(w, device=x.device)
128 |         j = torch.arange(h, device=x.device)
129 |         x_emb = self.col_embed(i)
130 |         y_emb = self.row_embed(j)
131 |         pos = torch.cat([
132 |             x_emb.unsqueeze(0).repeat(h, 1, 1),
133 |             y_emb.unsqueeze(1).repeat(1, w, 1),
134 |         ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
135 |         return pos
136 | 
137 | 
138 | def build_position_encoding(args):
139 |     N_steps = args.hidden_dim // 2
140 |     if args.position_embedding in ('v2', 'sine'):
141 |         # TODO find a better way of exposing other arguments
142 |         position_embedding = PositionEmbeddingSineHW(
143 |             N_steps, 
144 |             temperatureH=args.pe_temperatureH,
145 |             temperatureW=args.pe_temperatureW,
146 |             normalize=True
147 |         )
148 |     elif args.position_embedding in ('v3', 'learned'):
149 |         position_embedding = PositionEmbeddingLearned(N_steps)
150 |     else:
151 |         raise ValueError(f"not supported {args.position_embedding}")
152 | 
153 |     return position_embedding
154 | 


--------------------------------------------------------------------------------
/models/dino/utils.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------
  2 | # DINO
  3 | # Copyright (c) 2022 IDEA. All Rights Reserved.
  4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  5 | # ------------------------------------------------------------------------
  6 | 
  7 | import torch
  8 | from torch import nn, Tensor
  9 | 
 10 | import math
 11 | import torch.nn.functional as F
 12 | from torch import nn
 13 | 
 14 | 
 15 | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor, learnedwh=None):
 16 |     """
 17 |     Input:
 18 |         - memory: bs, \sum{hw}, d_model
 19 |         - memory_padding_mask: bs, \sum{hw}
 20 |         - spatial_shapes: nlevel, 2
 21 |         - learnedwh: 2
 22 |     Output:
 23 |         - output_memory: bs, \sum{hw}, d_model
 24 |         - output_proposals: bs, \sum{hw}, 4
 25 |     """
 26 |     N_, S_, C_ = memory.shape
 27 |     base_scale = 4.0
 28 |     proposals = []
 29 |     _cur = 0
 30 |     for lvl, (H_, W_) in enumerate(spatial_shapes):
 31 |         mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
 32 |         valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
 33 |         valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
 34 | 
 35 |         grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
 36 |                                         torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
 37 |         grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2
 38 | 
 39 |         scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
 40 |         grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
 41 | 
 42 |         if learnedwh is not None:
 43 |             wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0 ** lvl)
 44 |         else:
 45 |             wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
 46 | 
 47 |         proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
 48 |         proposals.append(proposal)
 49 |         _cur += (H_ * W_)
 50 | 
 51 |     output_proposals = torch.cat(proposals, 1)
 52 |     output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
 53 |     output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid
 54 |     output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
 55 |     output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
 56 | 
 57 |     output_memory = memory
 58 |     output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
 59 |     output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
 60 | 
 61 |     return output_memory, output_proposals
 62 | 
 63 | 
 64 | class RandomBoxPerturber():
 65 |     def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None:
 66 |         self.noise_scale = torch.Tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale])
 67 | 
 68 |     def __call__(self, refanchors: Tensor) -> Tensor:
 69 |         nq, bs, query_dim = refanchors.shape
 70 |         device = refanchors.device
 71 | 
 72 |         noise_raw = torch.rand_like(refanchors)
 73 |         noise_scale = self.noise_scale.to(device)[:query_dim]
 74 | 
 75 |         new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale)
 76 |         return new_refanchors.clamp_(0, 1)
 77 | 
 78 | 
 79 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
 80 |     """
 81 |     Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
 82 |     Args:
 83 |         inputs: A float tensor of arbitrary shape.
 84 |                 The predictions for each example.
 85 |         targets: A float tensor with the same shape as inputs. Stores the binary
 86 |                  classification label for each element in inputs
 87 |                 (0 for the negative class and 1 for the positive class).
 88 |         alpha: (optional) Weighting factor in range (0,1) to balance
 89 |                 positive vs negative examples. Default = -1 (no weighting).
 90 |         gamma: Exponent of the modulating factor (1 - p_t) to
 91 |                balance easy vs hard examples.
 92 |     Returns:
 93 |         Loss tensor
 94 |     """
 95 |     prob = inputs.sigmoid()
 96 |     ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
 97 |     p_t = prob * targets + (1 - prob) * (1 - targets)
 98 |     loss = ce_loss * ((1 - p_t) ** gamma)
 99 | 
100 |     if alpha >= 0:
101 |         alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
102 |         loss = alpha_t * loss
103 | 
104 |     return loss.mean(1).sum() / num_boxes
105 | 
106 | 
107 | class MLP(nn.Module):
108 |     """ Very simple multi-layer perceptron (also called FFN)"""
109 | 
110 |     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
111 |         super().__init__()
112 |         self.num_layers = num_layers
113 |         h = [hidden_dim] * (num_layers - 1)
114 |         self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
115 | 
116 |     def forward(self, x):
117 |         for i, layer in enumerate(self.layers):
118 |             x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
119 |         return x
120 | 
121 | 
122 | def _get_activation_fn(activation, d_model=256, batch_dim=0):
123 |     """Return an activation function given a string"""
124 |     if activation == "relu":
125 |         return F.relu
126 |     if activation == "gelu":
127 |         return F.gelu
128 |     if activation == "glu":
129 |         return F.glu
130 |     if activation == "prelu":
131 |         return nn.PReLU()
132 |     if activation == "selu":
133 |         return F.selu
134 | 
135 |     raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
136 | 
137 | 
138 | def gen_sineembed_for_position(pos_tensor):
139 |     # n_query, bs, _ = pos_tensor.size()
140 |     # sineembed_tensor = torch.zeros(n_query, bs, 256)
141 |     scale = 2 * math.pi
142 |     dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
143 |     dim_t = 10000 ** (2 * (dim_t // 2) / 128)
144 |     x_embed = pos_tensor[:, :, 0] * scale
145 |     y_embed = pos_tensor[:, :, 1] * scale
146 |     pos_x = x_embed[:, :, None] / dim_t
147 |     pos_y = y_embed[:, :, None] / dim_t
148 |     pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
149 |     pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
150 |     if pos_tensor.size(-1) == 2:
151 |         pos = torch.cat((pos_y, pos_x), dim=2)
152 |     elif pos_tensor.size(-1) == 4:
153 |         w_embed = pos_tensor[:, :, 2] * scale
154 |         pos_w = w_embed[:, :, None] / dim_t
155 |         pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
156 | 
157 |         h_embed = pos_tensor[:, :, 3] * scale
158 |         pos_h = h_embed[:, :, None] / dim_t
159 |         pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
160 | 
161 |         pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
162 |     else:
163 |         raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
164 |     return pos


--------------------------------------------------------------------------------
/models/registry.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Yihao Chen
 3 | # @Date:   2021-08-16 16:03:17
 4 | # @Last Modified by:   Shilong Liu
 5 | # @Last Modified time: 2022-01-23 15:26
 6 | # modified from mmcv
 7 | 
 8 | import inspect
 9 | from functools import partial
10 | 
11 | 
12 | class Registry(object):
13 | 
14 |     def __init__(self, name):
15 |         self._name = name
16 |         self._module_dict = dict()
17 | 
18 |     def __repr__(self):
19 |         format_str = self.__class__.__name__ + '(name={}, items={})'.format(
20 |             self._name, list(self._module_dict.keys()))
21 |         return format_str
22 | 
23 |     def __len__(self):
24 |         return len(self._module_dict)
25 | 
26 |     @property
27 |     def name(self):
28 |         return self._name
29 | 
30 |     @property
31 |     def module_dict(self):
32 |         return self._module_dict
33 | 
34 |     def get(self, key):
35 |         return self._module_dict.get(key, None)
36 | 
37 |     def registe_with_name(self, module_name=None, force=False):
38 |         return partial(self.register, module_name=module_name, force=force)
39 | 
40 |     def register(self, module_build_function, module_name=None, force=False):
41 |         """Register a module build function.
42 |         Args:
43 |             module (:obj:`nn.Module`): Module to be registered.
44 |         """
45 |         if not inspect.isfunction(module_build_function):
46 |             raise TypeError('module_build_function must be a function, but got {}'.format(
47 |                 type(module_build_function)))
48 |         if module_name is None:
49 |             module_name = module_build_function.__name__
50 |         if not force and module_name in self._module_dict:
51 |             raise KeyError('{} is already registered in {}'.format(
52 |                 module_name, self.name))
53 |         self._module_dict[module_name] = module_build_function
54 | 
55 |         return module_build_function
56 | 
57 | MODULE_BUILD_FUNCS = Registry('model build functions')
58 | 
59 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | cython
 2 | git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI&egg=pycocotools
 3 | submitit
 4 | torch>=1.5.0
 5 | torchvision>=0.6.0
 6 | git+https://github.com/cocodataset/panopticapi.git#egg=panopticapi
 7 | scipy
 8 | termcolor
 9 | addict
10 | yapf
11 | timm


--------------------------------------------------------------------------------
/run_with_submitit.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | """
  3 | A script to run multinode training with submitit.
  4 | """
  5 | import argparse
  6 | import os, sys
  7 | import uuid
  8 | from pathlib import Path
  9 | 
 10 | import main as detection
 11 | import submitit
 12 | 
 13 | 
 14 | def parse_args():
 15 |     detection_parser = detection.get_args_parser()
 16 |     parser = argparse.ArgumentParser("Submitit for detection", parents=[detection_parser])
 17 |     parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node")
 18 |     parser.add_argument("--nodes", default=1, type=int, help="Number of nodes to request")
 19 |     parser.add_argument("--timeout", default=60, type=int, help="Duration of the job")
 20 |     parser.add_argument("--cpus_per_task", default=16, type=int, help="Duration of the job")
 21 |     parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.")
 22 |     parser.add_argument("--job_name", type=str, help="Job name.")
 23 |     parser.add_argument("--qos", type=str, default=None, help="specify preemptive QOS.")
 24 |     parser.add_argument("--requeue", action='store_true', help="job requeue if preempted.")
 25 |     parser.add_argument("--mail_type", type=str, default='ALL', help=" send email when job begins, ends, fails or preempted.")
 26 |     parser.add_argument("--mail_user", type=str, default='', help=" email address.")
 27 |     # refer to https://slurm.schedmd.com/sbatch.html & \
 28 |     # https://github.com/facebookincubator/submitit/blob/11d8f87f785669e8a01aa9773a107f9180a63b09/submitit/slurm/slurm.py \
 29 |     # for more details about parameters of slurm.
 30 |     return parser.parse_args()
 31 | 
 32 | 
 33 | def get_shared_folder() -> Path:
 34 |     user = os.getenv("USER")
 35 |     if Path("/comp_robot").is_dir():
 36 |         p = Path(f"/comp_robot/{user}/experiments")
 37 |         p.mkdir(exist_ok=True)
 38 |         return p
 39 |     raise RuntimeError("No shared folder available")
 40 | 
 41 | 
 42 | def get_init_file():
 43 |     # Init file must not exist, but it's parent dir must exist.
 44 |     os.makedirs(str(get_shared_folder()), exist_ok=True)
 45 |     init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init"
 46 |     if init_file.exists():
 47 |         os.remove(str(init_file))
 48 |     return init_file
 49 | 
 50 | 
 51 | class Trainer(object):
 52 |     def __init__(self, args):
 53 |         self.args = args
 54 | 
 55 |     def __call__(self):
 56 |         self._setup_gpu_args()
 57 |         detection.main(self.args)
 58 | 
 59 |     def checkpoint(self):
 60 |         import os
 61 |         import submitit
 62 | 
 63 |         checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth")
 64 |         if os.path.exists(checkpoint_file):
 65 |             self.args.resume = checkpoint_file
 66 |         print("Requeuing ", self.args)
 67 |         empty_trainer = type(self)(self.args)
 68 |         return submitit.helpers.DelayedSubmission(empty_trainer)
 69 | 
 70 |     def _setup_gpu_args(self):
 71 |         import submitit
 72 | 
 73 |         job_env = submitit.JobEnvironment()
 74 |         self.args.output_dir = self.args.job_dir
 75 |         self.args.output_dir = str(self.args.output_dir).replace("%j", str(job_env.job_id))
 76 |         self.args.gpu = job_env.local_rank
 77 |         self.args.rank = job_env.global_rank
 78 |         self.args.world_size = job_env.num_tasks
 79 |         print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
 80 | 
 81 | 
 82 | 
 83 | def main():
 84 |     args = parse_args()
 85 |     args.commad_txt = "Command: "+' '.join(sys.argv)
 86 |     if args.job_dir == "":
 87 |         raise ValueError("You must set job_dir mannually.")
 88 | 
 89 |     # Note that the folder will depend on the job_id, to easily track experiments
 90 |     executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)
 91 | 
 92 |     # cluster setup is defined by environment variables
 93 |     num_gpus_per_node = args.ngpus
 94 |     nodes = args.nodes
 95 |     timeout_min = args.timeout
 96 |     qos = args.qos
 97 | 
 98 |     additional_parameters = {
 99 |         'mail-user': args.mail_user,
100 |         'mail-type': args.mail_type,
101 |     }
102 |     if args.requeue:
103 |         additional_parameters['requeue'] = args.requeue
104 | 
105 | 
106 |     executor.update_parameters(
107 |         mem_gb=50 * num_gpus_per_node,
108 |         gpus_per_node=num_gpus_per_node,
109 |         tasks_per_node=num_gpus_per_node,  # one task per GPU
110 |         cpus_per_task=16,
111 |         nodes=nodes,
112 |         timeout_min=timeout_min,  # max is 60 * 72
113 |         qos=qos,
114 |         slurm_additional_parameters=additional_parameters
115 |     )
116 | 
117 |     executor.update_parameters(name=args.job_name)
118 |     args.dist_url = get_init_file().as_uri()
119 | 
120 |     # run and submit
121 |     trainer = Trainer(args)
122 |     job = executor.submit(trainer)
123 | 
124 |     print("Submitted job_id:", job.job_id)
125 | 
126 | 
127 | if __name__ == "__main__":
128 |     main()
129 | 


--------------------------------------------------------------------------------
/scripts/DINO_eval.sh:
--------------------------------------------------------------------------------
 1 | coco_path=$1
 2 | checkpoint=$2
 3 | python main.py \
 4 |   --output_dir logs/DINO/R50-MS4-%j \
 5 | 	-c config/DINO/DINO_4scale.py --coco_path $coco_path  \
 6 | 	--eval --resume $checkpoint \
 7 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
 8 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
 9 | 	dn_box_noise_scale=1.0
10 | 


--------------------------------------------------------------------------------
/scripts/DINO_eval_dist.sh:
--------------------------------------------------------------------------------
 1 | coco_path=$1
 2 | checkpoint=$2
 3 | python -m torch.distributed.launch --nproc_per_node=8 main.py \
 4 |   --output_dir logs/DINO/R50-MS4-%j \
 5 | 	-c config/DINO/DINO_4scale.py --coco_path $coco_path  \
 6 | 	--eval --resume $checkpoint \
 7 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
 8 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
 9 | 	dn_box_noise_scale=1.0
10 | 


--------------------------------------------------------------------------------
/scripts/DINO_eval_submitit.sh:
--------------------------------------------------------------------------------
 1 | coco_path=$1
 2 | checkpoint=$2
 3 | python run_with_submitit.py --timeout 3000 --job_name DINO \
 4 | 	--job_dir logs/DINO/R50-MS4-%j --ngpus 8 --nodes 1 \
 5 | 	-c config/DINO/DINO_4scale.py --coco_path $coco_path  \
 6 | 	--eval --resume $checkpoint \
 7 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
 8 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
 9 | 	dn_box_noise_scale=1.0
10 | 


--------------------------------------------------------------------------------
/scripts/DINO_eval_submitit_5scale.sh:
--------------------------------------------------------------------------------
 1 | coco_path=$1
 2 | checkpoint=$2
 3 | python run_with_submitit.py --timeout 3000 --job_name DINO \
 4 | 	--job_dir logs/DINO/R50-MS5-%j --ngpus 8 --nodes 1 \
 5 | 	-c config/DINO/DINO_5scale.py --coco_path $coco_path  \
 6 | 	--eval --resume $checkpoint \
 7 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
 8 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
 9 | 	dn_box_noise_scale=1.0
10 | 


--------------------------------------------------------------------------------
/scripts/DINO_train.sh:
--------------------------------------------------------------------------------
1 | coco_path=$1
2 | python main.py \
3 | 	--output_dir logs/DINO/R50-MS4 -c config/DINO/DINO_4scale.py --coco_path $coco_path \
4 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
5 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
6 | 	dn_box_noise_scale=1.0
7 | 


--------------------------------------------------------------------------------
/scripts/DINO_train_convnext.sh:
--------------------------------------------------------------------------------
1 | coco_path=$1
2 | backbone_dir=$2
3 | export CUDA_VISIBLE_DEVICES=$3 && python main.py \
4 | 	--output_dir logs/DINO/R50-MS4 -c config/DINO/DINO_4scale_convnext.py --coco_path $coco_path \
5 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
6 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
7 | 	dn_box_noise_scale=1.0 backbone_dir=$backbone_dir


--------------------------------------------------------------------------------
/scripts/DINO_train_dist.sh:
--------------------------------------------------------------------------------
1 | coco_path=$1
2 | python -m torch.distributed.launch --nproc_per_node=8 main.py \
3 | 	--output_dir logs/DINO/R50-MS4 -c config/DINO/DINO_4scale.py --coco_path $coco_path \
4 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
5 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
6 | 	dn_box_noise_scale=1.0
7 | 


--------------------------------------------------------------------------------
/scripts/DINO_train_submitit.sh:
--------------------------------------------------------------------------------
1 | coco_path=$1
2 | python run_with_submitit.py --timeout 3000 --job_name DINO \
3 | 	--job_dir logs/DINO/R50-MS4-%j --ngpus 8 --nodes 1 \
4 | 	-c config/DINO/DINO_4scale.py --coco_path $coco_path \
5 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
6 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
7 | 	dn_box_noise_scale=1.0
8 | 


--------------------------------------------------------------------------------
/scripts/DINO_train_submitit_5scale.sh:
--------------------------------------------------------------------------------
1 | coco_path=$1
2 | python run_with_submitit.py --timeout 3000 --job_name DINO \
3 | 	--job_dir logs/DINO/R50-MS5-%j --ngpus 8 --nodes 2 \
4 | 	-c config/DINO/DINO_5scale.py --coco_path $coco_path \
5 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
6 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
7 | 	dn_box_noise_scale=1.0
8 | 


--------------------------------------------------------------------------------
/scripts/DINO_train_submitit_convnext.sh:
--------------------------------------------------------------------------------
1 | coco_path=$1
2 | backbone_dir=$2
3 | python run_with_submitit.py --timeout 3000 --job_name DINO \
4 | 	--job_dir logs/DINO/R50-MS4-%j --ngpus 8 --nodes 1 \
5 | 	-c config/DINO/DINO_4scale_convnext.py --coco_path $coco_path \
6 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
7 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
8 | 	dn_box_noise_scale=1.0 backbone_dir=$backbone_dir
9 | 


--------------------------------------------------------------------------------
/scripts/DINO_train_submitit_swin.sh:
--------------------------------------------------------------------------------
1 | coco_path=$1
2 | backbone_dir=$2
3 | python run_with_submitit.py --timeout 3000 --job_name DINO \
4 | 	--job_dir logs/DINO/R50-MS4-%j --ngpus 8 --nodes 1 \
5 | 	-c config/DINO/DINO_4scale_swin.py --coco_path $coco_path \
6 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
7 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
8 | 	dn_box_noise_scale=1.0 backbone_dir=$backbone_dir
9 | 


--------------------------------------------------------------------------------
/scripts/DINO_train_swin.sh:
--------------------------------------------------------------------------------
1 | coco_path=$1
2 | backbone_dir=$2
3 | export CUDA_VISIBLE_DEVICES=$3 && python main.py \
4 | 	--output_dir logs/DINO/R50-MS4 -c config/DINO/DINO_4scale_swin.py --coco_path $coco_path \
5 | 	--options dn_scalar=100 embed_init_tgt=TRUE \
6 | 	dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
7 | 	dn_box_noise_scale=1.0 backbone_dir=$backbone_dir
8 | 


--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
 1 | We provide a scirpt to calculate model size, GFLOPS, and FPS.
 2 | 
 3 | An example to use it:
 4 | ```bash
 5 | python tools/benchmark.py \
 6 |     --output_dir logs/test_flops \
 7 |     -c config/DINO/DINO_4scale.py \
 8 |     --options batch_size=1 \
 9 |     --coco_path /path/to/your/coco/dir
10 | ```
11 | 


--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 | 


--------------------------------------------------------------------------------
/util/box_loss.py:
--------------------------------------------------------------------------------
  1 | # borrow from https://github.com/Zzh-tju/CIoU/blob/master/layers/modules/multibox_loss.py
  2 | 
  3 | import torch, math
  4 | 
  5 | 
  6 | 
  7 | def ciou(bboxes1, bboxes2):
  8 |     bboxes1 = torch.sigmoid(bboxes1)
  9 |     bboxes2 = torch.sigmoid(bboxes2)
 10 |     rows = bboxes1.shape[0]
 11 |     cols = bboxes2.shape[0]
 12 |     cious = torch.zeros((rows, cols))
 13 |     if rows * cols == 0:
 14 |         return cious
 15 |     exchange = False
 16 |     if bboxes1.shape[0] > bboxes2.shape[0]:
 17 |         bboxes1, bboxes2 = bboxes2, bboxes1
 18 |         cious = torch.zeros((cols, rows))
 19 |         exchange = True
 20 |     w1 = torch.exp(bboxes1[:, 2])
 21 |     h1 = torch.exp(bboxes1[:, 3])
 22 |     w2 = torch.exp(bboxes2[:, 2])
 23 |     h2 = torch.exp(bboxes2[:, 3])
 24 |     area1 = w1 * h1
 25 |     area2 = w2 * h2
 26 |     center_x1 = bboxes1[:, 0]
 27 |     center_y1 = bboxes1[:, 1]
 28 |     center_x2 = bboxes2[:, 0]
 29 |     center_y2 = bboxes2[:, 1]
 30 | 
 31 |     inter_l = torch.max(center_x1 - w1 / 2,center_x2 - w2 / 2)
 32 |     inter_r = torch.min(center_x1 + w1 / 2,center_x2 + w2 / 2)
 33 |     inter_t = torch.max(center_y1 - h1 / 2,center_y2 - h2 / 2)
 34 |     inter_b = torch.min(center_y1 + h1 / 2,center_y2 + h2 / 2)
 35 |     inter_area = torch.clamp((inter_r - inter_l),min=0) * torch.clamp((inter_b - inter_t),min=0)
 36 | 
 37 |     c_l = torch.min(center_x1 - w1 / 2,center_x2 - w2 / 2)
 38 |     c_r = torch.max(center_x1 + w1 / 2,center_x2 + w2 / 2)
 39 |     c_t = torch.min(center_y1 - h1 / 2,center_y2 - h2 / 2)
 40 |     c_b = torch.max(center_y1 + h1 / 2,center_y2 + h2 / 2)
 41 | 
 42 |     inter_diag = (center_x2 - center_x1)**2 + (center_y2 - center_y1)**2
 43 |     c_diag = torch.clamp((c_r - c_l),min=0)**2 + torch.clamp((c_b - c_t),min=0)**2
 44 | 
 45 |     union = area1+area2-inter_area
 46 |     u = (inter_diag) / c_diag
 47 |     iou = inter_area / union
 48 |     v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(w2 / h2) - torch.atan(w1 / h1)), 2)
 49 |     with torch.no_grad():
 50 |         S = (iou>0.5).float()
 51 |         alpha= S*v/(1-iou+v)
 52 |     cious = iou - u - alpha * v
 53 |     cious = torch.clamp(cious,min=-1.0,max = 1.0)
 54 |     if exchange:
 55 |         cious = cious.T
 56 |     return 1-cious
 57 | 
 58 | def diou(bboxes1, bboxes2):
 59 |     bboxes1 = torch.sigmoid(bboxes1)
 60 |     bboxes2 = torch.sigmoid(bboxes2)
 61 |     rows = bboxes1.shape[0]
 62 |     cols = bboxes2.shape[0]
 63 |     cious = torch.zeros((rows, cols))
 64 |     if rows * cols == 0:
 65 |         return cious
 66 |     exchange = False
 67 |     if bboxes1.shape[0] > bboxes2.shape[0]:
 68 |         bboxes1, bboxes2 = bboxes2, bboxes1
 69 |         cious = torch.zeros((cols, rows))
 70 |         exchange = True
 71 |     w1 = torch.exp(bboxes1[:, 2])
 72 |     h1 = torch.exp(bboxes1[:, 3])
 73 |     w2 = torch.exp(bboxes2[:, 2])
 74 |     h2 = torch.exp(bboxes2[:, 3])
 75 |     area1 = w1 * h1
 76 |     area2 = w2 * h2
 77 |     center_x1 = bboxes1[:, 0]
 78 |     center_y1 = bboxes1[:, 1]
 79 |     center_x2 = bboxes2[:, 0]
 80 |     center_y2 = bboxes2[:, 1]
 81 | 
 82 |     inter_l = torch.max(center_x1 - w1 / 2,center_x2 - w2 / 2)
 83 |     inter_r = torch.min(center_x1 + w1 / 2,center_x2 + w2 / 2)
 84 |     inter_t = torch.max(center_y1 - h1 / 2,center_y2 - h2 / 2)
 85 |     inter_b = torch.min(center_y1 + h1 / 2,center_y2 + h2 / 2)
 86 |     inter_area = torch.clamp((inter_r - inter_l),min=0) * torch.clamp((inter_b - inter_t),min=0)
 87 | 
 88 |     c_l = torch.min(center_x1 - w1 / 2,center_x2 - w2 / 2)
 89 |     c_r = torch.max(center_x1 + w1 / 2,center_x2 + w2 / 2)
 90 |     c_t = torch.min(center_y1 - h1 / 2,center_y2 - h2 / 2)
 91 |     c_b = torch.max(center_y1 + h1 / 2,center_y2 + h2 / 2)
 92 | 
 93 |     inter_diag = (center_x2 - center_x1)**2 + (center_y2 - center_y1)**2
 94 |     c_diag = torch.clamp((c_r - c_l),min=0)**2 + torch.clamp((c_b - c_t),min=0)**2
 95 | 
 96 |     union = area1+area2-inter_area
 97 |     u = (inter_diag) / c_diag
 98 |     iou = inter_area / union
 99 |     dious = iou - u
100 |     dious = torch.clamp(dious,min=-1.0,max = 1.0)
101 |     if exchange:
102 |         dious = dious.T
103 |     return 1-dious
104 | 
105 | 
106 | if __name__ == "__main__":
107 |     x = torch.rand(10, 4)
108 |     y = torch.rand(10,4)
109 |     import ipdb;ipdb.set_trace()
110 |     cxy = ciou(x, y)
111 |     dxy = diou(x, y)
112 |     print(cxy.shape, dxy.shape)
113 | 


--------------------------------------------------------------------------------
/util/box_ops.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | """
  3 | Utilities for bounding box manipulation and GIoU.
  4 | """
  5 | import torch, os
  6 | from torchvision.ops.boxes import box_area
  7 | 
  8 | 
  9 | def box_cxcywh_to_xyxy(x):
 10 |     x_c, y_c, w, h = x.unbind(-1)
 11 |     b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
 12 |          (x_c + 0.5 * w), (y_c + 0.5 * h)]
 13 |     return torch.stack(b, dim=-1)
 14 | 
 15 | 
 16 | def box_xyxy_to_cxcywh(x):
 17 |     x0, y0, x1, y1 = x.unbind(-1)
 18 |     b = [(x0 + x1) / 2, (y0 + y1) / 2,
 19 |          (x1 - x0), (y1 - y0)]
 20 |     return torch.stack(b, dim=-1)
 21 | 
 22 | 
 23 | # modified from torchvision to also return the union
 24 | def box_iou(boxes1, boxes2):
 25 |     area1 = box_area(boxes1)
 26 |     area2 = box_area(boxes2)
 27 | 
 28 | 
 29 |     lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
 30 |     rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
 31 | 
 32 |     wh = (rb - lt).clamp(min=0)  # [N,M,2]
 33 |     inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
 34 | 
 35 |     union = area1[:, None] + area2 - inter
 36 | 
 37 |     iou = inter / (union + 1e-6)
 38 |     return iou, union
 39 | 
 40 | 
 41 | def generalized_box_iou(boxes1, boxes2):
 42 |     """
 43 |     Generalized IoU from https://giou.stanford.edu/
 44 | 
 45 |     The boxes should be in [x0, y0, x1, y1] format
 46 | 
 47 |     Returns a [N, M] pairwise matrix, where N = len(boxes1)
 48 |     and M = len(boxes2)
 49 |     """
 50 |     # degenerate boxes gives inf / nan results
 51 |     # so do an early check
 52 |     assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
 53 |     assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
 54 | 
 55 |     iou, union = box_iou(boxes1, boxes2)
 56 | 
 57 |     lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
 58 |     rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
 59 | 
 60 |     wh = (rb - lt).clamp(min=0)  # [N,M,2]
 61 |     area = wh[:, :, 0] * wh[:, :, 1]
 62 | 
 63 |     return iou - (area - union) / (area + 1e-6)
 64 | 
 65 | 
 66 | 
 67 | # modified from torchvision to also return the union
 68 | def box_iou_pairwise(boxes1, boxes2):
 69 |     area1 = box_area(boxes1)
 70 |     area2 = box_area(boxes2)
 71 | 
 72 |     lt = torch.max(boxes1[:, :2], boxes2[:, :2])  # [N,2]
 73 |     rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # [N,2]
 74 | 
 75 |     wh = (rb - lt).clamp(min=0)  # [N,2]
 76 |     inter = wh[:, 0] * wh[:, 1]  # [N]
 77 | 
 78 |     union = area1 + area2 - inter
 79 | 
 80 |     iou = inter / union
 81 |     return iou, union
 82 | 
 83 | 
 84 | def generalized_box_iou_pairwise(boxes1, boxes2):
 85 |     """
 86 |     Generalized IoU from https://giou.stanford.edu/
 87 | 
 88 |     Input:
 89 |         - boxes1, boxes2: N,4
 90 |     Output:
 91 |         - giou: N, 4
 92 |     """
 93 |     # degenerate boxes gives inf / nan results
 94 |     # so do an early check
 95 |     assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
 96 |     assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
 97 |     assert boxes1.shape == boxes2.shape
 98 |     iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4
 99 | 
100 |     lt = torch.min(boxes1[:, :2], boxes2[:, :2])
101 |     rb = torch.max(boxes1[:, 2:], boxes2[:, 2:])
102 | 
103 |     wh = (rb - lt).clamp(min=0)  # [N,2]
104 |     area = wh[:, 0] * wh[:, 1]
105 | 
106 |     return iou - (area - union) / area
107 | 
108 | def masks_to_boxes(masks):
109 |     """Compute the bounding boxes around the provided masks
110 | 
111 |     The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
112 | 
113 |     Returns a [N, 4] tensors, with the boxes in xyxy format
114 |     """
115 |     if masks.numel() == 0:
116 |         return torch.zeros((0, 4), device=masks.device)
117 | 
118 |     h, w = masks.shape[-2:]
119 | 
120 |     y = torch.arange(0, h, dtype=torch.float)
121 |     x = torch.arange(0, w, dtype=torch.float)
122 |     y, x = torch.meshgrid(y, x)
123 | 
124 |     x_mask = (masks * x.unsqueeze(0))
125 |     x_max = x_mask.flatten(1).max(-1)[0]
126 |     x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
127 | 
128 |     y_mask = (masks * y.unsqueeze(0))
129 |     y_max = y_mask.flatten(1).max(-1)[0]
130 |     y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
131 | 
132 |     return torch.stack([x_min, y_min, x_max, y_max], 1)
133 | 
134 | if __name__ == '__main__':
135 |     x = torch.rand(5, 4)
136 |     y = torch.rand(3, 4)
137 |     iou, union = box_iou(x, y)
138 |     import ipdb; ipdb.set_trace()


--------------------------------------------------------------------------------
/util/coco_id2name.json:
--------------------------------------------------------------------------------
1 | {"1": "person", "2": "bicycle", "3": "car", "4": "motorcycle", "5": "airplane", "6": "bus", "7": "train", "8": "truck", "9": "boat", "10": "traffic light", "11": "fire hydrant", "13": "stop sign", "14": "parking meter", "15": "bench", "16": "bird", "17": "cat", "18": "dog", "19": "horse", "20": "sheep", "21": "cow", "22": "elephant", "23": "bear", "24": "zebra", "25": "giraffe", "27": "backpack", "28": "umbrella", "31": "handbag", "32": "tie", "33": "suitcase", "34": "frisbee", "35": "skis", "36": "snowboard", "37": "sports ball", "38": "kite", "39": "baseball bat", "40": "baseball glove", "41": "skateboard", "42": "surfboard", "43": "tennis racket", "44": "bottle", "46": "wine glass", "47": "cup", "48": "fork", "49": "knife", "50": "spoon", "51": "bowl", "52": "banana", "53": "apple", "54": "sandwich", "55": "orange", "56": "broccoli", "57": "carrot", "58": "hot dog", "59": "pizza", "60": "donut", "61": "cake", "62": "chair", "63": "couch", "64": "potted plant", "65": "bed", "67": "dining table", "70": "toilet", "72": "tv", "73": "laptop", "74": "mouse", "75": "remote", "76": "keyboard", "77": "cell phone", "78": "microwave", "79": "oven", "80": "toaster", "81": "sink", "82": "refrigerator", "84": "book", "85": "clock", "86": "vase", "87": "scissors", "88": "teddy bear", "89": "hair drier", "90": "toothbrush"}


--------------------------------------------------------------------------------
/util/get_param_dicts.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import torch
 3 | import torch.nn as nn
 4 | 
 5 | 
 6 | def match_name_keywords(n: str, name_keywords: list):
 7 |     out = False
 8 |     for b in name_keywords:
 9 |         if b in n:
10 |             out = True
11 |             break
12 |     return out
13 | 
14 | 
15 | def get_param_dict(args, model_without_ddp: nn.Module):
16 |     try:
17 |         param_dict_type = args.param_dict_type
18 |     except:
19 |         param_dict_type = 'default'
20 |     assert param_dict_type in ['default', 'ddetr_in_mmdet', 'large_wd']
21 | 
22 |     # by default
23 |     if param_dict_type == 'default':
24 |         param_dicts = [
25 |             {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
26 |             {
27 |                 "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
28 |                 "lr": args.lr_backbone,
29 |             }
30 |         ]
31 |         return param_dicts
32 | 
33 |     if param_dict_type == 'ddetr_in_mmdet':
34 |         param_dicts = [
35 |             {
36 |                 "params":
37 |                     [p for n, p in model_without_ddp.named_parameters()
38 |                         if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
39 |                 "lr": args.lr,
40 |             },
41 |             {
42 |                 "params": [p for n, p in model_without_ddp.named_parameters() 
43 |                         if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
44 |                 "lr": args.lr_backbone,
45 |             },
46 |             {
47 |                 "params": [p for n, p in model_without_ddp.named_parameters() 
48 |                         if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
49 |                 "lr": args.lr * args.lr_linear_proj_mult,
50 |             }
51 |         ]        
52 |         return param_dicts
53 | 
54 |     if param_dict_type == 'large_wd':
55 |         param_dicts = [
56 |                 {
57 |                     "params":
58 |                         [p for n, p in model_without_ddp.named_parameters()
59 |                             if not match_name_keywords(n, ['backbone']) and not match_name_keywords(n, ['norm', 'bias']) and p.requires_grad],
60 |                 },
61 |                 {
62 |                     "params": [p for n, p in model_without_ddp.named_parameters() 
63 |                             if match_name_keywords(n, ['backbone']) and match_name_keywords(n, ['norm', 'bias']) and p.requires_grad],
64 |                     "lr": args.lr_backbone,
65 |                     "weight_decay": 0.0,
66 |                 },
67 |                 {
68 |                     "params": [p for n, p in model_without_ddp.named_parameters() 
69 |                             if match_name_keywords(n, ['backbone']) and not match_name_keywords(n, ['norm', 'bias']) and p.requires_grad],
70 |                     "lr": args.lr_backbone,
71 |                     "weight_decay": args.weight_decay,
72 |                 },
73 |                 {
74 |                     "params":
75 |                         [p for n, p in model_without_ddp.named_parameters()
76 |                             if not match_name_keywords(n, ['backbone']) and match_name_keywords(n, ['norm', 'bias']) and p.requires_grad],
77 |                     "lr": args.lr,
78 |                     "weight_decay": 0.0,
79 |                 }
80 |             ]
81 | 
82 |         # print("param_dicts: {}".format(param_dicts))
83 | 
84 |     return param_dicts


--------------------------------------------------------------------------------
/util/logger.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 2 | import functools
 3 | import logging
 4 | import os
 5 | import sys
 6 | from termcolor import colored
 7 | 
 8 | 
 9 | class _ColorfulFormatter(logging.Formatter):
10 |     def __init__(self, *args, **kwargs):
11 |         self._root_name = kwargs.pop("root_name") + "."
12 |         self._abbrev_name = kwargs.pop("abbrev_name", "")
13 |         if len(self._abbrev_name):
14 |             self._abbrev_name = self._abbrev_name + "."
15 |         super(_ColorfulFormatter, self).__init__(*args, **kwargs)
16 | 
17 |     def formatMessage(self, record):
18 |         record.name = record.name.replace(self._root_name, self._abbrev_name)
19 |         log = super(_ColorfulFormatter, self).formatMessage(record)
20 |         if record.levelno == logging.WARNING:
21 |             prefix = colored("WARNING", "red", attrs=["blink"])
22 |         elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL:
23 |             prefix = colored("ERROR", "red", attrs=["blink", "underline"])
24 |         else:
25 |             return log
26 |         return prefix + " " + log
27 | 
28 | 
29 | # so that calling setup_logger multiple times won't add many handlers
30 | @functools.lru_cache()
31 | def setup_logger(
32 |     output=None, distributed_rank=0, *, color=True, name="imagenet", abbrev_name=None
33 | ):
34 |     """
35 |     Initialize the detectron2 logger and set its verbosity level to "INFO".
36 | 
37 |     Args:
38 |         output (str): a file name or a directory to save log. If None, will not save log file.
39 |             If ends with ".txt" or ".log", assumed to be a file name.
40 |             Otherwise, logs will be saved to `output/log.txt`.
41 |         name (str): the root module name of this logger
42 | 
43 |     Returns:
44 |         logging.Logger: a logger
45 |     """
46 |     logger = logging.getLogger(name)
47 |     logger.setLevel(logging.DEBUG)
48 |     logger.propagate = False
49 | 
50 |     if abbrev_name is None:
51 |         abbrev_name = name
52 | 
53 |     plain_formatter = logging.Formatter(
54 |         '[%(asctime)s.%(msecs)03d]: %(message)s',
55 |         datefmt='%m/%d %H:%M:%S'
56 |     )
57 |     # stdout logging: master only
58 |     if distributed_rank == 0:
59 |         ch = logging.StreamHandler(stream=sys.stdout)
60 |         ch.setLevel(logging.DEBUG)
61 |         if color:
62 |             formatter = _ColorfulFormatter(
63 |                 colored("[%(asctime)s.%(msecs)03d]: ", "green") + "%(message)s",
64 |                 datefmt="%m/%d %H:%M:%S",
65 |                 root_name=name,
66 |                 abbrev_name=str(abbrev_name),
67 |             )
68 |         else:
69 |             formatter = plain_formatter
70 |         ch.setFormatter(formatter)
71 |         logger.addHandler(ch)
72 | 
73 |     # file logging: all workers
74 |     if output is not None:
75 |         if output.endswith(".txt") or output.endswith(".log"):
76 |             filename = output
77 |         else:
78 |             filename = os.path.join(output, "log.txt")
79 |         if distributed_rank > 0:
80 |             filename = filename + f".rank{distributed_rank}"
81 |         os.makedirs(os.path.dirname(filename), exist_ok=True)
82 | 
83 |         fh = logging.StreamHandler(_cached_log_stream(filename))
84 |         fh.setLevel(logging.DEBUG)
85 |         fh.setFormatter(plain_formatter)
86 |         logger.addHandler(fh)
87 | 
88 |     return logger
89 | 
90 | 
91 | # cache the opened file object, so that different calls to `setup_logger`
92 | # with the same file name can safely write to the same file.
93 | @functools.lru_cache(maxsize=None)
94 | def _cached_log_stream(filename):
95 |     return open(filename, "a")
96 | 


--------------------------------------------------------------------------------
/util/plot_utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Plotting utilities to visualize training logs.
  3 | """
  4 | import torch
  5 | import pandas as pd
  6 | import numpy as np
  7 | import seaborn as sns
  8 | import matplotlib.pyplot as plt
  9 | 
 10 | from pathlib import Path, PurePath
 11 | 
 12 | 
 13 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'):
 14 |     '''
 15 |     Function to plot specific fields from training log(s). Plots both training and test results.
 16 | 
 17 |     :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file
 18 |               - fields = which results to plot from each log file - plots both training and test for each field.
 19 |               - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots
 20 |               - log_name = optional, name of log file if different than default 'log.txt'.
 21 | 
 22 |     :: Outputs - matplotlib plots of results in fields, color coded for each log file.
 23 |                - solid lines are training results, dashed lines are test results.
 24 | 
 25 |     '''
 26 |     func_name = "plot_utils.py::plot_logs"
 27 | 
 28 |     # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path,
 29 |     # convert single Path to list to avoid 'not iterable' error
 30 | 
 31 |     if not isinstance(logs, list):
 32 |         if isinstance(logs, PurePath):
 33 |             logs = [logs]
 34 |             print(f"{func_name} info: logs param expects a list argument, converted to list[Path].")
 35 |         else:
 36 |             raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \
 37 |             Expect list[Path] or single Path obj, received {type(logs)}")
 38 | 
 39 |     # Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir
 40 |     for i, dir in enumerate(logs):
 41 |         if not isinstance(dir, PurePath):
 42 |             raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}")
 43 |         if not dir.exists():
 44 |             raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}")
 45 |         # verify log_name exists
 46 |         fn = Path(dir / log_name)
 47 |         if not fn.exists():
 48 |             print(f"-> missing {log_name}.  Have you gotten to Epoch 1 in training?")
 49 |             print(f"--> full path of missing log file: {fn}")
 50 |             return
 51 | 
 52 |     # load log file(s) and plot
 53 |     dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs]
 54 | 
 55 |     fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5))
 56 | 
 57 |     for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))):
 58 |         for j, field in enumerate(fields):
 59 |             if field == 'mAP':
 60 |                 coco_eval = pd.DataFrame(
 61 |                     np.stack(df.test_coco_eval_bbox.dropna().values)[:, 1]
 62 |                 ).ewm(com=ewm_col).mean()
 63 |                 axs[j].plot(coco_eval, c=color)
 64 |             else:
 65 |                 df.interpolate().ewm(com=ewm_col).mean().plot(
 66 |                     y=[f'train_{field}', f'test_{field}'],
 67 |                     ax=axs[j],
 68 |                     color=[color] * 2,
 69 |                     style=['-', '--']
 70 |                 )
 71 |     for ax, field in zip(axs, fields):
 72 |         if field == 'mAP':
 73 |             ax.legend([Path(p).name for p in logs])
 74 |             ax.set_title(field)
 75 |         else:
 76 |             ax.legend([f'train', f'test'])
 77 |             ax.set_title(field)
 78 | 
 79 |     return fig, axs
 80 | 
 81 | def plot_precision_recall(files, naming_scheme='iter'):
 82 |     if naming_scheme == 'exp_id':
 83 |         # name becomes exp_id
 84 |         names = [f.parts[-3] for f in files]
 85 |     elif naming_scheme == 'iter':
 86 |         names = [f.stem for f in files]
 87 |     else:
 88 |         raise ValueError(f'not supported {naming_scheme}')
 89 |     fig, axs = plt.subplots(ncols=2, figsize=(16, 5))
 90 |     for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names):
 91 |         data = torch.load(f)
 92 |         # precision is n_iou, n_points, n_cat, n_area, max_det
 93 |         precision = data['precision']
 94 |         recall = data['params'].recThrs
 95 |         scores = data['scores']
 96 |         # take precision for all classes, all areas and 100 detections
 97 |         precision = precision[0, :, :, 0, -1].mean(1)
 98 |         scores = scores[0, :, :, 0, -1].mean(1)
 99 |         prec = precision.mean()
100 |         rec = data['recall'][0, :, 0, -1].mean()
101 |         print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' +
102 |               f'score={scores.mean():0.3f}, ' +
103 |               f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}'
104 |               )
105 |         axs[0].plot(recall, precision, c=color)
106 |         axs[1].plot(recall, scores, c=color)
107 | 
108 |     axs[0].set_title('Precision / Recall')
109 |     axs[0].legend(names)
110 |     axs[1].set_title('Scores / Recall')
111 |     axs[1].legend(names)
112 |     return fig, axs
113 | 


--------------------------------------------------------------------------------
/util/slio.py:
--------------------------------------------------------------------------------
  1 | # ==========================================================
  2 | # Modified from mmcv
  3 | # ==========================================================
  4 | 
  5 | import json, pickle, yaml
  6 | try:
  7 |     from yaml import CLoader as Loader, CDumper as Dumper
  8 | except ImportError:
  9 |     from yaml import Loader, Dumper
 10 | 
 11 | from pathlib import Path
 12 | from abc import ABCMeta, abstractmethod
 13 | 
 14 | # ===========================
 15 | # Rigister handler
 16 | # ===========================
 17 | 
 18 | class BaseFileHandler(metaclass=ABCMeta):
 19 | 
 20 |     @abstractmethod
 21 |     def load_from_fileobj(self, file, **kwargs):
 22 |         pass
 23 | 
 24 |     @abstractmethod
 25 |     def dump_to_fileobj(self, obj, file, **kwargs):
 26 |         pass
 27 | 
 28 |     @abstractmethod
 29 |     def dump_to_str(self, obj, **kwargs):
 30 |         pass
 31 | 
 32 |     def load_from_path(self, filepath, mode='r', **kwargs):
 33 |         with open(filepath, mode) as f:
 34 |             return self.load_from_fileobj(f, **kwargs)
 35 | 
 36 |     def dump_to_path(self, obj, filepath, mode='w', **kwargs):
 37 |         with open(filepath, mode) as f:
 38 |             self.dump_to_fileobj(obj, f, **kwargs)
 39 | 
 40 | class JsonHandler(BaseFileHandler):
 41 | 
 42 |     def load_from_fileobj(self, file):
 43 |         return json.load(file)
 44 | 
 45 |     def dump_to_fileobj(self, obj, file, **kwargs):
 46 |         json.dump(obj, file, **kwargs)
 47 | 
 48 |     def dump_to_str(self, obj, **kwargs):
 49 |         return json.dumps(obj, **kwargs)
 50 | 
 51 | class PickleHandler(BaseFileHandler):
 52 | 
 53 |     def load_from_fileobj(self, file, **kwargs):
 54 |         return pickle.load(file, **kwargs)
 55 | 
 56 |     def load_from_path(self, filepath, **kwargs):
 57 |         return super(PickleHandler, self).load_from_path(
 58 |             filepath, mode='rb', **kwargs)
 59 | 
 60 |     def dump_to_str(self, obj, **kwargs):
 61 |         kwargs.setdefault('protocol', 2)
 62 |         return pickle.dumps(obj, **kwargs)
 63 | 
 64 |     def dump_to_fileobj(self, obj, file, **kwargs):
 65 |         kwargs.setdefault('protocol', 2)
 66 |         pickle.dump(obj, file, **kwargs)
 67 | 
 68 |     def dump_to_path(self, obj, filepath, **kwargs):
 69 |         super(PickleHandler, self).dump_to_path(
 70 |             obj, filepath, mode='wb', **kwargs)
 71 | 
 72 | class YamlHandler(BaseFileHandler):
 73 | 
 74 |     def load_from_fileobj(self, file, **kwargs):
 75 |         kwargs.setdefault('Loader', Loader)
 76 |         return yaml.load(file, **kwargs)
 77 | 
 78 |     def dump_to_fileobj(self, obj, file, **kwargs):
 79 |         kwargs.setdefault('Dumper', Dumper)
 80 |         yaml.dump(obj, file, **kwargs)
 81 | 
 82 |     def dump_to_str(self, obj, **kwargs):
 83 |         kwargs.setdefault('Dumper', Dumper)
 84 |         return yaml.dump(obj, **kwargs)
 85 | 
 86 | file_handlers = {
 87 |     'json': JsonHandler(),
 88 |     'yaml': YamlHandler(),
 89 |     'yml': YamlHandler(),
 90 |     'pickle': PickleHandler(),
 91 |     'pkl': PickleHandler()
 92 | }
 93 | 
 94 | # ===========================
 95 | # load and dump
 96 | # ===========================
 97 | 
 98 | def is_str(x):
 99 |     """Whether the input is an string instance.
100 | 
101 |     Note: This method is deprecated since python 2 is no longer supported.
102 |     """
103 |     return isinstance(x, str)
104 | 
105 | def slload(file, file_format=None, **kwargs):
106 |     """Load data from json/yaml/pickle files.
107 | 
108 |     This method provides a unified api for loading data from serialized files.
109 | 
110 |     Args:
111 |         file (str or :obj:`Path` or file-like object): Filename or a file-like
112 |             object.
113 |         file_format (str, optional): If not specified, the file format will be
114 |             inferred from the file extension, otherwise use the specified one.
115 |             Currently supported formats include "json", "yaml/yml" and
116 |             "pickle/pkl".
117 | 
118 |     Returns:
119 |         The content from the file.
120 |     """
121 |     if isinstance(file, Path):
122 |         file = str(file)
123 |     if file_format is None and is_str(file):
124 |         file_format = file.split('.')[-1]
125 |     if file_format not in file_handlers:
126 |         raise TypeError(f'Unsupported format: {file_format}')
127 | 
128 |     handler = file_handlers[file_format]
129 |     if is_str(file):
130 |         obj = handler.load_from_path(file, **kwargs)
131 |     elif hasattr(file, 'read'):
132 |         obj = handler.load_from_fileobj(file, **kwargs)
133 |     else:
134 |         raise TypeError('"file" must be a filepath str or a file-object')
135 |     return obj
136 | 
137 | 
138 | def sldump(obj, file=None, file_format=None, **kwargs):
139 |     """Dump data to json/yaml/pickle strings or files.
140 | 
141 |     This method provides a unified api for dumping data as strings or to files,
142 |     and also supports custom arguments for each file format.
143 | 
144 |     Args:
145 |         obj (any): The python object to be dumped.
146 |         file (str or :obj:`Path` or file-like object, optional): If not
147 |             specified, then the object is dump to a str, otherwise to a file
148 |             specified by the filename or file-like object.
149 |         file_format (str, optional): Same as :func:`load`.
150 | 
151 |     Returns:
152 |         bool: True for success, False otherwise.
153 |     """
154 |     if isinstance(file, Path):
155 |         file = str(file)
156 |     if file_format is None:
157 |         if is_str(file):
158 |             file_format = file.split('.')[-1]
159 |         elif file is None:
160 |             raise ValueError(
161 |                 'file_format must be specified since file is None')
162 |     if file_format not in file_handlers:
163 |         raise TypeError(f'Unsupported format: {file_format}')
164 | 
165 |     handler = file_handlers[file_format]
166 |     if file is None:
167 |         return handler.dump_to_str(obj, **kwargs)
168 |     elif is_str(file):
169 |         handler.dump_to_path(obj, file, **kwargs)
170 |     elif hasattr(file, 'write'):
171 |         handler.dump_to_fileobj(obj, file, **kwargs)
172 |     else:
173 |         raise TypeError('"file" must be a filename str or a file-object')
174 | 


--------------------------------------------------------------------------------
/util/static_data_path.py:
--------------------------------------------------------------------------------
 1 | coco = dict(
 2 |     train = dict(
 3 |         img_folder = '/comp_robot/cv_public_dataset/COCO2017/train2017',
 4 |         ann_file = '/comp_robot/cv_public_dataset/COCO2017/annotations/instances_train2017.json'
 5 |     ),
 6 |     val = dict(
 7 |         img_folder = '/comp_robot/cv_public_dataset/COCO2017/val2017',
 8 |         ann_file = '/comp_robot/cv_public_dataset/COCO2017/annotations/instances_val2017.json'
 9 |     )
10 | )


--------------------------------------------------------------------------------
/util/time_counter.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import time
 3 | 
 4 | class TimeCounter:
 5 |     def __init__(self) -> None:
 6 |         pass
 7 |     
 8 |     def clear(self):
 9 |         self.timedict = {}
10 |         self.basetime = time.perf_counter()
11 | 
12 |     def timeit(self, name):
13 |         nowtime = time.perf_counter() - self.basetime
14 |         self.timedict[name] = nowtime
15 |         self.basetime = time.perf_counter()
16 | 
17 | 
18 | class TimeHolder:
19 |     def __init__(self) -> None:
20 |         self.timedict = {}
21 | 
22 |     def update(self, _timedict:dict):
23 |         for k,v in _timedict.items():
24 |             if k not in self.timedict:
25 |                 self.timedict[k] = AverageMeter(name=k, val_only=True)
26 |             self.timedict[k].update(val=v)
27 | 
28 |     def final_res(self):
29 |         return {k:v.avg for k,v in self.timedict.items()}
30 |         
31 |     def __str__(self):
32 |         return json.dumps(self.final_res(), indent=2)
33 | 
34 | 
35 | class AverageMeter(object):
36 |     """Computes and stores the average and current value"""
37 |     def __init__(self, name, fmt=':f', val_only=False):
38 |         self.name = name
39 |         self.fmt = fmt
40 |         self.val_only = val_only
41 |         self.reset()
42 | 
43 |     def reset(self):
44 |         self.val = 0
45 |         self.avg = 0
46 |         self.sum = 0
47 |         self.count = 0
48 | 
49 |     def update(self, val, n=1):
50 |         self.val = val
51 |         self.sum += val * n
52 |         self.count += n
53 |         self.avg = self.sum / self.count
54 | 
55 |     def __str__(self):
56 |         if self.val_only:
57 |             fmtstr = '{name} {val' + self.fmt + '}'
58 |         else:
59 |             fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
60 |         return fmtstr.format(**self.__dict__)


--------------------------------------------------------------------------------
/util/vis_utils.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | import numpy as np
 3 | 
 4 | from util.utils import renorm
 5 | from util.misc import color_sys
 6 | 
 7 | _color_getter = color_sys(100)
 8 | 
 9 | # plot known and unknown box
10 | def add_box_to_img(img, boxes, colorlist, brands=None):
11 |     """[summary]
12 | 
13 |     Args:
14 |         img ([type]): np.array, H,W,3
15 |         boxes ([type]): list of list(4)
16 |         colorlist: list of colors.
17 |         brands: text.
18 | 
19 |     Return:
20 |         img: np.array. H,W,3.
21 |     """
22 |     H, W = img.shape[:2]
23 |     for _i, (box, color) in enumerate(zip(boxes, colorlist)):
24 |         x, y, w, h = box[0] * W, box[1] * H, box[2] * W, box[3] * H
25 |         img = cv2.rectangle(img.copy(), (int(x-w/2), int(y-h/2)), (int(x+w/2), int(y+h/2)), color, 2)
26 |         if brands is not None:
27 |             brand = brands[_i]
28 |             org = (int(x-w/2), int(y+h/2))
29 |             font = cv2.FONT_HERSHEY_SIMPLEX
30 |             fontScale = 0.5
31 |             thickness = 1
32 |             img = cv2.putText(img.copy(), str(brand), org, font, 
33 |                 fontScale, color, thickness, cv2.LINE_AA)
34 |     return img
35 | 
36 | def plot_dual_img(img, boxes, labels, idxs, probs=None):
37 |     """[summary]
38 | 
39 |     Args:
40 |         img ([type]): 3,H,W. tensor.
41 |         boxes (): tensor(Kx4) or list of tensor(1x4).
42 |         labels ([type]): list of ints.
43 |         idxs ([type]): list of ints.
44 |         probs (optional): listof floats.
45 | 
46 |     Returns:
47 |         img_classcolor: np.array. H,W,3. img with class-wise label.
48 |         img_seqcolor: np.array. H,W,3. img with seq-wise label.
49 |     """
50 | 
51 |     boxes = [i.cpu().tolist() for i in boxes]
52 |     img = (renorm(img.cpu()).permute(1,2,0).numpy() * 255).astype(np.uint8)
53 |     # plot with class
54 |     class_colors = [_color_getter(i) for i in labels]
55 |     if probs is not None:
56 |         brands = ["{},{:.2f}".format(j,k) for j,k in zip(labels, probs)]
57 |     else:
58 |         brands = labels
59 |     img_classcolor = add_box_to_img(img, boxes, class_colors, brands=brands)
60 |     # plot with seq
61 |     seq_colors = [_color_getter((i * 11) % 100) for i in idxs]
62 |     img_seqcolor = add_box_to_img(img, boxes, seq_colors, brands=idxs)
63 |     return img_classcolor, img_seqcolor
64 | 
65 | 
66 | def plot_raw_img(img, boxes, labels):
67 |     """[summary]
68 | 
69 |     Args:
70 |         img ([type]): 3,H,W. tensor. 
71 |         boxes ([type]): Kx4. tensor
72 |         labels ([type]): K. tensor.
73 | 
74 |     return:
75 |         img: np.array. H,W,3. img with bbox annos.
76 |     
77 |     """
78 |     img = (renorm(img.cpu()).permute(1,2,0).numpy() * 255).astype(np.uint8)
79 |     H, W = img.shape[:2]
80 |     for box, label in zip(boxes.tolist(), labels.tolist()):
81 |         x, y, w, h = box[0] * W, box[1] * H, box[2] * W, box[3] * H
82 | 
83 |         img = cv2.rectangle(img.copy(), (int(x-w/2), int(y-h/2)), (int(x+w/2), int(y+h/2)), _color_getter(label), 2)
84 |         # add text
85 |         org = (int(x-w/2), int(y+h/2))
86 |         font = cv2.FONT_HERSHEY_SIMPLEX
87 |         fontScale = 1
88 |         thickness = 1
89 |         img = cv2.putText(img.copy(), str(label), org, font, 
90 |             fontScale, _color_getter(label), thickness, cv2.LINE_AA)
91 | 
92 |     return img


--------------------------------------------------------------------------------
/util/visualizer.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | '''
  3 | @File    :   visualizer.py
  4 | @Time    :   2022/04/05 11:39:33
  5 | @Author  :   Shilong Liu 
  6 | @Contact :   liusl20@mail.tsinghua.edu.cn; slongliu86@gmail.com
  7 | Modified from COCO evaluator
  8 | '''
  9 | 
 10 | import os, sys
 11 | from textwrap import wrap
 12 | import torch
 13 | import numpy as np
 14 | import cv2
 15 | import datetime
 16 | 
 17 | import matplotlib.pyplot as plt
 18 | from matplotlib.collections import PatchCollection
 19 | from matplotlib.patches import Polygon
 20 | from pycocotools import mask as maskUtils
 21 | from matplotlib import transforms
 22 | 
 23 | def renorm(img: torch.FloatTensor, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) \
 24 |         -> torch.FloatTensor:
 25 |     # img: tensor(3,H,W) or tensor(B,3,H,W)
 26 |     # return: same as img
 27 |     assert img.dim() == 3 or img.dim() == 4, "img.dim() should be 3 or 4 but %d" % img.dim() 
 28 |     if img.dim() == 3:
 29 |         assert img.size(0) == 3, 'img.size(0) shoule be 3 but "%d". (%s)' % (img.size(0), str(img.size()))
 30 |         img_perm = img.permute(1,2,0)
 31 |         mean = torch.Tensor(mean)
 32 |         std = torch.Tensor(std)
 33 |         img_res = img_perm * std + mean
 34 |         return img_res.permute(2,0,1)
 35 |     else: # img.dim() == 4
 36 |         assert img.size(1) == 3, 'img.size(1) shoule be 3 but "%d". (%s)' % (img.size(1), str(img.size()))
 37 |         img_perm = img.permute(0,2,3,1)
 38 |         mean = torch.Tensor(mean)
 39 |         std = torch.Tensor(std)
 40 |         img_res = img_perm * std + mean
 41 |         return img_res.permute(0,3,1,2)
 42 | 
 43 | class ColorMap():
 44 |     def __init__(self, basergb=[255,255,0]):
 45 |         self.basergb = np.array(basergb)
 46 |     def __call__(self, attnmap):
 47 |         # attnmap: h, w. np.uint8.
 48 |         # return: h, w, 4. np.uint8.
 49 |         assert attnmap.dtype == np.uint8
 50 |         h, w = attnmap.shape
 51 |         res = self.basergb.copy()
 52 |         res = res[None][None].repeat(h, 0).repeat(w, 1) # h, w, 3
 53 |         attn1 = attnmap.copy()[..., None] # h, w, 1
 54 |         res = np.concatenate((res, attn1), axis=-1).astype(np.uint8)
 55 |         return res
 56 | 
 57 | 
 58 | class COCOVisualizer():
 59 |     def __init__(self) -> None:
 60 |         pass
 61 | 
 62 |     def visualize(self, img, tgt, caption=None, dpi=120, savedir=None, show_in_console=True):
 63 |         """
 64 |         img: tensor(3, H, W)
 65 |         tgt: make sure they are all on cpu.
 66 |             must have items: 'image_id', 'boxes', 'size'
 67 |         """
 68 |         plt.figure(dpi=dpi)
 69 |         plt.rcParams['font.size'] = '5'
 70 |         ax = plt.gca()
 71 |         img = renorm(img).permute(1, 2, 0)
 72 |         ax.imshow(img)
 73 |         
 74 |         self.addtgt(tgt)
 75 |         if show_in_console:
 76 |             plt.show()
 77 | 
 78 |         if savedir is not None:
 79 |             if caption is None:
 80 |                 savename = '{}/{}-{}.png'.format(savedir, int(tgt['image_id']), str(datetime.datetime.now()).replace(' ', '-'))
 81 |             else:
 82 |                 savename = '{}/{}-{}-{}.png'.format(savedir, caption, int(tgt['image_id']), str(datetime.datetime.now()).replace(' ', '-'))
 83 |             print("savename: {}".format(savename))
 84 |             os.makedirs(os.path.dirname(savename), exist_ok=True)
 85 |             plt.savefig(savename)
 86 |         plt.close()
 87 | 
 88 |     def addtgt(self, tgt):
 89 |         """
 90 |         - tgt: dict. args:
 91 |             - boxes: num_boxes, 4. xywh, [0,1].
 92 |             - box_label: num_boxes.
 93 |         """
 94 |         assert 'boxes' in tgt
 95 |         ax = plt.gca()
 96 |         H, W = tgt['size'].tolist() 
 97 |         numbox = tgt['boxes'].shape[0]
 98 | 
 99 |         color = []
100 |         polygons = []
101 |         boxes = []
102 |         for box in tgt['boxes'].cpu():
103 |             unnormbbox = box * torch.Tensor([W, H, W, H])
104 |             unnormbbox[:2] -= unnormbbox[2:] / 2
105 |             [bbox_x, bbox_y, bbox_w, bbox_h] = unnormbbox.tolist()
106 |             boxes.append([bbox_x, bbox_y, bbox_w, bbox_h])
107 |             poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]]
108 |             np_poly = np.array(poly).reshape((4,2))
109 |             polygons.append(Polygon(np_poly))
110 |             c = (np.random.random((1, 3))*0.6+0.4).tolist()[0]
111 |             color.append(c)
112 | 
113 |         p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.1)
114 |         ax.add_collection(p)
115 |         p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2)
116 |         ax.add_collection(p)
117 | 
118 | 
119 |         if 'box_label' in tgt:
120 |             assert len(tgt['box_label']) == numbox, f"{len(tgt['box_label'])} = {numbox}, "
121 |             for idx, bl in enumerate(tgt['box_label']):
122 |                 _string = str(bl)
123 |                 bbox_x, bbox_y, bbox_w, bbox_h = boxes[idx]
124 |                 # ax.text(bbox_x, bbox_y, _string, color='black', bbox={'facecolor': 'yellow', 'alpha': 1.0, 'pad': 1})
125 |                 ax.text(bbox_x, bbox_y, _string, color='black', bbox={'facecolor': color[idx], 'alpha': 0.6, 'pad': 1})
126 | 
127 |         if 'caption' in tgt:
128 |             ax.set_title(tgt['caption'], wrap=True)
129 | 
130 | 
131 | 


--------------------------------------------------------------------------------