├── .gitignore ├── LICENSE ├── README.md ├── classification ├── LICENSE ├── README.md ├── datasets.py ├── dist_resume.sh ├── dist_train.sh ├── engine.py ├── hubconf.py ├── losses.py ├── main.py ├── mcloader │ ├── __init__.py │ ├── classification.py │ ├── data_prefetcher.py │ ├── image_list.py │ ├── imagenet.py │ └── mcloader.py ├── models │ ├── __init__.py │ ├── dpt │ │ ├── __init__.py │ │ ├── box_coder.py │ │ ├── depatch_embed.py │ │ ├── dpt.py │ │ └── ms_deform_attn_func.py │ └── pvt.py ├── requirements.txt ├── run_with_submitit.py ├── samplers.py ├── tox.ini └── utils.py ├── detection ├── README.md ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── coco_detection.py │ │ │ └── coco_instance.py │ │ ├── default_runtime.py │ │ └── models │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ └── retinanet_r50_fpn.py │ ├── detr_dpt_s_8x2_50ep_coco.py │ ├── detr_pvt_s_8x2_50ep_coco.py │ ├── detr_r50_8x2_50ep_coco_baseline.py │ ├── mask_rcnn_dpt_m_fpn_1x_coco.py │ ├── mask_rcnn_dpt_m_fpn_mstrain-poly_3x_coco.py │ ├── mask_rcnn_dpt_s_fpn_1x_coco.py │ ├── mask_rcnn_dpt_s_fpn_mstrain-poly_3x_coco.py │ ├── mask_rcnn_dpt_t_fpn_1x_coco.py │ ├── mask_rcnn_dpt_t_fpn_mstrain-poly_3x_coco.py │ ├── mask_rcnn_pvt_s_fpn_1x_coco.py │ ├── mask_rcnn_pvt_t_fpn_1x_coco.py │ ├── retinanet_dpt_m_fpn_1x_coco.py │ ├── retinanet_dpt_m_fpn_mstrain_3x_coco.py │ ├── retinanet_dpt_s_fpn_1x_coco.py │ ├── retinanet_dpt_s_fpn_mstrain_3x_coco.py │ ├── retinanet_dpt_t_fpn_1x_coco.py │ ├── retinanet_dpt_t_fpn_mstrain_3x_coco.py │ ├── retinanet_pvt_s_fpn_1x_coco.py │ ├── retinanet_pvt_s_fpn_1x_coco_640.py │ └── retinanet_pvt_t_fpn_1x_coco.py ├── dist_test.sh ├── dist_train.sh ├── dpt_models │ ├── __init__.py │ ├── box_coder.py │ ├── depatch_embed.py │ ├── dpt.py │ └── ms_deform_attn_func.py ├── pvt.py ├── test.py └── train.py └── ops ├── functions ├── __init__.py └── ms_deform_attn_func.py ├── make.sh ├── modules ├── __init__.py └── ms_deform_attn.py ├── setup.py ├── src ├── cpu │ ├── ms_deform_attn_cpu.cpp │ └── ms_deform_attn_cpu.h ├── cuda │ ├── ms_deform_attn_cuda.cu │ ├── ms_deform_attn_cuda.h │ └── ms_deform_im2col_cuda.cuh ├── ms_deform_attn.h └── vision.cpp └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | **/__pycache__/** 3 | imnet_resnet50_scratch/timm_temp/ 4 | .dumbo.json 5 | checkpoints/ 6 | data/ 7 | tmp.txt 8 | data 9 | tmp/ 10 | **/data 11 | **/data/ 12 | **/pretrained/ 13 | **/work_dirs/ 14 | **/results.pkl 15 | **/arun_log/ 16 | tmp** 17 | arun_log/ 18 | 19 | .ipynb_checkpoints/ 20 | checkpoint/ 21 | 22 | ops/MultiScaleDeformableAttention.egg-info/ 23 | ops/build/ 24 | ops/dist/ 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DPT 2 | -------- 3 | 4 | This repo is the official implementation of **DPT: Deformable Patch-based Transformer for Visual Recognition (ACM MM2021)**. We provide code and models for the following tasks: 5 | 6 | > **Image Classification**: Detailed instructions and information can be found in [classification/README.md](classification/README.md). 7 | 8 | > **Object Detection**: Detailed instructions and information can be found in [detection/README.md](detection/README.md). 9 | 10 | The paper has been released on [[arXiv](https://arxiv.org/abs/2107.14467)]. 11 | 12 | ## Introduction 13 | 14 | Deformable Patch (DePatch) is a plug-and-play module. It learns to adaptively split the input images into patches with different positions and scales in a data-driven way, rather than using predefined fixed patches. In this way, our method can better preserve the semantics in patches. 15 | 16 | In this repository, we provide code and models for the Deformable Patch-based Transformer (DPT). As this field is developing rapidly, we look forward to seeing DePatch applied to other up-to-date architectures and promoting further research. 17 | 18 | ## Main Results 19 | 20 | ### Image Classification 21 | 22 | Training commands and pretrained models are provided >>> [here](classification) <<<. 23 | 24 | | Method | #Params (M) | FLOPs(G) | Acc@1 | 25 | |------------|:-----------:|:--------:|:-----:| 26 | | DPT-Tiny | 15.2 | 2.1 | 77.4 | 27 | | DPT-Small | 26.4 | 4.0 | 81.0 | 28 | | DPT-Medium | 46.1 | 6.9 | 81.9 | 29 | 30 | ### Object Detection 31 | Training commands and detailed results are provided >>> [here](detection) <<<. 32 | 33 | ## Citation 34 | ``` 35 | @inproceedings{chenDPT21, 36 | title = {DPT: Deformable Patch-based Transformer for Visual Recognition}, 37 | author = {Zhiyang Chen and Yousong Zhu and Chaoyang Zhao and Guosheng Hu and Wei Zeng and Jinqiao Wang and Ming Tang}, 38 | booktitle={Proceedings of the ACM International Conference on Multimedia}, 39 | year={2021} 40 | } 41 | ``` 42 | 43 | ## License 44 | This repository is released under the Apache 2.0 license as found in the [LICENSE](LICENSE) file. 45 | 46 | ## Acknowledgement 47 | Our implementation is mainly based on [PVT](https://github.com/whai362/PVT). The CUDA operator is borrowed from [Deformable-DETR](https://github.com/fundamentalvision/Deformable-DETR). You may refer to these repositories for further information. 48 | -------------------------------------------------------------------------------- /classification/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 - present, Facebook, Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /classification/README.md: -------------------------------------------------------------------------------- 1 | # DPT for Image Classification 2 | ----- 3 | Here is our code for ImageNet classification. Please check our [paper](https://arxiv.org/abs/2107.14467) for detailed information. 4 | 5 | ## Instructions 6 | 7 | ### Environment 8 | 9 | We developed our model with `cuda 10.1`, `pytorch 1.7.1` and `timm 0.3.2`. Other PyTorch versions may also work. We advise you to prepare your environment with `conda`. 10 | ```bash 11 | conda install pytorch==1.7.1 torchvision==0.8.2 cudatoolkit=10.1 -c pytorch 12 | pip install timm==0.3.2 13 | ``` 14 | 15 | Clone our repo and compile the provided CUDA operator. 16 | ```bash 17 | git clone https://github.com/CASIA-IVA-Lab/DPT.git 18 | cd ./ops 19 | sh ./make.sh 20 | # unit test (you should see that all checks are True) 21 | python test.py 22 | ``` 23 | 24 | ### Data Preparation 25 | 26 | We follow the conventional way to prepare the ImageNet dataset. 27 | 28 | The directory structure is the standard layout for the torchvision [`datasets.ImageFolder`](https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder), and the training and validation data are expected to be in the `train/` folder and `val/` folder respectively: 29 | 30 | ``` 31 | /path/to/imagenet/ 32 | train/ 33 | class1/ 34 | img1.jpeg 35 | class2/ 36 | img2.jpeg 37 | val/ 38 | class1/ 39 | img3.jpeg 40 | class2/ 41 | img4.jpeg 42 | ``` 43 | 44 | ### Evaluation 45 | 46 | To evaluate a pretrained model on ImageNet val on a single GPU: 47 | 48 | ```bash 49 | python -m torch.distributed.launch --nproc_per_node 1 --use_env main.py --eval --model $MODEL_NAME --data-path $DATA_PATH --resume $CKPT_PATH 50 | ``` 51 | 52 | Or with multiple GPUs: 53 | 54 | ```bash 55 | python -m torch.distributed.launch --nproc_per_node $NUM_GPUS --use_env main.py --eval --dist-eval --model $MODEL_NAME --data-path $DATA_PATH --resume $CKPT_PATH 56 | ``` 57 | 58 | For example, use 8 GPUs to test our pretrained DPT-Small model:
59 | ```bash 60 | python -m torch.distributed.launch --nproc_per_node 8 --use_env main.py --eval --dist-eval --model dpt_small --data-path $DATA_PATH --resume dpt_small.pth 61 | ``` 62 | which should give: 63 | ``` 64 | * Acc@1 80.954 Acc@5 95.388 loss 0.846 65 | Accuracy of the network on the 50000 test images: 81.0% 66 | ``` 67 | 68 | 69 | ### Training 70 | 71 | To train DPT-Small on ImageNet on a single node with 8 GPUs for 300 epochs, run: 72 | 73 | ```bash 74 | MODEL_NAME=dpt_small 75 | DATA_PATH=/path/to/imagenet 76 | OUTPUT_PATH=/path/to/output 77 | 78 | python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py\ 79 | --model $MODEL_NAME --batch-size 128 --dist-eval --test_interval 5\ 80 | --data-path $DATA_PATH --output_dir $OUTPUT_PATH 81 | ``` 82 | 83 | ## Model Zoo 84 | 85 | | Method | #Params (M) | FLOPs(G) | Acc@1 | Model | 86 | |------------|:-----------:|:--------:|:-----:|:-----:| 87 | | DPT-Tiny | 15.2 | 2.1 | 77.4 | [Google Drive](https://drive.google.com/file/d/1WkuanDQodRun1sJtZmnoUd6pJOpNMetm/view?usp=sharing) | 88 | | DPT-Small | 26.4 | 4.0 | 81.0 | [Google Drive](https://drive.google.com/file/d/1uM4iRLnZ9Omdt_OSPr-aK0uy8rQ5iLjA/view?usp=sharing) | 89 | | DPT-Medium | 46.1 | 6.9 | 81.9 | [Google Drive](https://drive.google.com/file/d/1IoAJoN4VFEiDS17hSwXpTDHiJPivZdsu/view?usp=sharing) | 90 | 91 | You can also obtain the ImageNet-1K pretrained models from [BaiduNetdisk](https://pan.baidu.com/s/1nzfWr90_XP7Ruoj2hBJzLQ). The extraction password is **DPTs**. 92 | 93 | ## License 94 | This repository is released under the Apache 2.0 license as found in the [LICENSE](LICENSE) file. 95 | -------------------------------------------------------------------------------- /classification/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved.
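# Dataset and transform builders (ImageNet, CIFAR-100, iNaturalist) used by main.py.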
3 | import os 4 | import json 5 | 6 | from torchvision import datasets, transforms 7 | from torchvision.datasets.folder import ImageFolder, default_loader 8 | 9 | from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD 10 | from timm.data import create_transform 11 | from mcloader import ClassificationDataset 12 | 13 | 14 | class INatDataset(ImageFolder): 15 | def __init__(self, root, train=True, year=2018, transform=None, target_transform=None, 16 | category='name', loader=default_loader): 17 | self.transform = transform 18 | self.loader = loader 19 | self.target_transform = target_transform 20 | self.year = year 21 | # assert category in ['kingdom','phylum','class','order','supercategory','family','genus','name'] 22 | path_json = os.path.join(root, f'{"train" if train else "val"}{year}.json') 23 | with open(path_json) as json_file: 24 | data = json.load(json_file) 25 | 26 | with open(os.path.join(root, 'categories.json')) as json_file: 27 | data_catg = json.load(json_file) 28 | 29 | path_json_for_targeter = os.path.join(root, f"train{year}.json") 30 | 31 | with open(path_json_for_targeter) as json_file: 32 | data_for_targeter = json.load(json_file) 33 | 34 | targeter = {} 35 | indexer = 0 36 | for elem in data_for_targeter['annotations']: 37 | king = [] 38 | king.append(data_catg[int(elem['category_id'])][category]) 39 | if king[0] not in targeter.keys(): 40 | targeter[king[0]] = indexer 41 | indexer += 1 42 | self.nb_classes = len(targeter) 43 | 44 | self.samples = [] 45 | for elem in data['images']: 46 | cut = elem['file_name'].split('/') 47 | target_current = int(cut[2]) 48 | path_current = os.path.join(root, cut[0], cut[2], cut[3]) 49 | 50 | categors = data_catg[target_current] 51 | target_current_true = targeter[categors[category]] 52 | self.samples.append((path_current, target_current_true)) 53 | 54 | # __getitem__ and __len__ inherited from ImageFolder 55 | 56 | 57 | def build_dataset(is_train, args): 58 | transform = build_transform(is_train, args) 59 | 60 | if args.data_set == 'CIFAR': 61 | dataset = datasets.CIFAR100(args.data_path, train=is_train, transform=transform) 62 | nb_classes = 100 63 | elif args.data_set == 'IMNET': 64 | if not args.use_mcloader: 65 | root = os.path.join(args.data_path, 'train' if is_train else 'val') 66 | dataset = datasets.ImageFolder(root, transform=transform) 67 | else: 68 | dataset = ClassificationDataset( 69 | 'train' if is_train else 'val', 70 | pipeline=transform 71 | ) 72 | nb_classes = 1000 73 | elif args.data_set == 'INAT': 74 | dataset = INatDataset(args.data_path, train=is_train, year=2018, 75 | category=args.inat_category, transform=transform) 76 | nb_classes = dataset.nb_classes 77 | elif args.data_set == 'INAT19': 78 | dataset = INatDataset(args.data_path, train=is_train, year=2019, 79 | category=args.inat_category, transform=transform) 80 | nb_classes = dataset.nb_classes 81 | 82 | return dataset, nb_classes 83 | 84 | 85 | def build_transform(is_train, args): 86 | resize_im = args.input_size > 32 87 | if is_train: 88 | # this should always dispatch to transforms_imagenet_train 89 | transform = create_transform( 90 | input_size=args.input_size, 91 | is_training=True, 92 | color_jitter=args.color_jitter, 93 | auto_augment=args.aa, 94 | interpolation=args.train_interpolation, 95 | re_prob=args.reprob, 96 | re_mode=args.remode, 97 | re_count=args.recount, 98 | ) 99 | if not resize_im: 100 | # replace RandomResizedCropAndInterpolation with 101 | # RandomCrop 102 | transform.transforms[0] = transforms.RandomCrop( 103 | 
args.input_size, padding=4) 104 | return transform 105 | 106 | t = [] 107 | if resize_im: 108 | size = int((256 / 224) * args.input_size) 109 | t.append( 110 | transforms.Resize(size, interpolation=3), # to maintain same ratio w.r.t. 224 images 111 | ) 112 | t.append(transforms.CenterCrop(args.input_size)) 113 | 114 | t.append(transforms.ToTensor()) 115 | t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)) 116 | return transforms.Compose(t) 117 | -------------------------------------------------------------------------------- /classification/dist_resume.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export NCCL_LL_THRESHOLD=0 3 | 4 | ARCH=$1 5 | GPUS=$2 6 | OUT_PATH=$3 7 | PORT=${PORT:-29500} 8 | 9 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | --use_env main.py --model $ARCH --batch-size 128 --epochs 300 --data-path /path/to/imagenet \ 11 | --output_dir $OUT_PATH --resume $OUT_PATH/checkpoint.pth ${@:4} -------------------------------------------------------------------------------- /classification/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export NCCL_LL_THRESHOLD=0 3 | 4 | ARCH=$1 5 | GPUS=$2 6 | OUT_PATH=$3 7 | PORT=${PORT:-29500} 8 | 9 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | --use_env main.py --model $ARCH --batch-size 128 --epochs 300 --data-path /path/to/imagenet \ 11 | --output_dir $OUT_PATH ${@:4} -------------------------------------------------------------------------------- /classification/engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 
3 | """ 4 | Train and eval functions used in main.py 5 | """ 6 | import math 7 | import sys 8 | from typing import Iterable, Optional 9 | 10 | import torch 11 | 12 | from timm.data import Mixup 13 | from timm.utils import accuracy, ModelEma 14 | 15 | from losses import DistillationLoss 16 | import utils 17 | 18 | 19 | def train_one_epoch(model: torch.nn.Module, criterion: DistillationLoss, 20 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 21 | device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, 22 | model_ema: Optional[ModelEma] = None, mixup_fn: Optional[Mixup] = None, 23 | set_training_mode=True, 24 | fp32=False): 25 | model.train(set_training_mode) 26 | metric_logger = utils.MetricLogger(delimiter=" ") 27 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 28 | header = 'Epoch: [{}]'.format(epoch) 29 | print_freq = 10 30 | 31 | for samples, targets in metric_logger.log_every(data_loader, print_freq, header): 32 | samples = samples.to(device, non_blocking=True) 33 | targets = targets.to(device, non_blocking=True) 34 | 35 | if mixup_fn is not None: 36 | samples, targets = mixup_fn(samples, targets) 37 | 38 | # with torch.cuda.amp.autocast(): 39 | # outputs = model(samples) 40 | # loss = criterion(samples, outputs, targets) 41 | with torch.cuda.amp.autocast(enabled=not fp32): 42 | outputs = model(samples) 43 | loss = criterion(samples, outputs, targets) 44 | 45 | loss_value = loss.item() 46 | 47 | if not math.isfinite(loss_value): 48 | print("Loss is {}, stopping training".format(loss_value)) 49 | sys.exit(1) 50 | 51 | optimizer.zero_grad() 52 | 53 | # this attribute is added by timm on one optimizer (adahessian) 54 | is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order 55 | loss_scaler(loss, optimizer, clip_grad=max_norm, 56 | parameters=model.parameters(), create_graph=is_second_order) 57 | 58 | torch.cuda.synchronize() 59 | if model_ema is not None: 60 | model_ema.update(model) 61 | 62 | metric_logger.update(loss=loss_value) 63 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 64 | # gather the stats from all processes 65 | metric_logger.synchronize_between_processes() 66 | print("Averaged stats:", metric_logger) 67 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 68 | 69 | 70 | @torch.no_grad() 71 | def evaluate(data_loader, model, device): 72 | criterion = torch.nn.CrossEntropyLoss() 73 | 74 | metric_logger = utils.MetricLogger(delimiter=" ") 75 | header = 'Test:' 76 | 77 | # switch to evaluation mode 78 | model.eval() 79 | 80 | for images, target in metric_logger.log_every(data_loader, 10, header): 81 | images = images.to(device, non_blocking=True) 82 | target = target.to(device, non_blocking=True) 83 | 84 | # compute output 85 | with torch.cuda.amp.autocast(): 86 | output = model(images) 87 | loss = criterion(output, target) 88 | 89 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 90 | 91 | batch_size = images.shape[0] 92 | metric_logger.update(loss=loss.item()) 93 | metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) 94 | metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) 95 | # gather the stats from all processes 96 | metric_logger.synchronize_between_processes() 97 | print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' 98 | .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) 99 | 100 | return {k: meter.global_avg for k, meter in 
metric_logger.meters.items()} 101 | -------------------------------------------------------------------------------- /classification/hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | from models import * 4 | 5 | dependencies = ["torch", "torchvision", "timm"] 6 | -------------------------------------------------------------------------------- /classification/losses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | """ 4 | Implements the knowledge distillation loss 5 | """ 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | 10 | class DistillationLoss(torch.nn.Module): 11 | """ 12 | This module wraps a standard criterion and adds an extra knowledge distillation loss by 13 | taking a teacher model prediction and using it as additional supervision. 14 | """ 15 | def __init__(self, base_criterion: torch.nn.Module, teacher_model: torch.nn.Module, 16 | distillation_type: str, alpha: float, tau: float): 17 | super().__init__() 18 | self.base_criterion = base_criterion 19 | self.teacher_model = teacher_model 20 | assert distillation_type in ['none', 'soft', 'hard'] 21 | self.distillation_type = distillation_type 22 | self.alpha = alpha 23 | self.tau = tau 24 | 25 | def forward(self, inputs, outputs, labels): 26 | """ 27 | Args: 28 | inputs: The original inputs that are feed to the teacher model 29 | outputs: the outputs of the model to be trained. It is expected to be 30 | either a Tensor, or a Tuple[Tensor, Tensor], with the original output 31 | in the first position and the distillation predictions as the second output 32 | labels: the labels for the base criterion 33 | """ 34 | outputs_kd = None 35 | if not isinstance(outputs, torch.Tensor): 36 | # assume that the model outputs a tuple of [outputs, outputs_kd] 37 | outputs, outputs_kd = outputs 38 | base_loss = self.base_criterion(outputs, labels) 39 | if self.distillation_type == 'none': 40 | return base_loss 41 | 42 | if outputs_kd is None: 43 | raise ValueError("When knowledge distillation is enabled, the model is " 44 | "expected to return a Tuple[Tensor, Tensor] with the output of the " 45 | "class_token and the dist_token") 46 | # don't backprop throught the teacher 47 | with torch.no_grad(): 48 | teacher_outputs = self.teacher_model(inputs) 49 | 50 | if self.distillation_type == 'soft': 51 | T = self.tau 52 | # taken from https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 53 | # with slight modifications 54 | distillation_loss = F.kl_div( 55 | F.log_softmax(outputs_kd / T, dim=1), 56 | F.log_softmax(teacher_outputs / T, dim=1), 57 | reduction='sum', 58 | log_target=True 59 | ) * (T * T) / outputs_kd.numel() 60 | elif self.distillation_type == 'hard': 61 | distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(dim=1)) 62 | 63 | loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha 64 | return loss 65 | -------------------------------------------------------------------------------- /classification/mcloader/__init__.py: -------------------------------------------------------------------------------- 1 | from .classification import ClassificationDataset 2 | from .data_prefetcher import DataPrefetcher -------------------------------------------------------------------------------- /classification/mcloader/classification.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | from .imagenet import ImageNet 4 | 5 | 6 | class ClassificationDataset(Dataset): 7 | """Dataset for classification. 8 | """ 9 | 10 | def __init__(self, split='train', pipeline=None): 11 | if split == 'train': 12 | self.data_source = ImageNet(root='data/imagenet/train', 13 | list_file='data/imagenet/meta/train.txt', 14 | memcached=True, 15 | mclient_path='/mnt/lustre/share/memcached_client') 16 | else: 17 | self.data_source = ImageNet(root='data/imagenet/val', 18 | list_file='data/imagenet/meta/val.txt', 19 | memcached=True, 20 | mclient_path='/mnt/lustre/share/memcached_client') 21 | self.pipeline = pipeline 22 | 23 | def __len__(self): 24 | return self.data_source.get_length() 25 | 26 | def __getitem__(self, idx): 27 | img, target = self.data_source.get_sample(idx) 28 | if self.pipeline is not None: 29 | img = self.pipeline(img) 30 | 31 | return img, target 32 | -------------------------------------------------------------------------------- /classification/mcloader/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class DataPrefetcher: 5 | def __init__(self, loader): 6 | self.loader = iter(loader) 7 | self.stream = torch.cuda.Stream() 8 | self.preload() 9 | 10 | def preload(self): 11 | try: 12 | self.next_input, self.next_target = next(self.loader) 13 | except StopIteration: 14 | self.next_input = None 15 | self.next_target = None 16 | return 17 | 18 | with torch.cuda.stream(self.stream): 19 | self.next_input = self.next_input.cuda(non_blocking=True) 20 | self.next_target = self.next_target.cuda(non_blocking=True) 21 | 22 | def next(self): 23 | torch.cuda.current_stream().wait_stream(self.stream) 24 | input = self.next_input 25 | target = self.next_target 26 | if input is not None: 27 | self.preload() 28 | return input, target 29 | -------------------------------------------------------------------------------- /classification/mcloader/image_list.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | 4 | from .mcloader import McLoader 5 | 6 | 7 | class ImageList(object): 8 | 9 | def __init__(self, root, list_file, memcached=False, mclient_path=None): 10 | with open(list_file, 'r') as f: 11 | lines = f.readlines() 12 | self.has_labels = len(lines[0].split()) == 2 13 | if self.has_labels: 14 | self.fns, self.labels = zip(*[l.strip().split() for l in lines]) 15 | self.labels = [int(l) for l in self.labels] 16 | else: 17 | self.fns = [l.strip() for l in lines] 18 | self.fns = [os.path.join(root, fn) for fn in self.fns] 19 | self.memcached = memcached 20 | self.mclient_path = mclient_path 21 | self.initialized = False 22 | 23 | def _init_memcached(self): 24 | if not self.initialized: 25 | assert self.mclient_path is not None 26 | self.mc_loader = McLoader(self.mclient_path) 27 | self.initialized = True 28 | 29 | def get_length(self): 30 | return len(self.fns) 31 | 32 | def get_sample(self, idx): 33 | if self.memcached: 34 | self._init_memcached() 35 | if self.memcached: 36 | img = self.mc_loader(self.fns[idx]) 37 | else: 38 | img = Image.open(self.fns[idx]) 39 | img = img.convert('RGB') 40 | if self.has_labels: 41 | target = self.labels[idx] 42 | return img, target 43 | else: 44 | return img 45 | -------------------------------------------------------------------------------- 
/classification/mcloader/imagenet.py: -------------------------------------------------------------------------------- 1 | from .image_list import ImageList 2 | 3 | 4 | class ImageNet(ImageList): 5 | 6 | def __init__(self, root, list_file, memcached, mclient_path): 7 | super(ImageNet, self).__init__( 8 | root, list_file, memcached, mclient_path) 9 | -------------------------------------------------------------------------------- /classification/mcloader/mcloader.py: -------------------------------------------------------------------------------- 1 | import io 2 | from PIL import Image 3 | try: 4 | import mc 5 | except ImportError as E: 6 | pass 7 | 8 | 9 | def pil_loader(img_str): 10 | buff = io.BytesIO(img_str) 11 | return Image.open(buff) 12 | 13 | 14 | class McLoader(object): 15 | 16 | def __init__(self, mclient_path): 17 | assert mclient_path is not None, \ 18 | "Please specify 'data_mclient_path' in the config." 19 | self.mclient_path = mclient_path 20 | server_list_config_file = "{}/server_list.conf".format( 21 | self.mclient_path) 22 | client_config_file = "{}/client.conf".format(self.mclient_path) 23 | self.mclient = mc.MemcachedClient.GetInstance(server_list_config_file, 24 | client_config_file) 25 | 26 | def __call__(self, fn): 27 | try: 28 | img_value = mc.pyvector() 29 | self.mclient.Get(fn, img_value) 30 | img_value_str = mc.ConvertBuffer(img_value) 31 | img = pil_loader(img_value_str) 32 | except: 33 | print('Read image failed ({})'.format(fn)) 34 | return None 35 | else: 36 | return img -------------------------------------------------------------------------------- /classification/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpt import * 2 | from .pvt import * 3 | -------------------------------------------------------------------------------- /classification/models/dpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpt import * 2 | -------------------------------------------------------------------------------- /classification/models/dpt/box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | __all__ = ["pointCoder", "pointwhCoder"] 8 | 9 | 10 | class pointCoder(nn.Module): 11 | def __init__(self, input_size, patch_count, weights=(1., 1.), tanh=True): 12 | super().__init__() 13 | self.input_size = input_size 14 | self.patch_count = patch_count 15 | self.weights = weights 16 | self._generate_anchor() 17 | self.tanh = tanh 18 | 19 | def _generate_anchor(self): 20 | anchors = [] 21 | patch_stride = 1. 
/ self.patch_count 22 | for i in range(self.patch_count): 23 | for j in range(self.patch_count): 24 | y = (0.5+i)*patch_stride 25 | x = (0.5+j)*patch_stride 26 | anchors.append([x, y]) 27 | anchors = torch.as_tensor(anchors) 28 | self.register_buffer("anchor", anchors) 29 | 30 | @torch.cuda.amp.autocast(enabled=False) 31 | def forward(self, pts, model_offset=None): 32 | assert model_offset is None 33 | self.boxes = self.decode(pts) 34 | return self.boxes 35 | 36 | def decode(self, rel_codes): 37 | # print ('xyxy decoding') 38 | boxes = self.anchor 39 | pixel = 1./self.patch_count 40 | wx, wy = self.weights 41 | 42 | dx = F.tanh(rel_codes[:, :, 0]/wx) * pixel if self.tanh else rel_codes[:, :, 0]*pixel / wx 43 | dy = F.tanh(rel_codes[:, :, 1]/wy) * pixel if self.tanh else rel_codes[:, :, 1]*pixel / wy 44 | 45 | pred_boxes = torch.zeros_like(rel_codes) 46 | 47 | ref_x = boxes[:,0].unsqueeze(0) 48 | ref_y = boxes[:,1].unsqueeze(0) 49 | 50 | pred_boxes[:, :, 0] = dx + ref_x 51 | pred_boxes[:, :, 1] = dy + ref_y 52 | pred_boxes = pred_boxes.clamp_(min=0., max=1.) 53 | 54 | return pred_boxes 55 | 56 | def get_offsets(self): 57 | return (self.boxes - self.anchor) * self.input_size 58 | 59 | 60 | class pointwhCoder(pointCoder): 61 | def __init__(self, input_size, patch_count, weights=(1., 1.), pts=1, tanh=True, wh_bias=None): 62 | super().__init__(input_size=input_size, patch_count=patch_count, weights=weights, tanh=tanh) 63 | self.patch_pixel = pts 64 | self.wh_bias = None 65 | if wh_bias is not None: 66 | self.wh_bias = nn.Parameter(torch.zeros(2) + wh_bias) 67 | 68 | @torch.cuda.amp.autocast(enabled=False) 69 | def forward(self, pts, model_offset=None): 70 | assert model_offset is None 71 | if self.wh_bias is not None: 72 | pts[:, :, 2:] = pts[:, :, 2:] + self.wh_bias 73 | self.boxes = self.decode(pts) 74 | points = self.meshgrid(self.boxes) 75 | return points 76 | 77 | def decode(self, rel_codes): 78 | # print ('xyxy decoding') 79 | boxes = self.anchor 80 | pixel = 1./self.patch_count 81 | wx, wy, wh, ww = self.weights 82 | 83 | dx = F.tanh(rel_codes[:, :, 0]/wx) * pixel if self.tanh else rel_codes[:, :, 0]*pixel / wx 84 | dy = F.tanh(rel_codes[:, :, 1]/wy) * pixel if self.tanh else rel_codes[:, :, 1]*pixel / wy 85 | 86 | dw = F.relu(F.tanh(rel_codes[:, :, 2]/ww)) * pixel 87 | dh = F.relu(F.tanh(rel_codes[:, :, 3]/wh)) * pixel 88 | 89 | pred_boxes = torch.zeros_like(rel_codes) 90 | 91 | ref_x = boxes[:,0].unsqueeze(0) 92 | ref_y = boxes[:,1].unsqueeze(0) 93 | 94 | pred_boxes[:, :, 0] = dx + ref_x - dw 95 | pred_boxes[:, :, 1] = dy + ref_y - dh 96 | pred_boxes[:, :, 2] = dx + ref_x + dw 97 | pred_boxes[:, :, 3] = dy + ref_y + dh 98 | pred_boxes = pred_boxes.clamp_(min=0., max=1.) 
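# each row of pred_boxes is an (x1, y1, x2, y2) patch box in normalized [0, 1] image coordinates: the anchor center shifted by the predicted (dx, dy) and expanded by the predicted half-extents (dw, dh)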
99 | 100 | return pred_boxes 101 | 102 | def get_offsets(self): 103 | return (self.boxes - self.anchor.repeat(1,2)) * self.input_size 104 | 105 | def get_scales(self): 106 | return (self.boxes[:, :, 2:] - self.boxes[:, :, :2]) * self.input_size 107 | 108 | def meshgrid(self, boxes): 109 | B = boxes.shape[0] 110 | xs, ys = boxes[:, :, 0::2], boxes[: , :, 1::2] 111 | xs = torch.nn.functional.interpolate(xs, size=self.patch_pixel, mode='linear', align_corners=True) 112 | ys = torch.nn.functional.interpolate(ys, size=self.patch_pixel, mode='linear', align_corners=True) 113 | xs, ys = xs.unsqueeze(3).repeat_interleave(self.patch_pixel, dim=3), ys.unsqueeze(2).repeat_interleave(self.patch_pixel, dim=2) 114 | results = torch.stack([xs, ys], dim = -1) 115 | results = results.reshape(B, self.patch_count*self.patch_count*self.patch_pixel*self.patch_pixel, 2) 116 | return results 117 | -------------------------------------------------------------------------------- /classification/models/dpt/depatch_embed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from functools import partial 4 | 5 | from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD 6 | from timm.models.helpers import load_pretrained 7 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 8 | from timm.models.resnet import resnet26d, resnet50d 9 | from timm.models.registry import register_model 10 | 11 | from timm.models import create_model 12 | from timm.models.vision_transformer import _cfg, Block 13 | from .ms_deform_attn_func import MSDeformAttnFunction 14 | 15 | class PatchEmbed(nn.Module): 16 | """ Image to Patch Embedding 17 | """ 18 | def __init__(self, img_size=224, patch_size=16, patch_count=14, in_chans=3, embed_dim=768, with_norm=False): 19 | super().__init__() 20 | patch_stride = img_size // patch_count 21 | patch_pad = (patch_stride * (patch_count - 1) + patch_size - img_size) // 2 22 | img_size = to_2tuple(img_size) 23 | patch_size = to_2tuple(patch_size) 24 | num_patches = patch_count * patch_count 25 | self.img_size = img_size 26 | self.patch_size = patch_size 27 | self.num_patches = num_patches 28 | 29 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_stride, padding=patch_pad) 30 | if with_norm: 31 | self.norm = nn.LayerNorm(embed_dim) 32 | 33 | def forward(self, x, **kwargs): 34 | B, C, H, W = x.shape 35 | # FIXME look at relaxing size constraints 36 | assert H == self.img_size[0] and W == self.img_size[1], \ 37 | f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
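# conv projection: (B, C, H, W) -> (B, embed_dim, patch_count, patch_count), then flatten the spatial dims and transpose to (B, num_patches, embed_dim)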
38 | x = self.proj(x).flatten(2).transpose(1, 2) 39 | if hasattr(self, "norm"): 40 | x = self.norm(x) 41 | assert x.shape[1] == self.num_patches 42 | return x 43 | 44 | 45 | class Simple_Patch(nn.Module): 46 | def __init__(self, offset_embed, img_size=224, patch_size=16, patch_pixel=16, patch_count=14, 47 | in_chans=3, embed_dim=192, another_linear=False, use_GE=False, local_feature=False, with_norm=False): 48 | super().__init__() 49 | self.num_patches = patch_count * patch_count 50 | self.another_linear = another_linear 51 | if self.another_linear: 52 | self.patch_embed = PatchEmbed(img_size, 1 if local_feature else patch_size, patch_count, in_chans, embed_dim, with_norm=with_norm) 53 | self.act = nn.GELU() if use_GE else nn.Identity() 54 | self.offset_predictor = nn.Linear(embed_dim, offset_embed, bias=False) 55 | else: 56 | self.patch_embed = PatchEmbed(img_size, 1 if local_feature else patch_size, patch_count, in_chans, offset_embed) 57 | 58 | self.img_size, self.patch_size, self.patch_pixel, self.patch_count = img_size, patch_size, patch_pixel, patch_count 59 | self.in_chans, self.embed_dim = in_chans, embed_dim 60 | 61 | def reset_offset(self): 62 | if self.another_linear: 63 | nn.init.constant_(self.offset_predictor.weight, 0) 64 | if hasattr(self.offset_predictor, "bias") and self.offset_predictor.bias is not None: 65 | nn.init.constant_(self.offset_predictor.bias, 0) 66 | else: 67 | nn.init.constant_(self.patch_embed.proj.weight, 0) 68 | if hasattr(self.patch_embed.proj, "bias") and self.patch_embed.proj.bias is not None: 69 | nn.init.constant_(self.patch_embed.proj.bias, 0) 70 | print("Parameter for offsets reseted.") 71 | 72 | @torch.cuda.amp.autocast(enabled=False) 73 | def forward(self, x, model_offset=None): 74 | if x.dim() == 3: 75 | B, H, W = x.shape[0], self.img_size, self.img_size 76 | assert x.shape[1] == H * W 77 | x = x.view(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 78 | B, C, H, W = x.shape 79 | img = x 80 | x = self.patch_embed(x) 81 | if self.another_linear: 82 | pred_offset = self.offset_predictor(self.act(x)) 83 | else: 84 | pred_offset = x.contiguous() 85 | return self.get_output(img, pred_offset, model_offset), (self.patch_count, self.patch_count) 86 | 87 | class Simple_DePatch(Simple_Patch): 88 | def __init__(self, box_coder, show_dim=4, **kwargs): 89 | super().__init__(show_dim, **kwargs) 90 | self.box_coder = box_coder 91 | self.register_buffer("value_spatial_shapes", torch.as_tensor([[self.img_size, self.img_size]], dtype=torch.long)) 92 | self.register_buffer("value_level_start_index", torch.as_tensor([0], dtype=torch.long)) 93 | self.output_proj = nn.Linear(self.in_chans * self.patch_pixel * self.patch_pixel, self.embed_dim) 94 | self.num_sample_points = self.patch_pixel * self.patch_pixel * self.patch_count * self.patch_count 95 | if kwargs["with_norm"]: 96 | self.with_norm=True 97 | self.norm = nn.LayerNorm(self.embed_dim) 98 | else: 99 | self.with_norm=False 100 | 101 | def get_output(self, img, pred_offset, model_offset=None): 102 | #copyed 103 | B = img.shape[0] 104 | sample_location = self.box_coder(pred_offset, model_offset) 105 | sampling_locations = sample_location.view(B, self.num_sample_points,1,1,1,2).to(torch.float) 106 | attention_weights = torch.ones((B, self.num_sample_points, 1, 1, 1), device=img.device) 107 | x = img.view(B, self.in_chans, 1, -1).transpose(1, 3).contiguous() 108 | output = MSDeformAttnFunction.apply(x, self.value_spatial_shapes, self.value_level_start_index, sampling_locations, attention_weights, 1) 109 | # 
output_proj 110 | output = output.view(B, self.num_patches, self.in_chans*self.patch_pixel*self.patch_pixel) 111 | output = self.output_proj(output) 112 | if self.with_norm: 113 | output = self.norm(output) 114 | return output 115 | -------------------------------------------------------------------------------- /classification/models/dpt/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = 
attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /classification/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.7.0 2 | torchvision==0.8.1 3 | timm==0.3.2 4 | -------------------------------------------------------------------------------- /classification/run_with_submitit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | """ 4 | A script to run multinode training with submitit. 5 | """ 6 | import argparse 7 | import os 8 | import os.path as osp 9 | import uuid 10 | from pathlib import Path 11 | 12 | import main as classification 13 | import submitit 14 | 15 | 16 | def parse_args(): 17 | classification_parser = classification.get_args_parser() 18 | parser = argparse.ArgumentParser("Submitit for DeiT", parents=[classification_parser]) 19 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 20 | parser.add_argument("--nodes", default=2, type=int, help="Number of nodes to request") 21 | parser.add_argument("--timeout", default=2800, type=int, help="Duration of the job") 22 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 23 | 24 | parser.add_argument("--partition", default="learnfair", type=str, help="Partition where to submit") 25 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 26 | parser.add_argument('--comment', default="", type=str, 27 | help='Comment to pass to scheduler, e.g. priority message') 28 | return parser.parse_args() 29 | 30 | 31 | def get_shared_folder() -> Path: 32 | root = '/mnt/lustre/wangwenhai/workspace/PVT/' 33 | if Path(osp.join(root, 'checkpoints/')).is_dir(): 34 | p = Path(osp.join(root, 'checkpoints/experiments/')) 35 | p.mkdir(exist_ok=True) 36 | return p 37 | raise RuntimeError("No shared folder available") 38 | 39 | 40 | def get_init_file(): 41 | # Init file must not exist, but it's parent dir must exist. 
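# The per-job init file also serves as the torch.distributed rendezvous point: main() converts it to a file:// URI and stores it in args.dist_url, so every task of a job can initialize its process group against the same path.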
42 | os.makedirs(str(get_shared_folder()), exist_ok=True) 43 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 44 | if init_file.exists(): 45 | os.remove(str(init_file)) 46 | return init_file 47 | 48 | 49 | class Trainer(object): 50 | def __init__(self, args): 51 | self.args = args 52 | 53 | def __call__(self): 54 | import main as classification 55 | 56 | self._setup_gpu_args() 57 | classification.main(self.args) 58 | 59 | def checkpoint(self): 60 | import os 61 | import submitit 62 | 63 | self.args.dist_url = get_init_file().as_uri() 64 | checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth") 65 | if os.path.exists(checkpoint_file): 66 | self.args.resume = checkpoint_file 67 | print("Requeuing ", self.args) 68 | empty_trainer = type(self)(self.args) 69 | return submitit.helpers.DelayedSubmission(empty_trainer) 70 | 71 | def _setup_gpu_args(self): 72 | import submitit 73 | from pathlib import Path 74 | 75 | job_env = submitit.JobEnvironment() 76 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) 77 | self.args.gpu = job_env.local_rank 78 | self.args.rank = job_env.global_rank 79 | self.args.world_size = job_env.num_tasks 80 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 81 | 82 | 83 | def main(): 84 | args = parse_args() 85 | if args.job_dir == "": 86 | args.job_dir = get_shared_folder() / "%j" 87 | 88 | # Note that the folder will depend on the job_id, to easily track experiments 89 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 90 | 91 | num_gpus_per_node = args.ngpus 92 | nodes = args.nodes 93 | timeout_min = args.timeout 94 | 95 | partition = args.partition 96 | kwargs = {} 97 | if args.use_volta32: 98 | kwargs['slurm_constraint'] = 'volta32gb' 99 | if args.comment: 100 | kwargs['slurm_comment'] = args.comment 101 | 102 | executor.update_parameters( 103 | # mem_gb=40 * num_gpus_per_node, 104 | # gpus_per_node=num_gpus_per_node, 105 | tasks_per_node=num_gpus_per_node, # one task per GPU 106 | # cpus_per_task=10, 107 | nodes=nodes, 108 | timeout_min=60 * 24 * 10, # max is 60 * 72 109 | # Below are cluster dependent parameters 110 | slurm_gres="gpu:%d" % num_gpus_per_node, 111 | slurm_partition=partition, 112 | slurm_signal_delay_s=120, 113 | slurm_additional_parameters={ 114 | 'qos': 'non-preemptable', 115 | 'mpi': 'pmi2' 116 | }, 117 | **kwargs 118 | ) 119 | 120 | executor.update_parameters(name="deit") 121 | 122 | args.dist_url = get_init_file().as_uri() 123 | args.output_dir = args.job_dir 124 | 125 | trainer = Trainer(args) 126 | job = executor.submit(trainer) 127 | 128 | print("Submitted job_id:", job.job_id) 129 | 130 | 131 | if __name__ == "__main__": 132 | main() 133 | -------------------------------------------------------------------------------- /classification/samplers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | import torch 4 | import torch.distributed as dist 5 | import math 6 | 7 | 8 | class RASampler(torch.utils.data.Sampler): 9 | """Sampler that restricts data loading to a subset of the dataset for distributed, 10 | with repeated augmentation. 
11 | It ensures that different each augmented version of a sample will be visible to a 12 | different process (GPU) 13 | Heavily based on torch.utils.data.DistributedSampler 14 | """ 15 | 16 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 17 | if num_replicas is None: 18 | if not dist.is_available(): 19 | raise RuntimeError("Requires distributed package to be available") 20 | num_replicas = dist.get_world_size() 21 | if rank is None: 22 | if not dist.is_available(): 23 | raise RuntimeError("Requires distributed package to be available") 24 | rank = dist.get_rank() 25 | self.dataset = dataset 26 | self.num_replicas = num_replicas 27 | self.rank = rank 28 | self.epoch = 0 29 | self.num_samples = int(math.ceil(len(self.dataset) * 3.0 / self.num_replicas)) 30 | self.total_size = self.num_samples * self.num_replicas 31 | # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas)) 32 | self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) 33 | self.shuffle = shuffle 34 | 35 | def __iter__(self): 36 | # deterministically shuffle based on epoch 37 | g = torch.Generator() 38 | g.manual_seed(self.epoch) 39 | if self.shuffle: 40 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 41 | else: 42 | indices = list(range(len(self.dataset))) 43 | 44 | # add extra samples to make it evenly divisible 45 | indices = [ele for ele in indices for i in range(3)] 46 | indices += indices[:(self.total_size - len(indices))] 47 | assert len(indices) == self.total_size 48 | 49 | # subsample 50 | indices = indices[self.rank:self.total_size:self.num_replicas] 51 | assert len(indices) == self.num_samples 52 | 53 | return iter(indices[:self.num_selected_samples]) 54 | 55 | def __len__(self): 56 | return self.num_selected_samples 57 | 58 | def set_epoch(self, epoch): 59 | self.epoch = epoch 60 | -------------------------------------------------------------------------------- /classification/tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = F401,E402,F403,W503,W504 4 | -------------------------------------------------------------------------------- /classification/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | import io 9 | import os 10 | import time 11 | from collections import defaultdict, deque 12 | import datetime 13 | 14 | import torch 15 | import torch.distributed as dist 16 | 17 | 18 | class SmoothedValue(object): 19 | """Track a series of values and provide access to smoothed values over a 20 | window or the global series average. 21 | """ 22 | 23 | def __init__(self, window_size=20, fmt=None): 24 | if fmt is None: 25 | fmt = "{median:.4f} ({global_avg:.4f})" 26 | self.deque = deque(maxlen=window_size) 27 | self.total = 0.0 28 | self.count = 0 29 | self.fmt = fmt 30 | 31 | def update(self, value, n=1): 32 | self.deque.append(value) 33 | self.count += n 34 | self.total += value * n 35 | 36 | def synchronize_between_processes(self): 37 | """ 38 | Warning: does not synchronize the deque! 
39 | """ 40 | if not is_dist_avail_and_initialized(): 41 | return 42 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') 43 | dist.barrier() 44 | dist.all_reduce(t) 45 | t = t.tolist() 46 | self.count = int(t[0]) 47 | self.total = t[1] 48 | 49 | @property 50 | def median(self): 51 | d = torch.tensor(list(self.deque)) 52 | return d.median().item() 53 | 54 | @property 55 | def avg(self): 56 | d = torch.tensor(list(self.deque), dtype=torch.float32) 57 | return d.mean().item() 58 | 59 | @property 60 | def global_avg(self): 61 | return self.total / self.count 62 | 63 | @property 64 | def max(self): 65 | return max(self.deque) 66 | 67 | @property 68 | def value(self): 69 | return self.deque[-1] 70 | 71 | def __str__(self): 72 | return self.fmt.format( 73 | median=self.median, 74 | avg=self.avg, 75 | global_avg=self.global_avg, 76 | max=self.max, 77 | value=self.value) 78 | 79 | 80 | class MetricLogger(object): 81 | def __init__(self, delimiter="\t"): 82 | self.meters = defaultdict(SmoothedValue) 83 | self.delimiter = delimiter 84 | 85 | def update(self, **kwargs): 86 | for k, v in kwargs.items(): 87 | if isinstance(v, torch.Tensor): 88 | v = v.item() 89 | assert isinstance(v, (float, int)) 90 | self.meters[k].update(v) 91 | 92 | def __getattr__(self, attr): 93 | if attr in self.meters: 94 | return self.meters[attr] 95 | if attr in self.__dict__: 96 | return self.__dict__[attr] 97 | raise AttributeError("'{}' object has no attribute '{}'".format( 98 | type(self).__name__, attr)) 99 | 100 | def __str__(self): 101 | loss_str = [] 102 | for name, meter in self.meters.items(): 103 | loss_str.append( 104 | "{}: {}".format(name, str(meter)) 105 | ) 106 | return self.delimiter.join(loss_str) 107 | 108 | def synchronize_between_processes(self): 109 | for meter in self.meters.values(): 110 | meter.synchronize_between_processes() 111 | 112 | def add_meter(self, name, meter): 113 | self.meters[name] = meter 114 | 115 | def log_every(self, iterable, print_freq, header=None): 116 | i = 0 117 | if not header: 118 | header = '' 119 | start_time = time.time() 120 | end = time.time() 121 | iter_time = SmoothedValue(fmt='{avg:.4f}') 122 | data_time = SmoothedValue(fmt='{avg:.4f}') 123 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd' 124 | log_msg = [ 125 | header, 126 | '[{0' + space_fmt + '}/{1}]', 127 | 'eta: {eta}', 128 | '{meters}', 129 | 'time: {time}', 130 | 'data: {data}' 131 | ] 132 | if torch.cuda.is_available(): 133 | log_msg.append('max mem: {memory:.0f}') 134 | log_msg = self.delimiter.join(log_msg) 135 | MB = 1024.0 * 1024.0 136 | for obj in iterable: 137 | data_time.update(time.time() - end) 138 | yield obj 139 | iter_time.update(time.time() - end) 140 | if i % print_freq == 0 or i == len(iterable) - 1: 141 | eta_seconds = iter_time.global_avg * (len(iterable) - i) 142 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 143 | if torch.cuda.is_available(): 144 | print(log_msg.format( 145 | i, len(iterable), eta=eta_string, 146 | meters=str(self), 147 | time=str(iter_time), data=str(data_time), 148 | memory=torch.cuda.max_memory_allocated() / MB)) 149 | else: 150 | print(log_msg.format( 151 | i, len(iterable), eta=eta_string, 152 | meters=str(self), 153 | time=str(iter_time), data=str(data_time))) 154 | i += 1 155 | end = time.time() 156 | total_time = time.time() - start_time 157 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 158 | print('{} Total time: {} ({:.4f} s / it)'.format( 159 | header, total_time_str, total_time / 
len(iterable))) 160 | 161 | 162 | def _load_checkpoint_for_ema(model_ema, checkpoint): 163 | """ 164 | Workaround for ModelEma._load_checkpoint to accept an already-loaded object 165 | """ 166 | mem_file = io.BytesIO() 167 | torch.save(checkpoint, mem_file) 168 | mem_file.seek(0) 169 | model_ema._load_checkpoint(mem_file) 170 | 171 | 172 | def setup_for_distributed(is_master): 173 | """ 174 | This function disables printing when not in master process 175 | """ 176 | import builtins as __builtin__ 177 | builtin_print = __builtin__.print 178 | 179 | def print(*args, **kwargs): 180 | force = kwargs.pop('force', False) 181 | if is_master or force: 182 | builtin_print(*args, **kwargs) 183 | 184 | __builtin__.print = print 185 | 186 | 187 | def is_dist_avail_and_initialized(): 188 | if not dist.is_available(): 189 | return False 190 | if not dist.is_initialized(): 191 | return False 192 | return True 193 | 194 | 195 | def get_world_size(): 196 | if not is_dist_avail_and_initialized(): 197 | return 1 198 | return dist.get_world_size() 199 | 200 | 201 | def get_rank(): 202 | if not is_dist_avail_and_initialized(): 203 | return 0 204 | return dist.get_rank() 205 | 206 | 207 | def is_main_process(): 208 | return get_rank() == 0 209 | 210 | 211 | def save_on_master(*args, **kwargs): 212 | if is_main_process(): 213 | torch.save(*args, **kwargs) 214 | 215 | 216 | def init_distributed_mode(args): 217 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 218 | args.rank = int(os.environ["RANK"]) 219 | args.world_size = int(os.environ['WORLD_SIZE']) 220 | args.gpu = int(os.environ['LOCAL_RANK']) 221 | elif 'SLURM_PROCID' in os.environ: 222 | args.rank = int(os.environ['SLURM_PROCID']) 223 | args.gpu = args.rank % torch.cuda.device_count() 224 | else: 225 | print('Not using distributed mode') 226 | args.distributed = False 227 | return 228 | 229 | args.distributed = True 230 | 231 | torch.cuda.set_device(args.gpu) 232 | args.dist_backend = 'nccl' 233 | print('| distributed init (rank {}): {}'.format( 234 | args.rank, args.dist_url), flush=True) 235 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 236 | world_size=args.world_size, rank=args.rank) 237 | torch.distributed.barrier() 238 | setup_for_distributed(args.rank == 0) 239 | -------------------------------------------------------------------------------- /detection/README.md: -------------------------------------------------------------------------------- 1 | # DPT for Object Detection 2 | 3 | Here is our code for object detection on COCO. Please check [our paper](https://arxiv.org/abs/2107.14467) for detailed information. 4 | 5 | ## Instructions 6 | 7 | ### Preparations 8 | 9 | First, install PyTorch as for classification. 10 | ```bash 11 | conda install pytorch==1.7.1 torchvision==0.8.2 cudatoolkit=10.1 -c pytorch 12 | pip install timm==0.3.2 13 | ``` 14 | 15 | We develop our method with `mmcv==1.2.7` and `mmdet==2.8.0`. We recommend [this document](https://github.com/open-mmlab/mmdetection/blob/v2.8.0/docs/get_started.md) for detailed installation instructions. 
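One way to reproduce that environment on top of the PyTorch install above is sketched below. Only the `mmcv==1.2.7` and `mmdet==2.8.0` versions come from this README; the mmcv wheel index URL is an assumption, so adapt it to your CUDA/torch build.

```bash
pip install mmcv-full==1.2.7 -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
pip install mmdet==2.8.0
```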
16 | 17 | ### Evaluation 18 | 19 | To evaluate RetinaNet on COCO val2017 with 8 gpus run: 20 | ``` 21 | ./dist_test.sh /path/to/config/file /path/to/checkpoint_file 8 --eval bbox 22 | ``` 23 | 24 | For example, to evaluate RetinaNet with DPT-Tiny: 25 | 26 | ``` 27 | ./dist_test.sh configs/retinanet_dpt_t_fpn_1x_coco.py pretrained/detection/retinanet_dpt_t_1x.pth 8 --eval bbox 28 | ``` 29 | 30 | 31 | To evaluate Mask R-CNN on COCO val2017 with 8 gpus run: 32 | 33 | ``` 34 | ./dist_test.sh /path/to/config/file /path/to/checkpoint_file 8 --eval bbox segm 35 | ``` 36 | 37 | For example, to evaluate Mask R-CNN with DPT-Tiny: 38 | 39 | ``` 40 | ./dist_test.sh configs/mask_rcnn_dpt_t_fpn_1x_coco.py pretrained/detection/mrcnn_dpt_t_1x.pth 8 --eval bbox segm 41 | ``` 42 | 43 | ### Training 44 | 45 | Train with certain config file: 46 | 47 | ``` 48 | dist_train.sh /path/to/config/file $NUM_GPUS 49 | ``` 50 | 51 | 52 | For example, to train DPT-Small + Mask R-CNN on COCO train2017 for 12 epochs with 8 gpus: 53 | 54 | ``` 55 | dist_train.sh configs/mask_rcnn_dpt_s_fpn_1x_coco.py 8 56 | ``` 57 | 58 | 59 | ## Results and Models 60 | ### RetinaNet Results 61 | 62 | | Method | #Params (M) | Schedule | mAP | AP50 | AP75 | APs | APm | APl | Download | 63 | |------------|:-----------:|:--------:|:----:|:----:|:----:|:----:|:----:|:----:|:--------:| 64 | | DPT-Tiny | 24.9 | 1x | 39.5 | 60.4 | 41.8 | 23.7 | 43.2 | 52.2 |[Google Drive](https://drive.google.com/file/d/1S84hbeVxPjtcmjeOcae5Sn_XYqyagMn3/view?usp=sharing)| 65 | | DPT-Tiny | 24.9 | MS+3x | 41.2 | 62.0 | 44.0 | 25.7 | 44.6 | 53.9 |[Google Drive](https://drive.google.com/file/d/1OdMjRxjGdwqow124ZB-COgEh-AgV8TxH/view?usp=sharing)| 66 | | DPT-Small | 36.1 | 1x | 42.5 | 63.6 | 45.3 | 26.2 | 45.7 | 56.9 |[Google Drive](https://drive.google.com/file/d/1iVmK6MTdX8n2A7nS31GHUCaY0wcjhWzS/view?usp=sharing)| 67 | | DPT-Small | 36.1 | MS+3x | 43.3 | 64.0 | 46.5 | 27.8 | 46.3 | 58.5 |[Google Drive](https://drive.google.com/file/d/1PvoQYQC6UklSFavkhwqaSO-2zrYnJYkB/view?usp=sharing)| 68 | | DPT-Medium | 55.9 | 1x | 43.3 | 64.6 | 45.9 | 27.2 | 46.7 | 58.6 |[Google Drive](https://drive.google.com/file/d/1AWYLEEZN27sKmkCyV3WEy6mWbp2ar0gI/view?usp=sharing)| 69 | | DPT-Medium | 55.9 | MS+3x | 43.7 | 64.6 | 46.4 | 27.2 | 47.0 | 58.4 |[Google Drive](https://drive.google.com/file/d/1AeAq2nCSohMfKp1Q4WUROx0csYLOZaZG/view?usp=sharing)| 70 | 71 | ### Mask R-CNN Results 72 | 73 | | Method | #Params (M) | Schedule | box mAP | box AP50 | box AP75 | mask mAP | mask AP50 | mask AP75 | Download | 74 | |------------|:-----------:|:--------:|:-------:|:--------:|:--------:|:--------:|:---------:|:---------:|:--------:| 75 | | DPT-Tiny | 34.8 | 1x | 40.2 | 62.8 | 43.8 | 37.7 | 59.8 | 40.4 |[Google Drive](https://drive.google.com/file/d/1jCp5mYqHnNs1Uzrh8uopOyRSw0aVLDQp/view?usp=sharing)| 76 | | DPT-Tiny | 34.8 | MS+3x | 42.2 | 64.4 | 46.1 | 39.4 | 61.5 | 42.3 |[Google Drive](https://drive.google.com/file/d/1S3_ERb4Ak4ksWPXryXUTV1CEEVAP2BAj/view?usp=sharing)| 77 | | DPT-Small | 46.1 | 1x | 43.1 | 65.7 | 47.2 | 39.9 | 62.9 | 43.0 |[Google Drive](https://drive.google.com/file/d/17hg0oLhH96nFTc8H9GAoDpX1Qb7oMGGJ/view?usp=sharing)| 78 | | DPT-Small | 46.1 | MS+3x | 44.4 | 66.5 | 48.9 | 41.0 | 63.6 | 44.2 |[Google Drive](https://drive.google.com/file/d/1aOLw_rVs-LGCKbcMXZN4ogGIOp0rc-UY/view?usp=sharing)| 79 | | DPT-Medium | 65.8 | 1x | 43.8 | 66.2 | 48.3 | 40.3 | 63.1 | 43.4 |[Google Drive](https://drive.google.com/file/d/1pl8W7WW_MN9N9TxgNZB87FuwEw6pM_n_/view?usp=sharing)| 80 | | 
DPT-Medium | 65.8 | MS+3x | 44.3 | 65.6 | 48.8 | 40.7 | 63.1 | 44.1 |[Google Drive](https://drive.google.com/file/d/1_m4Huy1sNiwBDKamPhrvo6cLE6JNJ_kY/view?usp=sharing)| 81 | 82 | ### Other links 83 | 84 | These models can also be obtained from [BaiduNetdisk](https://pan.baidu.com/s/19nJXoOAK_mljV4BPx1sUSQ). Password for extraction is **DPTs**. 85 | Our result is pretrained on the ImageNet1k dataset. ImageNet1k-pretrained models can be found [here](../classification/README.md). 86 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_detection.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'CocoDataset' 2 | data_root = '/gdata/MSCOCO2017/' 3 | img_norm_cfg = dict( 4 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', with_bbox=True), 8 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 9 | dict(type='RandomFlip', flip_ratio=0.5), 10 | dict(type='Normalize', **img_norm_cfg), 11 | dict(type='Pad', size_divisor=32), 12 | dict(type='DefaultFormatBundle'), 13 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 14 | ] 15 | test_pipeline = [ 16 | dict(type='LoadImageFromFile'), 17 | dict( 18 | type='MultiScaleFlipAug', 19 | img_scale=(1333, 800), 20 | flip=False, 21 | transforms=[ 22 | dict(type='Resize', keep_ratio=True), 23 | dict(type='RandomFlip'), 24 | dict(type='Normalize', **img_norm_cfg), 25 | dict(type='Pad', size_divisor=32), 26 | dict(type='ImageToTensor', keys=['img']), 27 | dict(type='Collect', keys=['img']), 28 | ]) 29 | ] 30 | data = dict( 31 | samples_per_gpu=2, 32 | workers_per_gpu=2, 33 | train=dict( 34 | type=dataset_type, 35 | ann_file=data_root + 'annotations/instances_train2017.json', 36 | img_prefix=data_root + 'train2017/', 37 | pipeline=train_pipeline), 38 | val=dict( 39 | type=dataset_type, 40 | ann_file=data_root + 'annotations/instances_val2017.json', 41 | img_prefix=data_root + 'val2017/', 42 | pipeline=test_pipeline), 43 | test=dict( 44 | type=dataset_type, 45 | ann_file=data_root + 'annotations/instances_val2017.json', 46 | img_prefix=data_root + 'val2017/', 47 | pipeline=test_pipeline)) 48 | evaluation = dict(interval=1, metric='bbox') 49 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_instance.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'CocoDataset' 2 | data_root = '/gdata/MSCOCO2017/' 3 | img_norm_cfg = dict( 4 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 8 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 9 | dict(type='RandomFlip', flip_ratio=0.5), 10 | dict(type='Normalize', **img_norm_cfg), 11 | dict(type='Pad', size_divisor=32), 12 | dict(type='DefaultFormatBundle'), 13 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 14 | ] 15 | test_pipeline = [ 16 | dict(type='LoadImageFromFile'), 17 | dict( 18 | type='MultiScaleFlipAug', 19 | img_scale=(1333, 800), 20 | flip=False, 21 | transforms=[ 22 | dict(type='Resize', keep_ratio=True), 23 | dict(type='RandomFlip'), 24 | dict(type='Normalize', **img_norm_cfg), 25 | dict(type='Pad', size_divisor=32), 26 | dict(type='ImageToTensor', 
keys=['img']), 27 | dict(type='Collect', keys=['img']), 28 | ]) 29 | ] 30 | data = dict( 31 | samples_per_gpu=2, 32 | workers_per_gpu=2, 33 | train=dict( 34 | type=dataset_type, 35 | ann_file=data_root + 'annotations/instances_train2017.json', 36 | img_prefix=data_root + 'train2017/', 37 | pipeline=train_pipeline), 38 | val=dict( 39 | type=dataset_type, 40 | ann_file=data_root + 'annotations/instances_val2017.json', 41 | img_prefix=data_root + 'val2017/', 42 | pipeline=test_pipeline), 43 | test=dict( 44 | type=dataset_type, 45 | ann_file=data_root + 'annotations/instances_val2017.json', 46 | img_prefix=data_root + 'val2017/', 47 | pipeline=test_pipeline)) 48 | evaluation = dict(metric=['bbox', 'segm']) 49 | -------------------------------------------------------------------------------- /detection/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | checkpoint_config = dict(interval=1) 2 | # yapf:disable 3 | log_config = dict( 4 | interval=50, 5 | hooks=[ 6 | dict(type='TextLoggerHook'), 7 | # dict(type='TensorboardLoggerHook') 8 | ]) 9 | # yapf:enable 10 | dist_params = dict(backend='nccl') 11 | log_level = 'INFO' 12 | load_from = None 13 | resume_from = None 14 | workflow = [('train', 1)] 15 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/mask_rcnn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='MaskRCNN', 4 | pretrained='torchvision://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | norm_cfg=dict(type='BN', requires_grad=True), 12 | norm_eval=True, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=256, 22 | feat_channels=256, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[8], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[4, 8, 16, 32, 64]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | roi_head=dict( 36 | type='StandardRoIHead', 37 | bbox_roi_extractor=dict( 38 | type='SingleRoIExtractor', 39 | roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), 40 | out_channels=256, 41 | featmap_strides=[4, 8, 16, 32]), 42 | bbox_head=dict( 43 | type='Shared2FCBBoxHead', 44 | in_channels=256, 45 | fc_out_channels=1024, 46 | roi_feat_size=7, 47 | num_classes=80, 48 | bbox_coder=dict( 49 | type='DeltaXYWHBBoxCoder', 50 | target_means=[0., 0., 0., 0.], 51 | target_stds=[0.1, 0.1, 0.2, 0.2]), 52 | reg_class_agnostic=False, 53 | loss_cls=dict( 54 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 55 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 56 | mask_roi_extractor=dict( 57 | type='SingleRoIExtractor', 58 | roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), 59 | out_channels=256, 60 | featmap_strides=[4, 8, 16, 32]), 61 | mask_head=dict( 62 | type='FCNMaskHead', 63 | num_convs=4, 64 | in_channels=256, 65 | conv_out_channels=256, 66 | num_classes=80, 67 | loss_mask=dict( 68 | type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))) 69 | # model training 
and testing settings 70 | train_cfg = dict( 71 | rpn=dict( 72 | assigner=dict( 73 | type='MaxIoUAssigner', 74 | pos_iou_thr=0.7, 75 | neg_iou_thr=0.3, 76 | min_pos_iou=0.3, 77 | match_low_quality=True, 78 | ignore_iof_thr=-1), 79 | sampler=dict( 80 | type='RandomSampler', 81 | num=256, 82 | pos_fraction=0.5, 83 | neg_pos_ub=-1, 84 | add_gt_as_proposals=False), 85 | allowed_border=-1, 86 | pos_weight=-1, 87 | debug=False), 88 | rpn_proposal=dict( 89 | nms_across_levels=False, 90 | nms_pre=2000, 91 | nms_post=1000, 92 | max_num=1000, 93 | nms_thr=0.7, 94 | min_bbox_size=0), 95 | rcnn=dict( 96 | assigner=dict( 97 | type='MaxIoUAssigner', 98 | pos_iou_thr=0.5, 99 | neg_iou_thr=0.5, 100 | min_pos_iou=0.5, 101 | match_low_quality=True, 102 | ignore_iof_thr=-1), 103 | sampler=dict( 104 | type='RandomSampler', 105 | num=512, 106 | pos_fraction=0.25, 107 | neg_pos_ub=-1, 108 | add_gt_as_proposals=True), 109 | mask_size=28, 110 | pos_weight=-1, 111 | debug=False)) 112 | test_cfg = dict( 113 | rpn=dict( 114 | nms_across_levels=False, 115 | nms_pre=1000, 116 | nms_post=1000, 117 | max_num=1000, 118 | nms_thr=0.7, 119 | min_bbox_size=0), 120 | rcnn=dict( 121 | score_thr=0.05, 122 | nms=dict(type='nms', iou_threshold=0.5), 123 | max_per_img=100, 124 | mask_thr_binary=0.5)) 125 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/retinanet_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | pretrained='torchvision://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | norm_cfg=dict(type='BN', requires_grad=True), 12 | norm_eval=True, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | start_level=1, 19 | add_extra_convs='on_input', 20 | num_outs=5), 21 | bbox_head=dict( 22 | type='RetinaHead', 23 | num_classes=80, 24 | in_channels=256, 25 | stacked_convs=4, 26 | feat_channels=256, 27 | anchor_generator=dict( 28 | type='AnchorGenerator', 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | ratios=[0.5, 1.0, 2.0], 32 | strides=[8, 16, 32, 64, 128]), 33 | bbox_coder=dict( 34 | type='DeltaXYWHBBoxCoder', 35 | target_means=[.0, .0, .0, .0], 36 | target_stds=[1.0, 1.0, 1.0, 1.0]), 37 | loss_cls=dict( 38 | type='FocalLoss', 39 | use_sigmoid=True, 40 | gamma=2.0, 41 | alpha=0.25, 42 | loss_weight=1.0), 43 | loss_bbox=dict(type='L1Loss', loss_weight=1.0))) 44 | # training and testing settings 45 | train_cfg = dict( 46 | assigner=dict( 47 | type='MaxIoUAssigner', 48 | pos_iou_thr=0.5, 49 | neg_iou_thr=0.4, 50 | min_pos_iou=0, 51 | ignore_iof_thr=-1), 52 | allowed_border=-1, 53 | pos_weight=-1, 54 | debug=False) 55 | test_cfg = dict( 56 | nms_pre=1000, 57 | min_bbox_size=0, 58 | score_thr=0.05, 59 | nms=dict(type='nms', iou_threshold=0.5), 60 | max_per_img=100) 61 | -------------------------------------------------------------------------------- /detection/configs/detr_dpt_s_8x2_50ep_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/datasets/coco_detection.py', 3 | '_base_/default_runtime.py' 4 | ] 5 | model = dict( 6 | type='DETR', 7 | pretrained='pretrained/dpt_small.pth', 8 | backbone=dict( 9 | type='dpt_small_f4', 10 | style='pytorch'), 11 | bbox_head=dict( 12 | type='TransformerHead', 13 | num_classes=80, 14 | 
in_channels=512, 15 | num_fcs=2, 16 | transformer=dict( 17 | type='Transformer', 18 | embed_dims=256, 19 | num_heads=8, 20 | num_encoder_layers=6, 21 | num_decoder_layers=6, 22 | feedforward_channels=2048, 23 | dropout=0.1, 24 | act_cfg=dict(type='ReLU', inplace=True), 25 | norm_cfg=dict(type='LN'), 26 | num_fcs=2, 27 | pre_norm=False, 28 | return_intermediate_dec=True), 29 | positional_encoding=dict( 30 | type='SinePositionalEncoding', num_feats=128, normalize=True), 31 | loss_cls=dict( 32 | type='CrossEntropyLoss', 33 | bg_cls_weight=0.1, 34 | use_sigmoid=False, 35 | loss_weight=1.0, 36 | class_weight=1.0), 37 | loss_bbox=dict(type='L1Loss', loss_weight=5.0), 38 | loss_iou=dict(type='GIoULoss', loss_weight=2.0))) 39 | # training and testing settings 40 | train_cfg = dict( 41 | assigner=dict( 42 | type='HungarianAssigner', 43 | cls_cost=dict(type='ClassificationCost', weight=1.), 44 | reg_cost=dict(type='BBoxL1Cost', weight=5.0), 45 | iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))) 46 | test_cfg = dict(max_per_img=100) 47 | img_norm_cfg = dict( 48 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 49 | # train_pipeline, NOTE the img_scale and the Pad's size_divisor is different 50 | # from the default setting in mmdet. 51 | train_pipeline = [ 52 | dict(type='LoadImageFromFile'), 53 | dict(type='LoadAnnotations', with_bbox=True), 54 | dict( 55 | type='Resize', 56 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 57 | (1333, 768), (1333, 800)], 58 | multiscale_mode='value', 59 | keep_ratio=True), 60 | dict(type='RandomFlip', flip_ratio=0.5), 61 | dict(type='Normalize', **img_norm_cfg), 62 | dict(type='Pad', size_divisor=32), 63 | dict(type='DefaultFormatBundle'), 64 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) 65 | ] 66 | # test_pipeline, NOTE the Pad's size_divisor is different from the default 67 | # setting (size_divisor=32). While there is little effect on the performance 68 | # whether we use the default setting or use size_divisor=1. 
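# (Note: in this config the Pad transform below keeps the default size_divisor=32; it is the ResNet-50 DETR baseline config that actually pads test images with size_divisor=1.)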
69 | test_pipeline = [ 70 | dict(type='LoadImageFromFile'), 71 | dict( 72 | type='MultiScaleFlipAug', 73 | img_scale=(1333, 800), 74 | flip=False, 75 | transforms=[ 76 | dict(type='Resize', keep_ratio=True), 77 | dict(type='RandomFlip'), 78 | dict(type='Normalize', **img_norm_cfg), 79 | dict(type='Pad', size_divisor=32), 80 | dict(type='ImageToTensor', keys=['img']), 81 | dict(type='Collect', keys=['img']) 82 | ]) 83 | ] 84 | data = dict( 85 | samples_per_gpu=2, 86 | workers_per_gpu=2, 87 | train=dict(pipeline=train_pipeline), 88 | val=dict(pipeline=test_pipeline), 89 | test=dict(pipeline=test_pipeline)) 90 | # optimizer 91 | optimizer = dict( 92 | type='AdamW', 93 | lr=0.0001, 94 | weight_decay=0.0001, 95 | paramwise_cfg=dict( 96 | custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) 97 | optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) 98 | # learning policy 99 | lr_config = dict(policy='step', step=[33]) 100 | total_epochs = 50 101 | -------------------------------------------------------------------------------- /detection/configs/detr_pvt_s_8x2_50ep_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/datasets/coco_detection.py', 3 | '_base_/default_runtime.py' 4 | ] 5 | model = dict( 6 | type='DETR', 7 | pretrained='pretrained/pvt_small.pth', 8 | backbone=dict( 9 | type='pvt_small_f4', 10 | style='pytorch'), 11 | bbox_head=dict( 12 | type='TransformerHead', 13 | num_classes=80, 14 | in_channels=512, 15 | num_fcs=2, 16 | transformer=dict( 17 | type='Transformer', 18 | embed_dims=256, 19 | num_heads=8, 20 | num_encoder_layers=6, 21 | num_decoder_layers=6, 22 | feedforward_channels=2048, 23 | dropout=0.1, 24 | act_cfg=dict(type='ReLU', inplace=True), 25 | norm_cfg=dict(type='LN'), 26 | num_fcs=2, 27 | pre_norm=False, 28 | return_intermediate_dec=True), 29 | positional_encoding=dict( 30 | type='SinePositionalEncoding', num_feats=128, normalize=True), 31 | loss_cls=dict( 32 | type='CrossEntropyLoss', 33 | bg_cls_weight=0.1, 34 | use_sigmoid=False, 35 | loss_weight=1.0, 36 | class_weight=1.0), 37 | loss_bbox=dict(type='L1Loss', loss_weight=5.0), 38 | loss_iou=dict(type='GIoULoss', loss_weight=2.0))) 39 | # training and testing settings 40 | train_cfg = dict( 41 | assigner=dict( 42 | type='HungarianAssigner', 43 | cls_cost=dict(type='ClassificationCost', weight=1.), 44 | reg_cost=dict(type='BBoxL1Cost', weight=5.0), 45 | iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))) 46 | test_cfg = dict(max_per_img=100) 47 | img_norm_cfg = dict( 48 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 49 | # train_pipeline, NOTE the img_scale and the Pad's size_divisor is different 50 | # from the default setting in mmdet. 51 | train_pipeline = [ 52 | dict(type='LoadImageFromFile'), 53 | dict(type='LoadAnnotations', with_bbox=True), 54 | dict( 55 | type='Resize', 56 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 57 | (1333, 768), (1333, 800)], 58 | multiscale_mode='value', 59 | keep_ratio=True), 60 | dict(type='RandomFlip', flip_ratio=0.5), 61 | dict(type='Normalize', **img_norm_cfg), 62 | dict(type='Pad', size_divisor=32), 63 | dict(type='DefaultFormatBundle'), 64 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) 65 | ] 66 | # test_pipeline, NOTE the Pad's size_divisor is different from the default 67 | # setting (size_divisor=32). While there is little effect on the performance 68 | # whether we use the default setting or use size_divisor=1. 
69 | test_pipeline = [ 70 | dict(type='LoadImageFromFile'), 71 | dict( 72 | type='MultiScaleFlipAug', 73 | img_scale=(1333, 800), 74 | flip=False, 75 | transforms=[ 76 | dict(type='Resize', keep_ratio=True), 77 | dict(type='RandomFlip'), 78 | dict(type='Normalize', **img_norm_cfg), 79 | dict(type='Pad', size_divisor=32), 80 | dict(type='ImageToTensor', keys=['img']), 81 | dict(type='Collect', keys=['img']) 82 | ]) 83 | ] 84 | data = dict( 85 | samples_per_gpu=2, 86 | workers_per_gpu=2, 87 | train=dict(pipeline=train_pipeline), 88 | val=dict(pipeline=test_pipeline), 89 | test=dict(pipeline=test_pipeline)) 90 | # optimizer 91 | optimizer = dict( 92 | type='AdamW', 93 | lr=0.0001, 94 | weight_decay=0.0001, 95 | paramwise_cfg=dict( 96 | custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) 97 | optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) 98 | # learning policy 99 | lr_config = dict(policy='step', step=[33]) 100 | total_epochs = 50 101 | -------------------------------------------------------------------------------- /detection/configs/detr_r50_8x2_50ep_coco_baseline.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/datasets/coco_detection.py', 3 | '_base_/default_runtime.py' 4 | ] 5 | model = dict( 6 | type='DETR', 7 | # pretrained='torchvision://resnet50', 8 | pretrained='pretrained/resnet50-19c8e357.pth', 9 | backbone=dict( 10 | type='ResNet', 11 | depth=50, 12 | num_stages=4, 13 | out_indices=(3,), 14 | frozen_stages=1, 15 | norm_cfg=dict(type='BN', requires_grad=False), 16 | norm_eval=True, 17 | style='pytorch'), 18 | bbox_head=dict( 19 | type='TransformerHead', 20 | num_classes=80, 21 | in_channels=2048, 22 | num_fcs=2, 23 | transformer=dict( 24 | type='Transformer', 25 | embed_dims=256, 26 | num_heads=8, 27 | num_encoder_layers=6, 28 | num_decoder_layers=6, 29 | feedforward_channels=2048, 30 | dropout=0.1, 31 | act_cfg=dict(type='ReLU', inplace=True), 32 | norm_cfg=dict(type='LN'), 33 | num_fcs=2, 34 | pre_norm=False, 35 | return_intermediate_dec=True), 36 | positional_encoding=dict( 37 | type='SinePositionalEncoding', num_feats=128, normalize=True), 38 | loss_cls=dict( 39 | type='CrossEntropyLoss', 40 | bg_cls_weight=0.1, 41 | use_sigmoid=False, 42 | loss_weight=1.0, 43 | class_weight=1.0), 44 | loss_bbox=dict(type='L1Loss', loss_weight=5.0), 45 | loss_iou=dict(type='GIoULoss', loss_weight=2.0))) 46 | # training and testing settings 47 | train_cfg = dict( 48 | assigner=dict( 49 | type='HungarianAssigner', 50 | cls_cost=dict(type='ClassificationCost', weight=1.), 51 | reg_cost=dict(type='BBoxL1Cost', weight=5.0), 52 | iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))) 53 | test_cfg = dict(max_per_img=100) 54 | img_norm_cfg = dict( 55 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 56 | # train_pipeline, NOTE the img_scale and the Pad's size_divisor is different 57 | # from the default setting in mmdet. 
58 | train_pipeline = [ 59 | dict(type='LoadImageFromFile'), 60 | dict(type='LoadAnnotations', with_bbox=True), 61 | dict( 62 | type='Resize', 63 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 64 | (1333, 768), (1333, 800)], 65 | multiscale_mode='value', 66 | keep_ratio=True), 67 | dict(type='RandomFlip', flip_ratio=0.5), 68 | dict(type='Normalize', **img_norm_cfg), 69 | dict(type='Pad', size_divisor=32), 70 | dict(type='DefaultFormatBundle'), 71 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) 72 | ] 73 | # test_pipeline, NOTE the Pad's size_divisor is different from the default 74 | # setting (size_divisor=32). While there is little effect on the performance 75 | # whether we use the default setting or use size_divisor=1. 76 | test_pipeline = [ 77 | dict(type='LoadImageFromFile'), 78 | dict( 79 | type='MultiScaleFlipAug', 80 | img_scale=(1333, 800), 81 | flip=False, 82 | transforms=[ 83 | dict(type='Resize', keep_ratio=True), 84 | dict(type='RandomFlip'), 85 | dict(type='Normalize', **img_norm_cfg), 86 | dict(type='Pad', size_divisor=1), 87 | dict(type='ImageToTensor', keys=['img']), 88 | dict(type='Collect', keys=['img']) 89 | ]) 90 | ] 91 | data = dict( 92 | samples_per_gpu=2, 93 | workers_per_gpu=2, 94 | train=dict(pipeline=train_pipeline), 95 | val=dict(pipeline=test_pipeline), 96 | test=dict(pipeline=test_pipeline)) 97 | # optimizer 98 | optimizer = dict( 99 | type='AdamW', 100 | lr=0.0001, 101 | weight_decay=0.0001, 102 | paramwise_cfg=dict( 103 | custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) 104 | optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) 105 | # learning policy 106 | lr_config = dict(policy='step', step=[33]) 107 | total_epochs = 50 108 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_m_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_medium.pth', 9 | backbone=dict( 10 | type='dpt_medium', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_m_fpn_mstrain-poly_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_medium.pth', 9 | backbone=dict( 10 | type='dpt_medium', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # multi-scale 18 | img_norm_cfg = dict( 19 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 20 | 
train_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='LoadAnnotations', 24 | with_bbox=True, 25 | with_mask=True, 26 | poly2mask=False), 27 | dict( 28 | type='Resize', 29 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 30 | (1333, 768), (1333, 800)], 31 | multiscale_mode='value', 32 | keep_ratio=True), 33 | dict(type='RandomFlip', flip_ratio=0.5), 34 | dict(type='Normalize', **img_norm_cfg), 35 | dict(type='Pad', size_divisor=32), 36 | dict(type='DefaultFormatBundle'), 37 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 38 | ] 39 | test_pipeline = [ 40 | dict(type='LoadImageFromFile'), 41 | dict( 42 | type='MultiScaleFlipAug', 43 | img_scale=(1333, 800), 44 | flip=False, 45 | transforms=[ 46 | dict(type='Resize', keep_ratio=True), 47 | dict(type='RandomFlip'), 48 | dict(type='Normalize', **img_norm_cfg), 49 | dict(type='Pad', size_divisor=32), 50 | dict(type='ImageToTensor', keys=['img']), 51 | dict(type='Collect', keys=['img']), 52 | ]) 53 | ] 54 | data = dict( 55 | train=dict(pipeline=train_pipeline), 56 | val=dict(pipeline=test_pipeline), 57 | test=dict(pipeline=test_pipeline)) 58 | # optimizer 59 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 60 | optimizer_config = dict(grad_clip=None) 61 | # learning policy 62 | lr_config = dict( 63 | policy='step', 64 | warmup='linear', 65 | warmup_iters=500, 66 | warmup_ratio=0.001, 67 | step=[28, 34]) 68 | total_epochs = 36 69 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_s_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_small.pth', 9 | backbone=dict( 10 | type='dpt_small', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_s_fpn_mstrain-poly_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_small.pth', 9 | backbone=dict( 10 | type='dpt_small', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # multi-scale 18 | img_norm_cfg = dict( 19 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 20 | train_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='LoadAnnotations', 24 | with_bbox=True, 25 | with_mask=True, 26 | poly2mask=False), 27 | dict( 28 | type='Resize', 29 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 30 | (1333, 768), 
(1333, 800)], 31 | multiscale_mode='value', 32 | keep_ratio=True), 33 | dict(type='RandomFlip', flip_ratio=0.5), 34 | dict(type='Normalize', **img_norm_cfg), 35 | dict(type='Pad', size_divisor=32), 36 | dict(type='DefaultFormatBundle'), 37 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 38 | ] 39 | test_pipeline = [ 40 | dict(type='LoadImageFromFile'), 41 | dict( 42 | type='MultiScaleFlipAug', 43 | img_scale=(1333, 800), 44 | flip=False, 45 | transforms=[ 46 | dict(type='Resize', keep_ratio=True), 47 | dict(type='RandomFlip'), 48 | dict(type='Normalize', **img_norm_cfg), 49 | dict(type='Pad', size_divisor=32), 50 | dict(type='ImageToTensor', keys=['img']), 51 | dict(type='Collect', keys=['img']), 52 | ]) 53 | ] 54 | data = dict( 55 | train=dict(pipeline=train_pipeline), 56 | val=dict(pipeline=test_pipeline), 57 | test=dict(pipeline=test_pipeline)) 58 | # optimizer 59 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 60 | optimizer_config = dict(grad_clip=None) 61 | # learning policy 62 | lr_config = dict( 63 | policy='step', 64 | warmup='linear', 65 | warmup_iters=500, 66 | warmup_ratio=0.001, 67 | step=[28, 34]) 68 | total_epochs = 36 69 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_t_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_tiny.pth', 9 | backbone=dict( 10 | type='dpt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_t_fpn_mstrain-poly_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_tiny.pth', 9 | backbone=dict( 10 | type='dpt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # multi-scale 18 | img_norm_cfg = dict( 19 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 20 | train_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='LoadAnnotations', 24 | with_bbox=True, 25 | with_mask=True, 26 | poly2mask=False), 27 | dict( 28 | type='Resize', 29 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 30 | (1333, 768), (1333, 800)], 31 | multiscale_mode='value', 32 | keep_ratio=True), 33 | dict(type='RandomFlip', flip_ratio=0.5), 34 | dict(type='Normalize', **img_norm_cfg), 35 | dict(type='Pad', size_divisor=32), 36 | dict(type='DefaultFormatBundle'), 37 | dict(type='Collect', keys=['img', 
'gt_bboxes', 'gt_labels', 'gt_masks']), 38 | ] 39 | test_pipeline = [ 40 | dict(type='LoadImageFromFile'), 41 | dict( 42 | type='MultiScaleFlipAug', 43 | img_scale=(1333, 800), 44 | flip=False, 45 | transforms=[ 46 | dict(type='Resize', keep_ratio=True), 47 | dict(type='RandomFlip'), 48 | dict(type='Normalize', **img_norm_cfg), 49 | dict(type='Pad', size_divisor=32), 50 | dict(type='ImageToTensor', keys=['img']), 51 | dict(type='Collect', keys=['img']), 52 | ]) 53 | ] 54 | data = dict( 55 | train=dict(pipeline=train_pipeline), 56 | val=dict(pipeline=test_pipeline), 57 | test=dict(pipeline=test_pipeline)) 58 | # optimizer 59 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 60 | optimizer_config = dict(grad_clip=None) 61 | # learning policy 62 | lr_config = dict( 63 | policy='step', 64 | warmup='linear', 65 | warmup_iters=500, 66 | warmup_ratio=0.001, 67 | step=[28, 34]) 68 | total_epochs = 36 69 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_pvt_s_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/pvt_small.pth', 9 | backbone=dict( 10 | type='pvt_small', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_pvt_t_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/pvt_tiny.pth', 9 | backbone=dict( 10 | type='pvt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | -------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_m_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/retinanet_r50_fpn.py', 3 | '_base_/datasets/coco_detection.py', 4 | '_base_/default_runtime.py' 5 | ] 6 | model = dict( 7 | pretrained='pretrained/dpt_medium.pth', 8 | backbone=dict( 9 | type='dpt_medium', 10 | style='pytorch'), 11 | neck=dict( 12 | type='FPN', 13 | in_channels=[64, 128, 320, 512], 14 | out_channels=256, 15 | start_level=1, 16 | add_extra_convs='on_input', 17 | num_outs=5)) 18 | # optimizer 19 | 
optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 20 | optimizer_config = dict(grad_clip=None) 21 | # learning policy 22 | lr_config = dict( 23 | policy='step', 24 | warmup='linear', 25 | warmup_iters=500, 26 | warmup_ratio=0.001, 27 | step=[8, 11]) 28 | total_epochs = 12 29 | -------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_m_fpn_mstrain_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_detection.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_medium.pth', 9 | backbone=dict( 10 | type='dpt_medium', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs='on_input', 18 | num_outs=5)) 19 | img_norm_cfg = dict( 20 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 21 | train_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict(type='LoadAnnotations', with_bbox=True), 24 | dict( 25 | type='Resize', 26 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 27 | (1333, 768), (1333, 800)], 28 | multiscale_mode='value', 29 | keep_ratio=True), 30 | dict(type='RandomFlip', flip_ratio=0.5), 31 | dict(type='Normalize', **img_norm_cfg), 32 | dict(type='Pad', size_divisor=32), 33 | dict(type='DefaultFormatBundle'), 34 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 35 | ] 36 | test_pipeline = [ 37 | dict(type='LoadImageFromFile'), 38 | dict( 39 | type='MultiScaleFlipAug', 40 | img_scale=(1333, 800), 41 | flip=False, 42 | transforms=[ 43 | dict(type='Resize', keep_ratio=True), 44 | dict(type='RandomFlip'), 45 | dict(type='Normalize', **img_norm_cfg), 46 | dict(type='Pad', size_divisor=32), 47 | dict(type='ImageToTensor', keys=['img']), 48 | dict(type='Collect', keys=['img']), 49 | ]) 50 | ] 51 | data = dict( 52 | train=dict(pipeline=train_pipeline), 53 | val=dict(pipeline=test_pipeline), 54 | test=dict(pipeline=test_pipeline)) 55 | # optimizer 56 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 57 | optimizer_config = dict(grad_clip=None) 58 | # learning policy 59 | lr_config = dict( 60 | policy='step', 61 | warmup='linear', 62 | warmup_iters=500, 63 | warmup_ratio=0.001, 64 | step=[28, 34]) 65 | total_epochs = 36 66 | -------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_s_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/retinanet_r50_fpn.py', 3 | '_base_/datasets/coco_detection.py', 4 | '_base_/default_runtime.py' 5 | ] 6 | model = dict( 7 | pretrained='pretrained/dpt_small.pth', 8 | backbone=dict( 9 | type='dpt_small', 10 | style='pytorch'), 11 | neck=dict( 12 | type='FPN', 13 | in_channels=[64, 128, 320, 512], 14 | out_channels=256, 15 | start_level=1, 16 | add_extra_convs='on_input', 17 | num_outs=5)) 18 | # optimizer 19 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 20 | optimizer_config = dict(grad_clip=None) 21 | # learning policy 22 | lr_config = dict( 23 | policy='step', 24 | warmup='linear', 25 | warmup_iters=500, 26 | warmup_ratio=0.001, 27 | step=[8, 11]) 28 | total_epochs = 12 29 | 
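The DPT/PVT detection configs above all follow the same recipe: inherit the `_base_` model and dataset files, swap the backbone `type` and `pretrained` checkpoint, set the FPN `in_channels` to the transformer stage widths `[64, 128, 320, 512]`, and use AdamW as the optimizer. A minimal sketch of inspecting one of them with mmcv's `Config` API, assuming `mmcv==1.2.7` is installed and the working directory is `detection/`:

```python
# Sketch only: load a DPT RetinaNet config and print the pieces that differ from the ResNet-50 base.
from mmcv import Config

cfg = Config.fromfile('configs/retinanet_dpt_s_fpn_1x_coco.py')
print(cfg.model.backbone.type)     # 'dpt_small'
print(cfg.model.neck.in_channels)  # [64, 128, 320, 512]
print(cfg.optimizer)               # AdamW, lr=1e-4, weight_decay=1e-4
print(cfg.total_epochs)            # 12
```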
-------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_s_fpn_mstrain_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_detection.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_small.pth', 9 | backbone=dict( 10 | type='dpt_small', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs='on_input', 18 | num_outs=5)) 19 | img_norm_cfg = dict( 20 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 21 | train_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict(type='LoadAnnotations', with_bbox=True), 24 | dict( 25 | type='Resize', 26 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 27 | (1333, 768), (1333, 800)], 28 | multiscale_mode='value', 29 | keep_ratio=True), 30 | dict(type='RandomFlip', flip_ratio=0.5), 31 | dict(type='Normalize', **img_norm_cfg), 32 | dict(type='Pad', size_divisor=32), 33 | dict(type='DefaultFormatBundle'), 34 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 35 | ] 36 | test_pipeline = [ 37 | dict(type='LoadImageFromFile'), 38 | dict( 39 | type='MultiScaleFlipAug', 40 | img_scale=(1333, 800), 41 | flip=False, 42 | transforms=[ 43 | dict(type='Resize', keep_ratio=True), 44 | dict(type='RandomFlip'), 45 | dict(type='Normalize', **img_norm_cfg), 46 | dict(type='Pad', size_divisor=32), 47 | dict(type='ImageToTensor', keys=['img']), 48 | dict(type='Collect', keys=['img']), 49 | ]) 50 | ] 51 | data = dict( 52 | train=dict(pipeline=train_pipeline), 53 | val=dict(pipeline=test_pipeline), 54 | test=dict(pipeline=test_pipeline)) 55 | # optimizer 56 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 57 | optimizer_config = dict(grad_clip=None) 58 | # learning policy 59 | lr_config = dict( 60 | policy='step', 61 | warmup='linear', 62 | warmup_iters=500, 63 | warmup_ratio=0.001, 64 | step=[28, 34]) 65 | total_epochs = 36 66 | -------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_t_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_detection.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_tiny.pth', 9 | backbone=dict( 10 | type='dpt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs='on_input', 18 | num_outs=5)) 19 | # optimizer 20 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 21 | optimizer_config = dict(grad_clip=None) 22 | # learning policy 23 | lr_config = dict( 24 | policy='step', 25 | warmup='linear', 26 | warmup_iters=500, 27 | warmup_ratio=0.001, 28 | step=[8, 11]) 29 | total_epochs = 12 30 | -------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_t_fpn_mstrain_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 
'../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_detection.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_tiny.pth', 9 | backbone=dict( 10 | type='dpt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs='on_input', 18 | num_outs=5)) 19 | img_norm_cfg = dict( 20 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 21 | train_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict(type='LoadAnnotations', with_bbox=True), 24 | dict( 25 | type='Resize', 26 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 27 | (1333, 768), (1333, 800)], 28 | multiscale_mode='value', 29 | keep_ratio=True), 30 | dict(type='RandomFlip', flip_ratio=0.5), 31 | dict(type='Normalize', **img_norm_cfg), 32 | dict(type='Pad', size_divisor=32), 33 | dict(type='DefaultFormatBundle'), 34 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 35 | ] 36 | test_pipeline = [ 37 | dict(type='LoadImageFromFile'), 38 | dict( 39 | type='MultiScaleFlipAug', 40 | img_scale=(1333, 800), 41 | flip=False, 42 | transforms=[ 43 | dict(type='Resize', keep_ratio=True), 44 | dict(type='RandomFlip'), 45 | dict(type='Normalize', **img_norm_cfg), 46 | dict(type='Pad', size_divisor=32), 47 | dict(type='ImageToTensor', keys=['img']), 48 | dict(type='Collect', keys=['img']), 49 | ]) 50 | ] 51 | data = dict( 52 | train=dict(pipeline=train_pipeline), 53 | val=dict(pipeline=test_pipeline), 54 | test=dict(pipeline=test_pipeline)) 55 | # optimizer 56 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 57 | optimizer_config = dict(grad_clip=None) 58 | # learning policy 59 | lr_config = dict( 60 | policy='step', 61 | warmup='linear', 62 | warmup_iters=500, 63 | warmup_ratio=0.001, 64 | step=[28, 34]) 65 | total_epochs = 36 66 | -------------------------------------------------------------------------------- /detection/configs/retinanet_pvt_s_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/retinanet_r50_fpn.py', 3 | '_base_/datasets/coco_detection.py', 4 | '_base_/default_runtime.py' 5 | ] 6 | model = dict( 7 | pretrained='pretrained/pvt_small.pth', 8 | backbone=dict( 9 | type='pvt_small', 10 | style='pytorch'), 11 | neck=dict( 12 | type='FPN', 13 | in_channels=[64, 128, 320, 512], 14 | out_channels=256, 15 | start_level=1, 16 | add_extra_convs='on_input', 17 | num_outs=5)) 18 | # optimizer 19 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 20 | optimizer_config = dict(grad_clip=None) 21 | # learning policy 22 | lr_config = dict( 23 | policy='step', 24 | warmup='linear', 25 | warmup_iters=500, 26 | warmup_ratio=0.001, 27 | step=[8, 11]) 28 | total_epochs = 12 29 | -------------------------------------------------------------------------------- /detection/configs/retinanet_pvt_s_fpn_1x_coco_640.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/default_runtime.py' 4 | ] 5 | model = dict( 6 | pretrained='pretrained/pvt_small.pth', 7 | backbone=dict( 8 | type='pvt_small', 9 | style='pytorch'), 10 | neck=dict( 11 | type='FPN', 12 | in_channels=[64, 128, 320, 512], 13 | out_channels=256, 14 | start_level=1, 15 | 
add_extra_convs='on_input', 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | # dataset 29 | dataset_type = 'CocoDataset' 30 | data_root = 'data/coco/' 31 | img_norm_cfg = dict( 32 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 33 | train_pipeline = [ 34 | dict(type='LoadImageFromFile'), 35 | dict(type='LoadAnnotations', with_bbox=True), 36 | dict(type='Resize', img_scale=(1066, 640), keep_ratio=True), 37 | dict(type='RandomFlip', flip_ratio=0.5), 38 | dict(type='Normalize', **img_norm_cfg), 39 | dict(type='Pad', size_divisor=32), 40 | dict(type='DefaultFormatBundle'), 41 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 42 | ] 43 | test_pipeline = [ 44 | dict(type='LoadImageFromFile'), 45 | dict( 46 | type='MultiScaleFlipAug', 47 | img_scale=(1066, 640), 48 | flip=False, 49 | transforms=[ 50 | dict(type='Resize', keep_ratio=True), 51 | dict(type='RandomFlip'), 52 | dict(type='Normalize', **img_norm_cfg), 53 | dict(type='Pad', size_divisor=32), 54 | dict(type='ImageToTensor', keys=['img']), 55 | dict(type='Collect', keys=['img']), 56 | ]) 57 | ] 58 | data = dict( 59 | samples_per_gpu=2, 60 | workers_per_gpu=2, 61 | train=dict( 62 | type=dataset_type, 63 | ann_file=data_root + 'annotations/instances_train2017.json', 64 | img_prefix=data_root + 'train2017/', 65 | pipeline=train_pipeline), 66 | val=dict( 67 | type=dataset_type, 68 | ann_file=data_root + 'annotations/instances_val2017.json', 69 | img_prefix=data_root + 'val2017/', 70 | pipeline=test_pipeline), 71 | test=dict( 72 | type=dataset_type, 73 | ann_file=data_root + 'annotations/instances_val2017.json', 74 | img_prefix=data_root + 'val2017/', 75 | pipeline=test_pipeline)) 76 | evaluation = dict(interval=1, metric='bbox') 77 | -------------------------------------------------------------------------------- /detection/configs/retinanet_pvt_t_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_detection.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/pvt_tiny.pth', 9 | backbone=dict( 10 | type='pvt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs='on_input', 18 | num_outs=5)) 19 | # optimizer 20 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 21 | optimizer_config = dict(grad_clip=None) 22 | # learning policy 23 | lr_config = dict( 24 | policy='step', 25 | warmup='linear', 26 | warmup_iters=500, 27 | warmup_ratio=0.001, 28 | step=[8, 11]) 29 | total_epochs = 12 -------------------------------------------------------------------------------- /detection/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29500} 7 | 8 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 9 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch 
${@:4} 11 | -------------------------------------------------------------------------------- /detection/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29500} 6 | 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} 10 | -------------------------------------------------------------------------------- /detection/dpt_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpt import * 2 | -------------------------------------------------------------------------------- /detection/dpt_models/box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | 7 | __all__ = ["pointCoder", "pointwhCoder"] 8 | 9 | 10 | class pointCoder(nn.Module): 11 | def __init__(self, input_size, patch_count, weights=(1., 1.), tanh=True): 12 | super().__init__() 13 | self.input_size = input_size 14 | self.patch_count = patch_count 15 | self.weights = weights 16 | #self._generate_anchor() 17 | self.tanh = tanh 18 | 19 | def _generate_anchor(self, device="cpu"): 20 | anchors = [] 21 | patch_stride_y, patch_stride_x = 1. / self.patch_count[0], 1. / self.patch_count[1] 22 | for i in range(self.patch_count[0]): 23 | for j in range(self.patch_count[1]): 24 | y = (0.5+i)*patch_stride_y 25 | x = (0.5+j)*patch_stride_x 26 | anchors.append([x, y]) 27 | anchors = torch.as_tensor(anchors) 28 | self.anchor = torch.as_tensor(anchors, device=device) 29 | #self.register_buffer("anchor", anchors) 30 | 31 | @torch.cuda.amp.autocast(enabled=False) 32 | def forward(self, pts, model_offset=None): 33 | assert model_offset is None 34 | self.boxes = self.decode(pts) 35 | return self.boxes 36 | 37 | def decode(self, rel_codes): 38 | # print ('xyxy decoding') 39 | boxes = self.anchor 40 | pixel = 1./self.patch_count 41 | wx, wy = self.weights 42 | 43 | dx = F.tanh(rel_codes[:, :, 0]/wx) * pixel if self.tanh else rel_codes[:, :, 0]*pixel / wx 44 | dy = F.tanh(rel_codes[:, :, 1]/wy) * pixel if self.tanh else rel_codes[:, :, 1]*pixel / wy 45 | 46 | pred_boxes = torch.zeros_like(rel_codes) 47 | 48 | ref_x = boxes[:,0].unsqueeze(0) 49 | ref_y = boxes[:,1].unsqueeze(0) 50 | 51 | pred_boxes[:, :, 0] = dx + ref_x 52 | pred_boxes[:, :, 1] = dy + ref_y 53 | pred_boxes = pred_boxes.clamp_(min=0., max=1.)
54 | 55 | return pred_boxes 56 | 57 | def get_offsets(self): 58 | return (self.boxes - self.anchor) * self.input_size 59 | 60 | 61 | class pointwhCoder(pointCoder): 62 | def __init__(self, input_size, patch_count, weights=(1., 1.), pts=1, tanh=True, wh_bias=None): 63 | super().__init__(input_size=input_size, patch_count=patch_count, weights=weights, tanh=tanh) 64 | self.patch_pixel = pts 65 | self.wh_bias = None 66 | if wh_bias is not None: 67 | self.wh_bias = nn.Parameter(torch.zeros(2) + wh_bias) 68 | 69 | @torch.cuda.amp.autocast(enabled=False) 70 | def forward(self, boxes, img_size, output_size): 71 | self.input_size = img_size 72 | self.patch_count = output_size 73 | self._generate_anchor(device=boxes.device) 74 | if self.wh_bias is not None: 75 | boxes[:, :, 2:] = boxes[:, :, 2:] + self.wh_bias 76 | self.boxes = self.decode(boxes) 77 | points = self.meshgrid(self.boxes) 78 | return points 79 | 80 | def decode(self, rel_codes): 81 | # print ('xyxy decoding') 82 | boxes = self.anchor 83 | pixel_x, pixel_y = 1./self.patch_count[1], 1./self.patch_count[0] 84 | wx, wy, wh, ww = self.weights 85 | 86 | dx = F.tanh(rel_codes[:, :, 0]/wx) * pixel_x if self.tanh else rel_codes[:, :, 0]*pixel_x / wx 87 | dy = F.tanh(rel_codes[:, :, 1]/wy) * pixel_y if self.tanh else rel_codes[:, :, 1]*pixel_y / wy 88 | 89 | dw = F.relu(F.tanh(rel_codes[:, :, 2]/ww)) * pixel_x 90 | dh = F.relu(F.tanh(rel_codes[:, :, 3]/wh)) * pixel_y 91 | 92 | pred_boxes = torch.zeros_like(rel_codes) 93 | 94 | ref_x = boxes[:,0].unsqueeze(0) 95 | ref_y = boxes[:,1].unsqueeze(0) 96 | 97 | pred_boxes[:, :, 0] = dx + ref_x - dw 98 | pred_boxes[:, :, 1] = dy + ref_y - dh 99 | pred_boxes[:, :, 2] = dx + ref_x + dw 100 | pred_boxes[:, :, 3] = dy + ref_y + dh 101 | pred_boxes = pred_boxes.clamp_(min=0., max=1.) 
102 | 103 | return pred_boxes 104 | 105 | def get_offsets(self): 106 | return (self.boxes - self.anchor.repeat(1,2)) * self.input_size 107 | 108 | def get_scales(self): 109 | return (self.boxes[:, :, 2:] - self.boxes[:, :, :2]) * self.input_size 110 | 111 | def meshgrid(self, boxes): 112 | B = boxes.shape[0] 113 | xs, ys = boxes[:, :, 0::2], boxes[: , :, 1::2] 114 | xs = torch.nn.functional.interpolate(xs, size=self.patch_pixel, mode='linear', align_corners=True) 115 | ys = torch.nn.functional.interpolate(ys, size=self.patch_pixel, mode='linear', align_corners=True) 116 | xs, ys = xs.unsqueeze(3).repeat_interleave(self.patch_pixel, dim=3), ys.unsqueeze(2).repeat_interleave(self.patch_pixel, dim=2) 117 | results = torch.stack([xs, ys], dim = -1) 118 | results = results.reshape(B, self.patch_count[0]*self.patch_count[1]*self.patch_pixel*self.patch_pixel, 2) 119 | return results 120 | -------------------------------------------------------------------------------- /detection/dpt_models/depatch_embed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from functools import partial 4 | 5 | from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD 6 | from timm.models.helpers import load_pretrained 7 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 8 | from timm.models.resnet import resnet26d, resnet50d 9 | from timm.models.registry import register_model 10 | 11 | from timm.models import create_model 12 | from timm.models.vision_transformer import _cfg, Block 13 | from .ms_deform_attn_func import MSDeformAttnFunction 14 | 15 | class PatchEmbed(nn.Module): 16 | """ Image to Patch Embedding 17 | """ 18 | def __init__(self, img_size=224, patch_size=16, patch_count=14, in_chans=3, embed_dim=768, with_norm=False): 19 | super().__init__() 20 | patch_stride = img_size // patch_count 21 | patch_pad = (patch_stride * (patch_count - 1) + patch_size - img_size) // 2 22 | img_size = to_2tuple(img_size) 23 | patch_size = to_2tuple(patch_size) 24 | num_patches = patch_count * patch_count 25 | self.img_size = img_size 26 | self.patch_size = patch_size 27 | self.num_patches = num_patches 28 | 29 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_stride, padding=patch_pad) 30 | if with_norm: 31 | self.norm = nn.LayerNorm(embed_dim) 32 | 33 | def forward(self, x, **kwargs): 34 | B, C, H, W = x.shape 35 | # FIXME look at relaxing size constraints 36 | #assert H == self.img_size[0] and W == self.img_size[1], \ 37 | # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
38 | x = self.proj(x).flatten(2).transpose(1, 2) 39 | if hasattr(self, "norm"): 40 | x = self.norm(x) 41 | #assert x.shape[1] == self.num_patches 42 | return x 43 | 44 | 45 | class Simple_Patch(nn.Module): 46 | def __init__(self, offset_embed, img_size=224, patch_size=16, patch_pixel=16, patch_count=14, 47 | in_chans=3, embed_dim=192, another_linear=False, use_GE=False, local_feature=False, with_norm=False): 48 | super().__init__() 49 | self.H, self.W = patch_count, patch_count 50 | self.num_patches = patch_count * patch_count 51 | self.another_linear = another_linear 52 | if self.another_linear: 53 | self.patch_embed = PatchEmbed(img_size, 1 if local_feature else patch_size, patch_count, in_chans, embed_dim, with_norm=with_norm) 54 | self.act = nn.GELU() if use_GE else nn.Identity() 55 | self.offset_predictor = nn.Linear(embed_dim, offset_embed, bias=False) 56 | else: 57 | self.patch_embed = PatchEmbed(img_size, 1 if local_feature else patch_size, patch_count, in_chans, offset_embed) 58 | 59 | self.img_size, self.patch_size, self.patch_pixel, self.patch_count = img_size, patch_size, patch_pixel, patch_count 60 | self.in_chans, self.embed_dim = in_chans, embed_dim 61 | 62 | def reset_offset(self): 63 | if self.another_linear: 64 | nn.init.constant_(self.offset_predictor.weight, 0) 65 | if hasattr(self.offset_predictor, "bias") and self.offset_predictor.bias is not None: 66 | nn.init.constant_(self.offset_predictor.bias, 0) 67 | else: 68 | nn.init.constant_(self.patch_embed.proj.weight, 0) 69 | if hasattr(self.patch_embed.proj, "bias") and self.patch_embed.proj.bias is not None: 70 | nn.init.constant_(self.patch_embed.proj.bias, 0) 71 | print("Parameter for offsets reseted.") 72 | 73 | @torch.cuda.amp.autocast(enabled=False) 74 | def forward(self, x): 75 | #if x.dim() == 3: 76 | # B, H, W = x.shape[0], self.img_size, self.img_size 77 | # assert x.shape[1] == H * W 78 | # x = x.view(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 79 | B, C, H, W = x.shape 80 | img = x 81 | x = self.patch_embed(x) 82 | if self.another_linear: 83 | pred_offset = self.offset_predictor(self.act(x)) 84 | else: 85 | pred_offset = x.contiguous() 86 | output_size = (H // self.patch_size, W // self.patch_size) 87 | return self.get_output(img, pred_offset, img_size=(H, W), output_size=output_size), output_size 88 | 89 | class Simple_DePatch(Simple_Patch): 90 | def __init__(self, box_coder, show_dim=4, **kwargs): 91 | super().__init__(show_dim, **kwargs) 92 | self.box_coder = box_coder 93 | #self.register_buffer("value_spatial_shapes", torch.as_tensor([[self.img_size, self.img_size]], dtype=torch.long)) 94 | self.register_buffer("value_level_start_index", torch.as_tensor([0], dtype=torch.long)) 95 | self.output_proj = nn.Linear(self.in_chans * self.patch_pixel * self.patch_pixel, self.embed_dim) 96 | if kwargs["with_norm"]: 97 | self.with_norm=True 98 | self.norm = nn.LayerNorm(self.embed_dim) 99 | else: 100 | self.with_norm=False 101 | 102 | def get_output(self, img, pred_offset, img_size, output_size): 103 | #copyed 104 | B = img.shape[0] 105 | value_spatial_shapes = torch.as_tensor(img_size, dtype=torch.long, device=pred_offset.device).view(1, 2) 106 | num_sample_points = self.patch_pixel * self.patch_pixel * output_size[0] * output_size[1] 107 | 108 | sample_location = self.box_coder(pred_offset, img_size=img_size, output_size=output_size) 109 | sampling_locations = sample_location.view(B, num_sample_points,1,1,1,2).to(torch.float) 110 | attention_weights = torch.ones((B, num_sample_points, 1, 1, 1), 
device=img.device) 111 | x = img.view(B, self.in_chans, 1, -1).transpose(1, 3).contiguous() 112 | output = MSDeformAttnFunction.apply(x, value_spatial_shapes, self.value_level_start_index, sampling_locations, attention_weights, 1) 113 | # output_proj 114 | output = output.view(B, output_size[0]*output_size[1], self.in_chans*self.patch_pixel*self.patch_pixel) 115 | output = self.output_proj(output) 116 | if self.with_norm: 117 | output = self.norm(output) 118 | return output 119 | -------------------------------------------------------------------------------- /detection/dpt_models/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, 
sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /detection/pvt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from functools import partial 5 | 6 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 7 | from timm.models.registry import register_model 8 | from timm.models.vision_transformer import _cfg 9 | from mmdet.models.builder import BACKBONES 10 | from mmdet.utils import get_root_logger 11 | from mmcv.runner import load_checkpoint 12 | 13 | 14 | class Mlp(nn.Module): 15 | def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): 16 | super().__init__() 17 | out_features = out_features or in_features 18 | hidden_features = hidden_features or in_features 19 | self.fc1 = nn.Linear(in_features, hidden_features) 20 | self.act = act_layer() 21 | self.fc2 = nn.Linear(hidden_features, out_features) 22 | self.drop = nn.Dropout(drop) 23 | 24 | def forward(self, x): 25 | x = self.fc1(x) 26 | x = self.act(x) 27 | x = self.drop(x) 28 | x = self.fc2(x) 29 | x = self.drop(x) 30 | return x 31 | 32 | 33 | class Attention(nn.Module): 34 | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1): 35 | super().__init__() 36 | assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
37 | 38 | self.dim = dim 39 | self.num_heads = num_heads 40 | head_dim = dim // num_heads 41 | self.scale = qk_scale or head_dim ** -0.5 42 | 43 | self.q = nn.Linear(dim, dim, bias=qkv_bias) 44 | self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) 45 | self.attn_drop = nn.Dropout(attn_drop) 46 | self.proj = nn.Linear(dim, dim) 47 | self.proj_drop = nn.Dropout(proj_drop) 48 | 49 | self.sr_ratio = sr_ratio 50 | if sr_ratio > 1: 51 | self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) 52 | self.norm = nn.LayerNorm(dim) 53 | 54 | def forward(self, x, H, W): 55 | B, N, C = x.shape 56 | q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) 57 | 58 | if self.sr_ratio > 1: 59 | x_ = x.permute(0, 2, 1).reshape(B, C, H, W) 60 | x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) 61 | x_ = self.norm(x_) 62 | kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 63 | else: 64 | kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 65 | k, v = kv[0], kv[1] 66 | 67 | attn = (q @ k.transpose(-2, -1)) * self.scale 68 | attn = attn.softmax(dim=-1) 69 | attn = self.attn_drop(attn) 70 | 71 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 72 | x = self.proj(x) 73 | x = self.proj_drop(x) 74 | 75 | return x 76 | 77 | 78 | class Block(nn.Module): 79 | 80 | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., 81 | drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1): 82 | super().__init__() 83 | self.norm1 = norm_layer(dim) 84 | self.attn = Attention( 85 | dim, 86 | num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, 87 | attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio) 88 | # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here 89 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 90 | self.norm2 = norm_layer(dim) 91 | mlp_hidden_dim = int(dim * mlp_ratio) 92 | self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) 93 | 94 | def forward(self, x, H, W): 95 | x = x + self.drop_path(self.attn(self.norm1(x), H, W)) 96 | x = x + self.drop_path(self.mlp(self.norm2(x))) 97 | 98 | return x 99 | 100 | 101 | class PatchEmbed(nn.Module): 102 | """ Image to Patch Embedding 103 | """ 104 | 105 | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): 106 | super().__init__() 107 | img_size = to_2tuple(img_size) 108 | patch_size = to_2tuple(patch_size) 109 | 110 | self.img_size = img_size 111 | self.patch_size = patch_size 112 | assert img_size[0] % patch_size[0] == 0 and img_size[1] % patch_size[1] == 0, \ 113 | f"img_size {img_size} should be divided by patch_size {patch_size}." 
114 | self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1] 115 | self.num_patches = self.H * self.W 116 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) 117 | self.norm = nn.LayerNorm(embed_dim) 118 | 119 | def forward(self, x): 120 | B, C, H, W = x.shape 121 | 122 | x = self.proj(x).flatten(2).transpose(1, 2) 123 | x = self.norm(x) 124 | H, W = H // self.patch_size[0], W // self.patch_size[1] 125 | 126 | return x, (H, W) 127 | 128 | 129 | class PyramidVisionTransformer(nn.Module): 130 | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512], 131 | num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., 132 | attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, 133 | depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], F4=False): 134 | super().__init__() 135 | self.num_classes = num_classes 136 | self.depths = depths 137 | self.F4 = F4 138 | 139 | # patch_embed 140 | self.patch_embed1 = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=in_chans, 141 | embed_dim=embed_dims[0]) 142 | self.patch_embed2 = PatchEmbed(img_size=img_size // 4, patch_size=2, in_chans=embed_dims[0], 143 | embed_dim=embed_dims[1]) 144 | self.patch_embed3 = PatchEmbed(img_size=img_size // 8, patch_size=2, in_chans=embed_dims[1], 145 | embed_dim=embed_dims[2]) 146 | self.patch_embed4 = PatchEmbed(img_size=img_size // 16, patch_size=2, in_chans=embed_dims[2], 147 | embed_dim=embed_dims[3]) 148 | 149 | # pos_embed 150 | self.pos_embed1 = nn.Parameter(torch.zeros(1, self.patch_embed1.num_patches, embed_dims[0])) 151 | self.pos_drop1 = nn.Dropout(p=drop_rate) 152 | self.pos_embed2 = nn.Parameter(torch.zeros(1, self.patch_embed2.num_patches, embed_dims[1])) 153 | self.pos_drop2 = nn.Dropout(p=drop_rate) 154 | self.pos_embed3 = nn.Parameter(torch.zeros(1, self.patch_embed3.num_patches, embed_dims[2])) 155 | self.pos_drop3 = nn.Dropout(p=drop_rate) 156 | self.pos_embed4 = nn.Parameter(torch.zeros(1, self.patch_embed4.num_patches + 1, embed_dims[3])) 157 | self.pos_drop4 = nn.Dropout(p=drop_rate) 158 | 159 | # transformer encoder 160 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule 161 | cur = 0 162 | self.block1 = nn.ModuleList([Block( 163 | dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale, 164 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, 165 | sr_ratio=sr_ratios[0]) 166 | for i in range(depths[0])]) 167 | 168 | cur += depths[0] 169 | self.block2 = nn.ModuleList([Block( 170 | dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale, 171 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, 172 | sr_ratio=sr_ratios[1]) 173 | for i in range(depths[1])]) 174 | 175 | cur += depths[1] 176 | self.block3 = nn.ModuleList([Block( 177 | dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale, 178 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, 179 | sr_ratio=sr_ratios[2]) 180 | for i in range(depths[2])]) 181 | 182 | cur += depths[2] 183 | self.block4 = nn.ModuleList([Block( 184 | dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale, 185 | drop=drop_rate, attn_drop=attn_drop_rate, 
drop_path=dpr[cur + i], norm_layer=norm_layer, 186 | sr_ratio=sr_ratios[3]) 187 | for i in range(depths[3])]) 188 | 189 | # init weights 190 | trunc_normal_(self.pos_embed1, std=.02) 191 | trunc_normal_(self.pos_embed2, std=.02) 192 | trunc_normal_(self.pos_embed3, std=.02) 193 | trunc_normal_(self.pos_embed4, std=.02) 194 | self.apply(self._init_weights) 195 | 196 | def init_weights(self, pretrained=None): 197 | if isinstance(pretrained, str): 198 | logger = get_root_logger() 199 | load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger) 200 | 201 | def reset_drop_path(self, drop_path_rate): 202 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))] 203 | cur = 0 204 | for i in range(self.depths[0]): 205 | self.block1[i].drop_path.drop_prob = dpr[cur + i] 206 | 207 | cur += self.depths[0] 208 | for i in range(self.depths[1]): 209 | self.block2[i].drop_path.drop_prob = dpr[cur + i] 210 | 211 | cur += self.depths[1] 212 | for i in range(self.depths[2]): 213 | self.block3[i].drop_path.drop_prob = dpr[cur + i] 214 | 215 | cur += self.depths[2] 216 | for i in range(self.depths[3]): 217 | self.block4[i].drop_path.drop_prob = dpr[cur + i] 218 | 219 | def _init_weights(self, m): 220 | if isinstance(m, nn.Linear): 221 | trunc_normal_(m.weight, std=.02) 222 | if isinstance(m, nn.Linear) and m.bias is not None: 223 | nn.init.constant_(m.bias, 0) 224 | elif isinstance(m, nn.LayerNorm): 225 | nn.init.constant_(m.bias, 0) 226 | nn.init.constant_(m.weight, 1.0) 227 | 228 | def _get_pos_embed(self, pos_embed, patch_embed, H, W): 229 | if H * W == self.patch_embed1.num_patches: 230 | return pos_embed 231 | else: 232 | return F.interpolate( 233 | pos_embed.reshape(1, patch_embed.H, patch_embed.W, -1).permute(0, 3, 1, 2), 234 | size=(H, W), mode="bilinear").reshape(1, -1, H * W).permute(0, 2, 1) 235 | 236 | def forward_features(self, x): 237 | outs = [] 238 | 239 | B = x.shape[0] 240 | 241 | # stage 1 242 | x, (H, W) = self.patch_embed1(x) 243 | pos_embed1 = self._get_pos_embed(self.pos_embed1, self.patch_embed1, H, W) 244 | x = x + pos_embed1 245 | x = self.pos_drop1(x) 246 | for blk in self.block1: 247 | x = blk(x, H, W) 248 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 249 | outs.append(x) 250 | 251 | # stage 2 252 | x, (H, W) = self.patch_embed2(x) 253 | pos_embed2 = self._get_pos_embed(self.pos_embed2, self.patch_embed2, H, W) 254 | x = x + pos_embed2 255 | x = self.pos_drop2(x) 256 | for blk in self.block2: 257 | x = blk(x, H, W) 258 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 259 | outs.append(x) 260 | 261 | # stage 3 262 | x, (H, W) = self.patch_embed3(x) 263 | pos_embed3 = self._get_pos_embed(self.pos_embed3, self.patch_embed3, H, W) 264 | x = x + pos_embed3 265 | x = self.pos_drop3(x) 266 | for blk in self.block3: 267 | x = blk(x, H, W) 268 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 269 | outs.append(x) 270 | 271 | # stage 4 272 | x, (H, W) = self.patch_embed4(x) 273 | pos_embed4 = self._get_pos_embed(self.pos_embed4[:, 1:], self.patch_embed4, H, W) 274 | x = x + pos_embed4 275 | x = self.pos_drop4(x) 276 | for blk in self.block4: 277 | x = blk(x, H, W) 278 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 279 | outs.append(x) 280 | 281 | return outs 282 | 283 | def forward(self, x): 284 | x = self.forward_features(x) 285 | 286 | if self.F4: 287 | x = x[3:4] 288 | 289 | return x 290 | 291 | 292 | def _conv_filter(state_dict, patch_size=16): 293 | """ convert patch embedding weight 
from manual patchify + linear proj to conv""" 294 | out_dict = {} 295 | for k, v in state_dict.items(): 296 | if 'patch_embed.proj.weight' in k: 297 | v = v.reshape((v.shape[0], 3, patch_size, patch_size)) 298 | out_dict[k] = v 299 | 300 | return out_dict 301 | 302 | 303 | @BACKBONES.register_module() 304 | class pvt_tiny(PyramidVisionTransformer): 305 | def __init__(self, **kwargs): 306 | super(pvt_tiny, self).__init__( 307 | patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], 308 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], 309 | sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1) 310 | 311 | 312 | @BACKBONES.register_module() 313 | class pvt_small(PyramidVisionTransformer): 314 | def __init__(self, **kwargs): 315 | super(pvt_small, self).__init__( 316 | patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], 317 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], 318 | sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1) 319 | 320 | 321 | @BACKBONES.register_module() 322 | class pvt_small_f4(PyramidVisionTransformer): 323 | def __init__(self, **kwargs): 324 | super(pvt_small_f4, self).__init__( 325 | patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], 326 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], 327 | sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1, F4=True) 328 | -------------------------------------------------------------------------------- /detection/test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import warnings 4 | 5 | import mmcv 6 | import torch 7 | from mmcv import Config, DictAction 8 | from mmcv.cnn import fuse_conv_bn 9 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 10 | from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, 11 | wrap_fp16_model) 12 | 13 | from mmdet.apis import multi_gpu_test, single_gpu_test 14 | from mmdet.datasets import (build_dataloader, build_dataset, 15 | replace_ImageToTensor) 16 | from mmdet.models import build_detector 17 | import pvt 18 | import dpt_models 19 | 20 | 21 | def parse_args(): 22 | parser = argparse.ArgumentParser( 23 | description='MMDet test (and eval) a model') 24 | parser.add_argument('config', help='test config file path') 25 | parser.add_argument('checkpoint', help='checkpoint file') 26 | parser.add_argument('--out', help='output result file in pickle format') 27 | parser.add_argument( 28 | '--fuse-conv-bn', 29 | action='store_true', 30 | help='Whether to fuse conv and bn, this will slightly increase' 31 | 'the inference speed') 32 | parser.add_argument( 33 | '--format-only', 34 | action='store_true', 35 | help='Format the output results without perform evaluation. 
It is' 36 | 'useful when you want to format the result to a specific format and ' 37 | 'submit it to the test server') 38 | parser.add_argument( 39 | '--eval', 40 | type=str, 41 | nargs='+', 42 | help='evaluation metrics, which depends on the dataset, e.g., "bbox",' 43 | ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') 44 | parser.add_argument('--show', action='store_true', help='show results') 45 | parser.add_argument( 46 | '--show-dir', help='directory where painted images will be saved') 47 | parser.add_argument( 48 | '--show-score-thr', 49 | type=float, 50 | default=0.3, 51 | help='score threshold (default: 0.3)') 52 | parser.add_argument( 53 | '--gpu-collect', 54 | action='store_true', 55 | help='whether to use gpu to collect results.') 56 | parser.add_argument( 57 | '--tmpdir', 58 | help='tmp directory used for collecting results from multiple ' 59 | 'workers, available when gpu-collect is not specified') 60 | parser.add_argument( 61 | '--cfg-options', 62 | nargs='+', 63 | action=DictAction, 64 | help='override some settings in the used config, the key-value pair ' 65 | 'in xxx=yyy format will be merged into config file. If the value to ' 66 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 67 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 68 | 'Note that the quotation marks are necessary and that no white space ' 69 | 'is allowed.') 70 | parser.add_argument( 71 | '--options', 72 | nargs='+', 73 | action=DictAction, 74 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 75 | 'format will be kwargs for dataset.evaluate() function (deprecate), ' 76 | 'change to --eval-options instead.') 77 | parser.add_argument( 78 | '--eval-options', 79 | nargs='+', 80 | action=DictAction, 81 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 82 | 'format will be kwargs for dataset.evaluate() function') 83 | parser.add_argument( 84 | '--launcher', 85 | choices=['none', 'pytorch', 'slurm', 'mpi'], 86 | default='none', 87 | help='job launcher') 88 | parser.add_argument('--local_rank', type=int, default=0) 89 | args = parser.parse_args() 90 | if 'LOCAL_RANK' not in os.environ: 91 | os.environ['LOCAL_RANK'] = str(args.local_rank) 92 | 93 | if args.options and args.eval_options: 94 | raise ValueError( 95 | '--options and --eval-options cannot be both ' 96 | 'specified, --options is deprecated in favor of --eval-options') 97 | if args.options: 98 | warnings.warn('--options is deprecated in favor of --eval-options') 99 | args.eval_options = args.options 100 | return args 101 | 102 | 103 | def main(): 104 | args = parse_args() 105 | 106 | assert args.out or args.eval or args.format_only or args.show \ 107 | or args.show_dir, \ 108 | ('Please specify at least one operation (save/eval/format/show the ' 109 | 'results / save the results) with the argument "--out", "--eval"' 110 | ', "--format-only", "--show" or "--show-dir"') 111 | 112 | if args.eval and args.format_only: 113 | raise ValueError('--eval and --format_only cannot be both specified') 114 | 115 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): 116 | raise ValueError('The output file must be a pkl file.') 117 | 118 | cfg = Config.fromfile(args.config) 119 | if args.cfg_options is not None: 120 | cfg.merge_from_dict(args.cfg_options) 121 | # import modules from string list. 
122 | if cfg.get('custom_imports', None): 123 | from mmcv.utils import import_modules_from_strings 124 | import_modules_from_strings(**cfg['custom_imports']) 125 | # set cudnn_benchmark 126 | if cfg.get('cudnn_benchmark', False): 127 | torch.backends.cudnn.benchmark = True 128 | cfg.model.pretrained = None 129 | if cfg.model.get('neck'): 130 | if isinstance(cfg.model.neck, list): 131 | for neck_cfg in cfg.model.neck: 132 | if neck_cfg.get('rfp_backbone'): 133 | if neck_cfg.rfp_backbone.get('pretrained'): 134 | neck_cfg.rfp_backbone.pretrained = None 135 | elif cfg.model.neck.get('rfp_backbone'): 136 | if cfg.model.neck.rfp_backbone.get('pretrained'): 137 | cfg.model.neck.rfp_backbone.pretrained = None 138 | 139 | # in case the test dataset is concatenated 140 | if isinstance(cfg.data.test, dict): 141 | cfg.data.test.test_mode = True 142 | elif isinstance(cfg.data.test, list): 143 | for ds_cfg in cfg.data.test: 144 | ds_cfg.test_mode = True 145 | 146 | # init distributed env first, since logger depends on the dist info. 147 | if args.launcher == 'none': 148 | distributed = False 149 | else: 150 | distributed = True 151 | init_dist(args.launcher, **cfg.dist_params) 152 | 153 | # build the dataloader 154 | samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) 155 | if samples_per_gpu > 1: 156 | # Replace 'ImageToTensor' to 'DefaultFormatBundle' 157 | cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) 158 | dataset = build_dataset(cfg.data.test) 159 | data_loader = build_dataloader( 160 | dataset, 161 | samples_per_gpu=samples_per_gpu, 162 | workers_per_gpu=cfg.data.workers_per_gpu, 163 | dist=distributed, 164 | shuffle=False) 165 | 166 | # build the model and load checkpoint 167 | model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) 168 | fp16_cfg = cfg.get('fp16', None) 169 | if fp16_cfg is not None: 170 | wrap_fp16_model(model) 171 | checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') 172 | if args.fuse_conv_bn: 173 | model = fuse_conv_bn(model) 174 | # old versions did not save class info in checkpoints, this walkaround is 175 | # for backward compatibility 176 | if 'meta' in checkpoint and 'CLASSES' in checkpoint['meta']: 177 | model.CLASSES = checkpoint['meta']['CLASSES'] 178 | else: 179 | model.CLASSES = dataset.CLASSES 180 | 181 | if not distributed: 182 | model = MMDataParallel(model, device_ids=[0]) 183 | outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, 184 | args.show_score_thr) 185 | else: 186 | model = MMDistributedDataParallel( 187 | model.cuda(), 188 | device_ids=[torch.cuda.current_device()], 189 | broadcast_buffers=False) 190 | outputs = multi_gpu_test(model, data_loader, args.tmpdir, 191 | args.gpu_collect) 192 | 193 | rank, _ = get_dist_info() 194 | if rank == 0: 195 | if args.out: 196 | print(f'\nwriting results to {args.out}') 197 | mmcv.dump(outputs, args.out) 198 | kwargs = {} if args.eval_options is None else args.eval_options 199 | if args.format_only: 200 | dataset.format_results(outputs, **kwargs) 201 | if args.eval: 202 | eval_kwargs = cfg.get('evaluation', {}).copy() 203 | # hard-code way to remove EvalHook args 204 | for key in [ 205 | 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 206 | 'rule' 207 | ]: 208 | eval_kwargs.pop(key, None) 209 | eval_kwargs.update(dict(metric=args.eval, **kwargs)) 210 | print(dataset.evaluate(outputs, **eval_kwargs)) 211 | 212 | 213 | if __name__ == '__main__': 214 | main() 215 | 
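The --cfg-options flag parsed above merges dotted key=value pairs into the loaded config before the dataloader and detector are built, which is how settings such as samples_per_gpu or the pretrained path can be overridden at test time without editing the config file. A minimal sketch of the equivalent programmatic call; the config path and override values here are illustrative:

from mmcv import Config

cfg = Config.fromfile('configs/retinanet_dpt_s_fpn_1x_coco.py')
# same effect as passing: --cfg-options data.samples_per_gpu=1 model.pretrained=None
cfg.merge_from_dict({'data.samples_per_gpu': 1, 'model.pretrained': None})
print(cfg.data.samples_per_gpu, cfg.model.pretrained)  # 1 None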
-------------------------------------------------------------------------------- /detection/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import os 4 | import os.path as osp 5 | import time 6 | import warnings 7 | 8 | import mmcv 9 | import torch 10 | from mmcv import Config, DictAction 11 | from mmcv.runner import get_dist_info, init_dist 12 | from mmcv.utils import get_git_hash 13 | 14 | from mmdet import __version__ 15 | from mmdet.apis import set_random_seed, train_detector 16 | from mmdet.datasets import build_dataset 17 | from mmdet.models import build_detector 18 | from mmdet.utils import collect_env, get_root_logger 19 | import pvt 20 | import dpt_models 21 | 22 | 23 | def parse_args(): 24 | parser = argparse.ArgumentParser(description='Train a detector') 25 | parser.add_argument('config', help='train config file path') 26 | parser.add_argument('--work-dir', help='the dir to save logs and models') 27 | parser.add_argument( 28 | '--resume-from', help='the checkpoint file to resume from') 29 | parser.add_argument( 30 | '--no-validate', 31 | action='store_true', 32 | help='whether not to evaluate the checkpoint during training') 33 | group_gpus = parser.add_mutually_exclusive_group() 34 | group_gpus.add_argument( 35 | '--gpus', 36 | type=int, 37 | help='number of gpus to use ' 38 | '(only applicable to non-distributed training)') 39 | group_gpus.add_argument( 40 | '--gpu-ids', 41 | type=int, 42 | nargs='+', 43 | help='ids of gpus to use ' 44 | '(only applicable to non-distributed training)') 45 | parser.add_argument('--seed', type=int, default=None, help='random seed') 46 | parser.add_argument( 47 | '--deterministic', 48 | action='store_true', 49 | help='whether to set deterministic options for CUDNN backend.') 50 | parser.add_argument( 51 | '--options', 52 | nargs='+', 53 | action=DictAction, 54 | help='override some settings in the used config, the key-value pair ' 55 | 'in xxx=yyy format will be merged into config file (deprecate), ' 56 | 'change to --cfg-options instead.') 57 | parser.add_argument( 58 | '--cfg-options', 59 | nargs='+', 60 | action=DictAction, 61 | help='override some settings in the used config, the key-value pair ' 62 | 'in xxx=yyy format will be merged into config file. If the value to ' 63 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 64 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 65 | 'Note that the quotation marks are necessary and that no white space ' 66 | 'is allowed.') 67 | parser.add_argument( 68 | '--launcher', 69 | choices=['none', 'pytorch', 'slurm', 'mpi'], 70 | default='none', 71 | help='job launcher') 72 | parser.add_argument('--local_rank', type=int, default=0) 73 | args = parser.parse_args() 74 | if 'LOCAL_RANK' not in os.environ: 75 | os.environ['LOCAL_RANK'] = str(args.local_rank) 76 | 77 | if args.options and args.cfg_options: 78 | raise ValueError( 79 | '--options and --cfg-options cannot be both ' 80 | 'specified, --options is deprecated in favor of --cfg-options') 81 | if args.options: 82 | warnings.warn('--options is deprecated in favor of --cfg-options') 83 | args.cfg_options = args.options 84 | 85 | return args 86 | 87 | 88 | def main(): 89 | args = parse_args() 90 | 91 | cfg = Config.fromfile(args.config) 92 | if args.cfg_options is not None: 93 | cfg.merge_from_dict(args.cfg_options) 94 | # import modules from string list. 
95 | if cfg.get('custom_imports', None): 96 | from mmcv.utils import import_modules_from_strings 97 | import_modules_from_strings(**cfg['custom_imports']) 98 | # set cudnn_benchmark 99 | if cfg.get('cudnn_benchmark', False): 100 | torch.backends.cudnn.benchmark = True 101 | 102 | # work_dir is determined in this priority: CLI > segment in file > filename 103 | if args.work_dir is not None: 104 | # update configs according to CLI args if args.work_dir is not None 105 | cfg.work_dir = args.work_dir 106 | elif cfg.get('work_dir', None) is None: 107 | # use config filename as default work_dir if cfg.work_dir is None 108 | cfg.work_dir = osp.join('./work_dirs', 109 | osp.splitext(osp.basename(args.config))[0]) 110 | if args.resume_from is not None: 111 | cfg.resume_from = args.resume_from 112 | if args.gpu_ids is not None: 113 | cfg.gpu_ids = args.gpu_ids 114 | else: 115 | cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) 116 | 117 | # init distributed env first, since logger depends on the dist info. 118 | if args.launcher == 'none': 119 | distributed = False 120 | else: 121 | distributed = True 122 | init_dist(args.launcher, **cfg.dist_params) 123 | # re-set gpu_ids with distributed training mode 124 | _, world_size = get_dist_info() 125 | cfg.gpu_ids = range(world_size) 126 | 127 | # create work_dir 128 | mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) 129 | # dump config 130 | cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) 131 | # init the logger before other steps 132 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 133 | log_file = osp.join(cfg.work_dir, f'{timestamp}.log') 134 | logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) 135 | 136 | # init the meta dict to record some important information such as 137 | # environment info and seed, which will be logged 138 | meta = dict() 139 | # log env info 140 | env_info_dict = collect_env() 141 | env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) 142 | dash_line = '-' * 60 + '\n' 143 | logger.info('Environment info:\n' + dash_line + env_info + '\n' + 144 | dash_line) 145 | meta['env_info'] = env_info 146 | meta['config'] = cfg.pretty_text 147 | # log some basic info 148 | logger.info(f'Distributed training: {distributed}') 149 | logger.info(f'Config:\n{cfg.pretty_text}') 150 | 151 | # set random seeds 152 | if args.seed is not None: 153 | logger.info(f'Set random seed to {args.seed}, ' 154 | f'deterministic: {args.deterministic}') 155 | set_random_seed(args.seed, deterministic=args.deterministic) 156 | cfg.seed = args.seed 157 | meta['seed'] = args.seed 158 | meta['exp_name'] = osp.basename(args.config) 159 | 160 | model = build_detector( 161 | cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) 162 | 163 | datasets = [build_dataset(cfg.data.train)] 164 | if len(cfg.workflow) == 2: 165 | val_dataset = copy.deepcopy(cfg.data.val) 166 | val_dataset.pipeline = cfg.data.train.pipeline 167 | datasets.append(build_dataset(val_dataset)) 168 | if cfg.checkpoint_config is not None: 169 | # save mmdet version, config file content and class names in 170 | # checkpoints as meta data 171 | cfg.checkpoint_config.meta = dict( 172 | mmdet_version=__version__ + get_git_hash()[:7], 173 | CLASSES=datasets[0].CLASSES) 174 | # add an attribute for visualization convenience 175 | model.CLASSES = datasets[0].CLASSES 176 | train_detector( 177 | model, 178 | datasets, 179 | cfg, 180 | distributed=distributed, 181 | validate=(not args.no_validate), 182 | timestamp=timestamp, 183 | 
meta=meta) 184 | 185 | 186 | if __name__ == '__main__': 187 | main() 188 | -------------------------------------------------------------------------------- /ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in 
enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python setup.py build install 11 | -------------------------------------------------------------------------------- /ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | -------------------------------------------------------------------------------- /ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import warnings 14 | import math 15 | 16 | import torch 17 | from torch import nn 18 | import torch.nn.functional as F 19 | from torch.nn.init import xavier_uniform_, constant_ 20 | 21 | from ..functions import MSDeformAttnFunction 22 | 23 | 24 | def _is_power_of_2(n): 25 | if (not isinstance(n, int)) or (n < 0): 26 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 27 | return (n & (n-1) == 0) and n != 0 28 | 29 | 30 | class MSDeformAttn(nn.Module): 31 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 32 | """ 33 | Multi-Scale Deformable Attention Module 34 | :param d_model hidden dimension 35 | :param n_levels number of feature levels 36 | :param n_heads number of attention heads 37 | :param n_points number of sampling points per attention head per feature level 38 | """ 39 | super().__init__() 40 | if d_model % n_heads != 0: 41 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 42 | _d_per_head = d_model // n_heads 43 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 44 | if not _is_power_of_2(_d_per_head): 45 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 46 | "which is more efficient in our CUDA implementation.") 47 | 48 | self.im2col_step = 64 49 | 50 | self.d_model = d_model 51 | self.n_levels = n_levels 52 | self.n_heads = n_heads 53 | self.n_points = n_points 54 | 55 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 56 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 57 | self.value_proj = nn.Linear(d_model, d_model) 58 | self.output_proj = nn.Linear(d_model, d_model) 59 | 60 | self._reset_parameters() 61 | 62 | def _reset_parameters(self): 63 | constant_(self.sampling_offsets.weight.data, 0.) 64 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 65 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 66 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 67 | for i in range(self.n_points): 68 | grid_init[:, :, i, :] *= i + 1 69 | with torch.no_grad(): 70 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 71 | constant_(self.attention_weights.weight.data, 0.) 72 | constant_(self.attention_weights.bias.data, 0.) 73 | xavier_uniform_(self.value_proj.weight.data) 74 | constant_(self.value_proj.bias.data, 0.) 75 | xavier_uniform_(self.output_proj.weight.data) 76 | constant_(self.output_proj.bias.data, 0.) 
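A minimal usage sketch for MSDeformAttn, assuming the extension has been built with ops/make.sh and a CUDA device is available; the import path and the sizes below are illustrative assumptions, and the tensor shapes follow the forward() docstring that comes next.

import torch
from ops.modules import MSDeformAttn  # assumed import path; adjust to however the repo exposes the module

attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4).cuda()

# two feature levels of 32x32 and 16x16, flattened and concatenated along dim 1
spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long).cuda()   # (n_levels, 2)
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)),
                               spatial_shapes.prod(1).cumsum(0)[:-1]))            # (n_levels,)
len_in = int(spatial_shapes.prod(1).sum())                                        # sum of H_l * W_l

query = torch.rand(2, 100, 256).cuda()                # (N, Len_q, C)
reference_points = torch.rand(2, 100, 2, 2).cuda()    # (N, Len_q, n_levels, 2), normalized to [0, 1]
input_flatten = torch.rand(2, len_in, 256).cuda()     # (N, sum H_l*W_l, C)

output = attn(query, reference_points, input_flatten, spatial_shapes, level_start_index)
print(output.shape)                                   # torch.Size([2, 100, 256]) = (N, Len_q, C)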
77 | 78 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 79 | """ 80 | :param query (N, Length_{query}, C) 81 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 82 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 83 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 84 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 85 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 86 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 87 | 88 | :return output (N, Length_{query}, C) 89 | """ 90 | N, Len_q, _ = query.shape 91 | N, Len_in, _ = input_flatten.shape 92 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 93 | 94 | value = self.value_proj(input_flatten) 95 | if input_padding_mask is not None: 96 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 97 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 98 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 99 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 100 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 101 | # N, Len_q, n_heads, n_levels, n_points, 2 102 | if reference_points.shape[-1] == 2: 103 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 104 | sampling_locations = reference_points[:, :, None, :, None, :] \ 105 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 106 | elif reference_points.shape[-1] == 4: 107 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 108 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 109 | else: 110 | raise ValueError( 111 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 112 | output = MSDeformAttnFunction.apply( 113 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 114 | output = self.output_proj(output) 115 | return output 116 | -------------------------------------------------------------------------------- /ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('CUDA is not available') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | 13 | #include <ATen/ATen.h> 14 | #include <ATen/cuda/CUDAContext.h> 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implemented on the CPU"); 27 | } 28 | 29 | std::vector<at::Tensor> 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector<at::Tensor> 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include <ATen/ATen.h> 15 | #include <ATen/cuda/CUDAContext.h> 16 | #include <cuda.h> 17 | #include <cuda_runtime.h> 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data<int64_t>(), 68 | level_start_index.data<int64_t>(), 69 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data<scalar_t>()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 |
const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | grad_output_g.data<scalar_t>(), 137 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data<int64_t>(), 139 | level_start_index.data<int64_t>(), 140 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*!
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector<at::Tensor> 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*!
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | 
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | --------------------------------------------------------------------------------
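A minimal sketch of calling the pure-PyTorch reference path, ms_deform_attn_core_pytorch, on CPU with the same toy shapes used in ops/test.py; this is an illustrative usage assumption, and note that ms_deform_attn_func.py imports the compiled MultiScaleDeformableAttention module at import time, so the extension still has to be installed even though this path never launches a CUDA kernel.

import torch
# mirrors ops/test.py's import style, i.e. run from inside ops/
from functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

N, M, D = 1, 2, 2            # batch, attention heads, channels per head
Lq, L, P = 2, 2, 2           # queries, feature levels, sampling points per level
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)   # (H, W) of each level
S = int(shapes.prod(1).sum())                                  # total number of value locations

value = torch.rand(N, S, M, D) * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2)             # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

output = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(output.shape)                                            # torch.Size([1, 2, 4]) = (N, Lq, M*D)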