├── .gitignore ├── LICENSE ├── README.md ├── classification ├── LICENSE ├── README.md ├── datasets.py ├── dist_resume.sh ├── dist_train.sh ├── engine.py ├── hubconf.py ├── losses.py ├── main.py ├── mcloader │ ├── __init__.py │ ├── classification.py │ ├── data_prefetcher.py │ ├── image_list.py │ ├── imagenet.py │ └── mcloader.py ├── models │ ├── __init__.py │ ├── dpt │ │ ├── __init__.py │ │ ├── box_coder.py │ │ ├── depatch_embed.py │ │ ├── dpt.py │ │ └── ms_deform_attn_func.py │ └── pvt.py ├── requirements.txt ├── run_with_submitit.py ├── samplers.py ├── tox.ini └── utils.py ├── detection ├── README.md ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── coco_detection.py │ │ │ └── coco_instance.py │ │ ├── default_runtime.py │ │ └── models │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ └── retinanet_r50_fpn.py │ ├── detr_dpt_s_8x2_50ep_coco.py │ ├── detr_pvt_s_8x2_50ep_coco.py │ ├── detr_r50_8x2_50ep_coco_baseline.py │ ├── mask_rcnn_dpt_m_fpn_1x_coco.py │ ├── mask_rcnn_dpt_m_fpn_mstrain-poly_3x_coco.py │ ├── mask_rcnn_dpt_s_fpn_1x_coco.py │ ├── mask_rcnn_dpt_s_fpn_mstrain-poly_3x_coco.py │ ├── mask_rcnn_dpt_t_fpn_1x_coco.py │ ├── mask_rcnn_dpt_t_fpn_mstrain-poly_3x_coco.py │ ├── mask_rcnn_pvt_s_fpn_1x_coco.py │ ├── mask_rcnn_pvt_t_fpn_1x_coco.py │ ├── retinanet_dpt_m_fpn_1x_coco.py │ ├── retinanet_dpt_m_fpn_mstrain_3x_coco.py │ ├── retinanet_dpt_s_fpn_1x_coco.py │ ├── retinanet_dpt_s_fpn_mstrain_3x_coco.py │ ├── retinanet_dpt_t_fpn_1x_coco.py │ ├── retinanet_dpt_t_fpn_mstrain_3x_coco.py │ ├── retinanet_pvt_s_fpn_1x_coco.py │ ├── retinanet_pvt_s_fpn_1x_coco_640.py │ └── retinanet_pvt_t_fpn_1x_coco.py ├── dist_test.sh ├── dist_train.sh ├── dpt_models │ ├── __init__.py │ ├── box_coder.py │ ├── depatch_embed.py │ ├── dpt.py │ └── ms_deform_attn_func.py ├── pvt.py ├── test.py └── train.py └── ops ├── functions ├── __init__.py └── ms_deform_attn_func.py ├── make.sh ├── modules ├── __init__.py └── ms_deform_attn.py ├── setup.py ├── src ├── cpu │ ├── ms_deform_attn_cpu.cpp │ └── ms_deform_attn_cpu.h ├── cuda │ ├── ms_deform_attn_cuda.cu │ ├── ms_deform_attn_cuda.h │ └── ms_deform_im2col_cuda.cuh ├── ms_deform_attn.h └── vision.cpp └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | **/__pycache__/** 3 | imnet_resnet50_scratch/timm_temp/ 4 | .dumbo.json 5 | checkpoints/ 6 | data/ 7 | tmp.txt 8 | data 9 | tmp/ 10 | **/data 11 | **/data/ 12 | **/pretrained/ 13 | **/work_dirs/ 14 | **/results.pkl 15 | **/arun_log/ 16 | tmp** 17 | arun_log/ 18 | 19 | .ipynb_checkpoints/ 20 | checkpoint/ 21 | 22 | ops/MultiScaleDeformableAttention.egg-info/ 23 | ops/build/ 24 | ops/dist/ 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DPT 2 | -------- 3 | 4 | This repo is the official implementation of **DPT: Deformable Patch-based Transformer for Visual Recognition (ACM MM2021)**. We provide code and models for the following tasks: 5 | 6 | > **Image Classification**: Detailed instructions and information can be found in [classification/README.md](classification/README.md). 7 | 8 | > **Object Detection**: Detailed instructions and information can be found in [detection/README.md](detection/README.md). 9 | 10 | The paper has been released on [[arXiv](https://arxiv.org/abs/2107.14467)]. 11 | 12 | ## Introduction 13 | 14 | Deformable Patch (DePatch) is a plug-and-play module. It learns to adaptively split the input images into patches with different positions and scales in a data-driven way, rather than using predefined fixed patches. In this way, our method can better preserve the semantics in patches. 15 | 16 | In this repository, we provide code and models for the Deformable Patch-based Transformer (DPT). As this field is developing rapidly, we look forward to seeing DePatch applied to other up-to-date architectures and promoting further research. 17 | 18 | ## Main Results 19 | 20 | ### Image Classification 21 | 22 | Training commands and pretrained models are provided >>> [here](classification) <<<. 23 | 24 | | Method | #Params (M) | FLOPs(G) | Acc@1 | 25 | |------------|:-----------:|:--------:|:-----:| 26 | | DPT-Tiny | 15.2 | 2.1 | 77.4 | 27 | | DPT-Small | 26.4 | 4.0 | 81.0 | 28 | | DPT-Medium | 46.1 | 6.9 | 81.9 | 29 | 30 | ### Object Detection 31 | Training commands and detailed results are provided >>> [here](detection) <<<. 32 | 33 | ## Citation 34 | ``` 35 | @inproceedings{chenDPT21, 36 | title = {DPT: Deformable Patch-based Transformer for Visual Recognition}, 37 | author = {Zhiyang Chen and Yousong Zhu and Chaoyang Zhao and Guosheng Hu and Wei Zeng and Jinqiao Wang and Ming Tang}, 38 | booktitle={Proceedings of the ACM International Conference on Multimedia}, 39 | year={2021} 40 | } 41 | ``` 42 | 43 | ## License 44 | This repository is released under the Apache 2.0 license as found in the [LICENSE](LICENSE) file. 45 | 46 | ## Acknowledgement 47 | Our implementation is mainly based on [PVT](https://github.com/whai362/PVT). The CUDA operator is borrowed from [Deformable-DETR](https://github.com/fundamentalvision/Deformable-DETR). You may refer to these repositories for further information. 48 | -------------------------------------------------------------------------------- /classification/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 - present, Facebook, Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /classification/README.md: -------------------------------------------------------------------------------- 1 | # DPT for Image Classification 2 | ----- 3 | Here is our code for ImageNet classification. Please check our [paper](https://arxiv.org/abs/2107.14467) for detailed information. 4 | 5 | ## Instructions 6 | 7 | ### Environment 8 | 9 | We developed our model with `cuda 10.1`, `pytorch 1.7.1` and `timm 0.3.2`. Other PyTorch versions may also work. We advise you to prepare your environment with `conda`. 10 | ```bash 11 | conda install pytorch==1.7.1 torchvision==0.8.2 cudatoolkit=10.1 -c pytorch 12 | pip install timm==0.3.2 13 | ``` 14 | 15 | Clone our repo and compile the provided CUDA operator. 16 | ```bash 17 | git clone https://github.com/CASIA-IVA-Lab/DPT.git 18 | cd ./ops 19 | sh ./make.sh 20 | # unit test (you should see that all checks are True) 21 | python test.py 22 | ``` 23 | 24 | ### Data Preparation 25 | 26 | We follow the conventional way to prepare the ImageNet dataset. 27 | 28 | The directory structure is the standard layout for the torchvision [`datasets.ImageFolder`](https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder), and the training and validation data are expected to be in the `train/` folder and `val/` folder respectively: 29 | 30 | ``` 31 | /path/to/imagenet/ 32 | train/ 33 | class1/ 34 | img1.jpeg 35 | class2/ 36 | img2.jpeg 37 | val/ 38 | class1/ 39 | img3.jpeg 40 | class2/ 41 | img4.jpeg 42 | ``` 43 | 44 | ### Evaluation 45 | 46 | To evaluate a pretrained model on ImageNet val on a single GPU: 47 | 48 | ```bash 49 | python -m torch.distributed.launch --nproc_per_node 1 --use_env main.py --eval --model $MODEL_NAME --data-path $DATA_PATH --resume $CKPT_PATH 50 | ``` 51 | 52 | Or with multiple GPUs: 53 | 54 | ```bash 55 | python -m torch.distributed.launch --nproc_per_node $NUM_GPUS --use_env main.py --eval --dist-eval --model $MODEL_NAME --data-path $DATA_PATH --resume $CKPT_PATH 56 | ``` 57 | 58 | For example, use 8 GPUs to test our pretrained DPT-Small model:
59 | ```bash 60 | python -m torch.distributed.launch --nproc_per_node 8 --use_env main.py --eval --dist-eval --model dpt_small --data-path $DATA_PATH --resume dpt_small.pth 61 | ``` 62 | which should give: 63 | ``` 64 | * Acc@1 80.954 Acc@5 95.388 loss 0.846 65 | Accuracy of the network on the 50000 test images: 81.0% 66 | ``` 67 | 68 | 69 | ### Training 70 | 71 | To train DPT-Small on ImageNet on a single node with 8 GPUs for 300 epochs, run: 72 | 73 | ```bash 74 | MODEL_NAME=dpt_small 75 | DATA_PATH=/path/to/imagenet 76 | OUTPUT_PATH=/path/to/output 77 | 78 | python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py\ 79 | --model $MODEL_NAME --batch-size 128 --dist-eval --test_interval 5\ 80 | --data-path $DATA_PATH --output_dir $OUTPUT_PATH 81 | ``` 82 | 83 | ## Model Zoo 84 | 85 | | Method | #Params (M) | FLOPs(G) | Acc@1 | Model | 86 | |------------|:-----------:|:--------:|:-----:|:-----:| 87 | | DPT-Tiny | 15.2 | 2.1 | 77.4 | [Google Drive](https://drive.google.com/file/d/1WkuanDQodRun1sJtZmnoUd6pJOpNMetm/view?usp=sharing) | 88 | | DPT-Small | 26.4 | 4.0 | 81.0 | [Google Drive](https://drive.google.com/file/d/1uM4iRLnZ9Omdt_OSPr-aK0uy8rQ5iLjA/view?usp=sharing) | 89 | | DPT-Medium | 46.1 | 6.9 | 81.9 | [Google Drive](https://drive.google.com/file/d/1IoAJoN4VFEiDS17hSwXpTDHiJPivZdsu/view?usp=sharing) | 90 | 91 | You can also obtain the ImageNet-1K pretrained models from [BaiduNetdisk](https://pan.baidu.com/s/1nzfWr90_XP7Ruoj2hBJzLQ). The extraction password is **DPTs**. 92 | 93 | ## License 94 | This repository is released under the Apache 2.0 license as found in the [LICENSE](LICENSE) file. 95 | -------------------------------------------------------------------------------- /classification/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved.
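# Dataset and transform builders (ImageNet, CIFAR-100, iNaturalist) used by main.py.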
3 | import os 4 | import json 5 | 6 | from torchvision import datasets, transforms 7 | from torchvision.datasets.folder import ImageFolder, default_loader 8 | 9 | from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD 10 | from timm.data import create_transform 11 | from mcloader import ClassificationDataset 12 | 13 | 14 | class INatDataset(ImageFolder): 15 | def __init__(self, root, train=True, year=2018, transform=None, target_transform=None, 16 | category='name', loader=default_loader): 17 | self.transform = transform 18 | self.loader = loader 19 | self.target_transform = target_transform 20 | self.year = year 21 | # assert category in ['kingdom','phylum','class','order','supercategory','family','genus','name'] 22 | path_json = os.path.join(root, f'{"train" if train else "val"}{year}.json') 23 | with open(path_json) as json_file: 24 | data = json.load(json_file) 25 | 26 | with open(os.path.join(root, 'categories.json')) as json_file: 27 | data_catg = json.load(json_file) 28 | 29 | path_json_for_targeter = os.path.join(root, f"train{year}.json") 30 | 31 | with open(path_json_for_targeter) as json_file: 32 | data_for_targeter = json.load(json_file) 33 | 34 | targeter = {} 35 | indexer = 0 36 | for elem in data_for_targeter['annotations']: 37 | king = [] 38 | king.append(data_catg[int(elem['category_id'])][category]) 39 | if king[0] not in targeter.keys(): 40 | targeter[king[0]] = indexer 41 | indexer += 1 42 | self.nb_classes = len(targeter) 43 | 44 | self.samples = [] 45 | for elem in data['images']: 46 | cut = elem['file_name'].split('/') 47 | target_current = int(cut[2]) 48 | path_current = os.path.join(root, cut[0], cut[2], cut[3]) 49 | 50 | categors = data_catg[target_current] 51 | target_current_true = targeter[categors[category]] 52 | self.samples.append((path_current, target_current_true)) 53 | 54 | # __getitem__ and __len__ inherited from ImageFolder 55 | 56 | 57 | def build_dataset(is_train, args): 58 | transform = build_transform(is_train, args) 59 | 60 | if args.data_set == 'CIFAR': 61 | dataset = datasets.CIFAR100(args.data_path, train=is_train, transform=transform) 62 | nb_classes = 100 63 | elif args.data_set == 'IMNET': 64 | if not args.use_mcloader: 65 | root = os.path.join(args.data_path, 'train' if is_train else 'val') 66 | dataset = datasets.ImageFolder(root, transform=transform) 67 | else: 68 | dataset = ClassificationDataset( 69 | 'train' if is_train else 'val', 70 | pipeline=transform 71 | ) 72 | nb_classes = 1000 73 | elif args.data_set == 'INAT': 74 | dataset = INatDataset(args.data_path, train=is_train, year=2018, 75 | category=args.inat_category, transform=transform) 76 | nb_classes = dataset.nb_classes 77 | elif args.data_set == 'INAT19': 78 | dataset = INatDataset(args.data_path, train=is_train, year=2019, 79 | category=args.inat_category, transform=transform) 80 | nb_classes = dataset.nb_classes 81 | 82 | return dataset, nb_classes 83 | 84 | 85 | def build_transform(is_train, args): 86 | resize_im = args.input_size > 32 87 | if is_train: 88 | # this should always dispatch to transforms_imagenet_train 89 | transform = create_transform( 90 | input_size=args.input_size, 91 | is_training=True, 92 | color_jitter=args.color_jitter, 93 | auto_augment=args.aa, 94 | interpolation=args.train_interpolation, 95 | re_prob=args.reprob, 96 | re_mode=args.remode, 97 | re_count=args.recount, 98 | ) 99 | if not resize_im: 100 | # replace RandomResizedCropAndInterpolation with 101 | # RandomCrop 102 | transform.transforms[0] = transforms.RandomCrop( 103 | 
args.input_size, padding=4) 104 | return transform 105 | 106 | t = [] 107 | if resize_im: 108 | size = int((256 / 224) * args.input_size) 109 | t.append( 110 | transforms.Resize(size, interpolation=3), # to maintain same ratio w.r.t. 224 images 111 | ) 112 | t.append(transforms.CenterCrop(args.input_size)) 113 | 114 | t.append(transforms.ToTensor()) 115 | t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)) 116 | return transforms.Compose(t) 117 | -------------------------------------------------------------------------------- /classification/dist_resume.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export NCCL_LL_THRESHOLD=0 3 | 4 | ARCH=$1 5 | GPUS=$2 6 | OUT_PATH=$3 7 | PORT=${PORT:-29500} 8 | 9 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | --use_env main.py --model $ARCH --batch-size 128 --epochs 300 --data-path /path/to/imagenet \ 11 | --output_dir $OUT_PATH --resume $OUT_PATH/checkpoint.pth ${@:4} -------------------------------------------------------------------------------- /classification/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export NCCL_LL_THRESHOLD=0 3 | 4 | ARCH=$1 5 | GPUS=$2 6 | OUT_PATH=$3 7 | PORT=${PORT:-29500} 8 | 9 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | --use_env main.py --model $ARCH --batch-size 128 --epochs 300 --data-path /path/to/imagenet \ 11 | --output_dir $OUT_PATH ${@:4} -------------------------------------------------------------------------------- /classification/engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 
3 | """ 4 | Train and eval functions used in main.py 5 | """ 6 | import math 7 | import sys 8 | from typing import Iterable, Optional 9 | 10 | import torch 11 | 12 | from timm.data import Mixup 13 | from timm.utils import accuracy, ModelEma 14 | 15 | from losses import DistillationLoss 16 | import utils 17 | 18 | 19 | def train_one_epoch(model: torch.nn.Module, criterion: DistillationLoss, 20 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 21 | device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, 22 | model_ema: Optional[ModelEma] = None, mixup_fn: Optional[Mixup] = None, 23 | set_training_mode=True, 24 | fp32=False): 25 | model.train(set_training_mode) 26 | metric_logger = utils.MetricLogger(delimiter=" ") 27 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 28 | header = 'Epoch: [{}]'.format(epoch) 29 | print_freq = 10 30 | 31 | for samples, targets in metric_logger.log_every(data_loader, print_freq, header): 32 | samples = samples.to(device, non_blocking=True) 33 | targets = targets.to(device, non_blocking=True) 34 | 35 | if mixup_fn is not None: 36 | samples, targets = mixup_fn(samples, targets) 37 | 38 | # with torch.cuda.amp.autocast(): 39 | # outputs = model(samples) 40 | # loss = criterion(samples, outputs, targets) 41 | with torch.cuda.amp.autocast(enabled=not fp32): 42 | outputs = model(samples) 43 | loss = criterion(samples, outputs, targets) 44 | 45 | loss_value = loss.item() 46 | 47 | if not math.isfinite(loss_value): 48 | print("Loss is {}, stopping training".format(loss_value)) 49 | sys.exit(1) 50 | 51 | optimizer.zero_grad() 52 | 53 | # this attribute is added by timm on one optimizer (adahessian) 54 | is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order 55 | loss_scaler(loss, optimizer, clip_grad=max_norm, 56 | parameters=model.parameters(), create_graph=is_second_order) 57 | 58 | torch.cuda.synchronize() 59 | if model_ema is not None: 60 | model_ema.update(model) 61 | 62 | metric_logger.update(loss=loss_value) 63 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 64 | # gather the stats from all processes 65 | metric_logger.synchronize_between_processes() 66 | print("Averaged stats:", metric_logger) 67 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 68 | 69 | 70 | @torch.no_grad() 71 | def evaluate(data_loader, model, device): 72 | criterion = torch.nn.CrossEntropyLoss() 73 | 74 | metric_logger = utils.MetricLogger(delimiter=" ") 75 | header = 'Test:' 76 | 77 | # switch to evaluation mode 78 | model.eval() 79 | 80 | for images, target in metric_logger.log_every(data_loader, 10, header): 81 | images = images.to(device, non_blocking=True) 82 | target = target.to(device, non_blocking=True) 83 | 84 | # compute output 85 | with torch.cuda.amp.autocast(): 86 | output = model(images) 87 | loss = criterion(output, target) 88 | 89 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 90 | 91 | batch_size = images.shape[0] 92 | metric_logger.update(loss=loss.item()) 93 | metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) 94 | metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) 95 | # gather the stats from all processes 96 | metric_logger.synchronize_between_processes() 97 | print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' 98 | .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) 99 | 100 | return {k: meter.global_avg for k, meter in 
metric_logger.meters.items()} 101 | -------------------------------------------------------------------------------- /classification/hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | from models import * 4 | 5 | dependencies = ["torch", "torchvision", "timm"] 6 | -------------------------------------------------------------------------------- /classification/losses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | """ 4 | Implements the knowledge distillation loss 5 | """ 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | 10 | class DistillationLoss(torch.nn.Module): 11 | """ 12 | This module wraps a standard criterion and adds an extra knowledge distillation loss by 13 | taking a teacher model prediction and using it as additional supervision. 14 | """ 15 | def __init__(self, base_criterion: torch.nn.Module, teacher_model: torch.nn.Module, 16 | distillation_type: str, alpha: float, tau: float): 17 | super().__init__() 18 | self.base_criterion = base_criterion 19 | self.teacher_model = teacher_model 20 | assert distillation_type in ['none', 'soft', 'hard'] 21 | self.distillation_type = distillation_type 22 | self.alpha = alpha 23 | self.tau = tau 24 | 25 | def forward(self, inputs, outputs, labels): 26 | """ 27 | Args: 28 | inputs: The original inputs that are feed to the teacher model 29 | outputs: the outputs of the model to be trained. It is expected to be 30 | either a Tensor, or a Tuple[Tensor, Tensor], with the original output 31 | in the first position and the distillation predictions as the second output 32 | labels: the labels for the base criterion 33 | """ 34 | outputs_kd = None 35 | if not isinstance(outputs, torch.Tensor): 36 | # assume that the model outputs a tuple of [outputs, outputs_kd] 37 | outputs, outputs_kd = outputs 38 | base_loss = self.base_criterion(outputs, labels) 39 | if self.distillation_type == 'none': 40 | return base_loss 41 | 42 | if outputs_kd is None: 43 | raise ValueError("When knowledge distillation is enabled, the model is " 44 | "expected to return a Tuple[Tensor, Tensor] with the output of the " 45 | "class_token and the dist_token") 46 | # don't backprop throught the teacher 47 | with torch.no_grad(): 48 | teacher_outputs = self.teacher_model(inputs) 49 | 50 | if self.distillation_type == 'soft': 51 | T = self.tau 52 | # taken from https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 53 | # with slight modifications 54 | distillation_loss = F.kl_div( 55 | F.log_softmax(outputs_kd / T, dim=1), 56 | F.log_softmax(teacher_outputs / T, dim=1), 57 | reduction='sum', 58 | log_target=True 59 | ) * (T * T) / outputs_kd.numel() 60 | elif self.distillation_type == 'hard': 61 | distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(dim=1)) 62 | 63 | loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha 64 | return loss 65 | -------------------------------------------------------------------------------- /classification/mcloader/__init__.py: -------------------------------------------------------------------------------- 1 | from .classification import ClassificationDataset 2 | from .data_prefetcher import DataPrefetcher -------------------------------------------------------------------------------- /classification/mcloader/classification.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | from .imagenet import ImageNet 4 | 5 | 6 | class ClassificationDataset(Dataset): 7 | """Dataset for classification. 8 | """ 9 | 10 | def __init__(self, split='train', pipeline=None): 11 | if split == 'train': 12 | self.data_source = ImageNet(root='data/imagenet/train', 13 | list_file='data/imagenet/meta/train.txt', 14 | memcached=True, 15 | mclient_path='/mnt/lustre/share/memcached_client') 16 | else: 17 | self.data_source = ImageNet(root='data/imagenet/val', 18 | list_file='data/imagenet/meta/val.txt', 19 | memcached=True, 20 | mclient_path='/mnt/lustre/share/memcached_client') 21 | self.pipeline = pipeline 22 | 23 | def __len__(self): 24 | return self.data_source.get_length() 25 | 26 | def __getitem__(self, idx): 27 | img, target = self.data_source.get_sample(idx) 28 | if self.pipeline is not None: 29 | img = self.pipeline(img) 30 | 31 | return img, target 32 | -------------------------------------------------------------------------------- /classification/mcloader/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class DataPrefetcher: 5 | def __init__(self, loader): 6 | self.loader = iter(loader) 7 | self.stream = torch.cuda.Stream() 8 | self.preload() 9 | 10 | def preload(self): 11 | try: 12 | self.next_input, self.next_target = next(self.loader) 13 | except StopIteration: 14 | self.next_input = None 15 | self.next_target = None 16 | return 17 | 18 | with torch.cuda.stream(self.stream): 19 | self.next_input = self.next_input.cuda(non_blocking=True) 20 | self.next_target = self.next_target.cuda(non_blocking=True) 21 | 22 | def next(self): 23 | torch.cuda.current_stream().wait_stream(self.stream) 24 | input = self.next_input 25 | target = self.next_target 26 | if input is not None: 27 | self.preload() 28 | return input, target 29 | -------------------------------------------------------------------------------- /classification/mcloader/image_list.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | 4 | from .mcloader import McLoader 5 | 6 | 7 | class ImageList(object): 8 | 9 | def __init__(self, root, list_file, memcached=False, mclient_path=None): 10 | with open(list_file, 'r') as f: 11 | lines = f.readlines() 12 | self.has_labels = len(lines[0].split()) == 2 13 | if self.has_labels: 14 | self.fns, self.labels = zip(*[l.strip().split() for l in lines]) 15 | self.labels = [int(l) for l in self.labels] 16 | else: 17 | self.fns = [l.strip() for l in lines] 18 | self.fns = [os.path.join(root, fn) for fn in self.fns] 19 | self.memcached = memcached 20 | self.mclient_path = mclient_path 21 | self.initialized = False 22 | 23 | def _init_memcached(self): 24 | if not self.initialized: 25 | assert self.mclient_path is not None 26 | self.mc_loader = McLoader(self.mclient_path) 27 | self.initialized = True 28 | 29 | def get_length(self): 30 | return len(self.fns) 31 | 32 | def get_sample(self, idx): 33 | if self.memcached: 34 | self._init_memcached() 35 | if self.memcached: 36 | img = self.mc_loader(self.fns[idx]) 37 | else: 38 | img = Image.open(self.fns[idx]) 39 | img = img.convert('RGB') 40 | if self.has_labels: 41 | target = self.labels[idx] 42 | return img, target 43 | else: 44 | return img 45 | -------------------------------------------------------------------------------- 
/classification/mcloader/imagenet.py: -------------------------------------------------------------------------------- 1 | from .image_list import ImageList 2 | 3 | 4 | class ImageNet(ImageList): 5 | 6 | def __init__(self, root, list_file, memcached, mclient_path): 7 | super(ImageNet, self).__init__( 8 | root, list_file, memcached, mclient_path) 9 | -------------------------------------------------------------------------------- /classification/mcloader/mcloader.py: -------------------------------------------------------------------------------- 1 | import io 2 | from PIL import Image 3 | try: 4 | import mc 5 | except ImportError as E: 6 | pass 7 | 8 | 9 | def pil_loader(img_str): 10 | buff = io.BytesIO(img_str) 11 | return Image.open(buff) 12 | 13 | 14 | class McLoader(object): 15 | 16 | def __init__(self, mclient_path): 17 | assert mclient_path is not None, \ 18 | "Please specify 'data_mclient_path' in the config." 19 | self.mclient_path = mclient_path 20 | server_list_config_file = "{}/server_list.conf".format( 21 | self.mclient_path) 22 | client_config_file = "{}/client.conf".format(self.mclient_path) 23 | self.mclient = mc.MemcachedClient.GetInstance(server_list_config_file, 24 | client_config_file) 25 | 26 | def __call__(self, fn): 27 | try: 28 | img_value = mc.pyvector() 29 | self.mclient.Get(fn, img_value) 30 | img_value_str = mc.ConvertBuffer(img_value) 31 | img = pil_loader(img_value_str) 32 | except: 33 | print('Read image failed ({})'.format(fn)) 34 | return None 35 | else: 36 | return img -------------------------------------------------------------------------------- /classification/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpt import * 2 | from .pvt import * 3 | -------------------------------------------------------------------------------- /classification/models/dpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpt import * 2 | -------------------------------------------------------------------------------- /classification/models/dpt/box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | __all__ = ["pointCoder", "pointwhCoder"] 8 | 9 | 10 | class pointCoder(nn.Module): 11 | def __init__(self, input_size, patch_count, weights=(1., 1.), tanh=True): 12 | super().__init__() 13 | self.input_size = input_size 14 | self.patch_count = patch_count 15 | self.weights = weights 16 | self._generate_anchor() 17 | self.tanh = tanh 18 | 19 | def _generate_anchor(self): 20 | anchors = [] 21 | patch_stride = 1. 
/ self.patch_count 22 | for i in range(self.patch_count): 23 | for j in range(self.patch_count): 24 | y = (0.5+i)*patch_stride 25 | x = (0.5+j)*patch_stride 26 | anchors.append([x, y]) 27 | anchors = torch.as_tensor(anchors) 28 | self.register_buffer("anchor", anchors) 29 | 30 | @torch.cuda.amp.autocast(enabled=False) 31 | def forward(self, pts, model_offset=None): 32 | assert model_offset is None 33 | self.boxes = self.decode(pts) 34 | return self.boxes 35 | 36 | def decode(self, rel_codes): 37 | # print ('xyxy decoding') 38 | boxes = self.anchor 39 | pixel = 1./self.patch_count 40 | wx, wy = self.weights 41 | 42 | dx = F.tanh(rel_codes[:, :, 0]/wx) * pixel if self.tanh else rel_codes[:, :, 0]*pixel / wx 43 | dy = F.tanh(rel_codes[:, :, 1]/wy) * pixel if self.tanh else rel_codes[:, :, 1]*pixel / wy 44 | 45 | pred_boxes = torch.zeros_like(rel_codes) 46 | 47 | ref_x = boxes[:,0].unsqueeze(0) 48 | ref_y = boxes[:,1].unsqueeze(0) 49 | 50 | pred_boxes[:, :, 0] = dx + ref_x 51 | pred_boxes[:, :, 1] = dy + ref_y 52 | pred_boxes = pred_boxes.clamp_(min=0., max=1.) 53 | 54 | return pred_boxes 55 | 56 | def get_offsets(self): 57 | return (self.boxes - self.anchor) * self.input_size 58 | 59 | 60 | class pointwhCoder(pointCoder): 61 | def __init__(self, input_size, patch_count, weights=(1., 1.), pts=1, tanh=True, wh_bias=None): 62 | super().__init__(input_size=input_size, patch_count=patch_count, weights=weights, tanh=tanh) 63 | self.patch_pixel = pts 64 | self.wh_bias = None 65 | if wh_bias is not None: 66 | self.wh_bias = nn.Parameter(torch.zeros(2) + wh_bias) 67 | 68 | @torch.cuda.amp.autocast(enabled=False) 69 | def forward(self, pts, model_offset=None): 70 | assert model_offset is None 71 | if self.wh_bias is not None: 72 | pts[:, :, 2:] = pts[:, :, 2:] + self.wh_bias 73 | self.boxes = self.decode(pts) 74 | points = self.meshgrid(self.boxes) 75 | return points 76 | 77 | def decode(self, rel_codes): 78 | # print ('xyxy decoding') 79 | boxes = self.anchor 80 | pixel = 1./self.patch_count 81 | wx, wy, wh, ww = self.weights 82 | 83 | dx = F.tanh(rel_codes[:, :, 0]/wx) * pixel if self.tanh else rel_codes[:, :, 0]*pixel / wx 84 | dy = F.tanh(rel_codes[:, :, 1]/wy) * pixel if self.tanh else rel_codes[:, :, 1]*pixel / wy 85 | 86 | dw = F.relu(F.tanh(rel_codes[:, :, 2]/ww)) * pixel 87 | dh = F.relu(F.tanh(rel_codes[:, :, 3]/wh)) * pixel 88 | 89 | pred_boxes = torch.zeros_like(rel_codes) 90 | 91 | ref_x = boxes[:,0].unsqueeze(0) 92 | ref_y = boxes[:,1].unsqueeze(0) 93 | 94 | pred_boxes[:, :, 0] = dx + ref_x - dw 95 | pred_boxes[:, :, 1] = dy + ref_y - dh 96 | pred_boxes[:, :, 2] = dx + ref_x + dw 97 | pred_boxes[:, :, 3] = dy + ref_y + dh 98 | pred_boxes = pred_boxes.clamp_(min=0., max=1.) 
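# each row of pred_boxes is an (x1, y1, x2, y2) patch box in normalized [0, 1] image coordinates: the anchor center shifted by the predicted (dx, dy) and expanded by the predicted half-extents (dw, dh)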
99 | 100 | return pred_boxes 101 | 102 | def get_offsets(self): 103 | return (self.boxes - self.anchor.repeat(1,2)) * self.input_size 104 | 105 | def get_scales(self): 106 | return (self.boxes[:, :, 2:] - self.boxes[:, :, :2]) * self.input_size 107 | 108 | def meshgrid(self, boxes): 109 | B = boxes.shape[0] 110 | xs, ys = boxes[:, :, 0::2], boxes[: , :, 1::2] 111 | xs = torch.nn.functional.interpolate(xs, size=self.patch_pixel, mode='linear', align_corners=True) 112 | ys = torch.nn.functional.interpolate(ys, size=self.patch_pixel, mode='linear', align_corners=True) 113 | xs, ys = xs.unsqueeze(3).repeat_interleave(self.patch_pixel, dim=3), ys.unsqueeze(2).repeat_interleave(self.patch_pixel, dim=2) 114 | results = torch.stack([xs, ys], dim = -1) 115 | results = results.reshape(B, self.patch_count*self.patch_count*self.patch_pixel*self.patch_pixel, 2) 116 | return results 117 | -------------------------------------------------------------------------------- /classification/models/dpt/depatch_embed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from functools import partial 4 | 5 | from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD 6 | from timm.models.helpers import load_pretrained 7 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 8 | from timm.models.resnet import resnet26d, resnet50d 9 | from timm.models.registry import register_model 10 | 11 | from timm.models import create_model 12 | from timm.models.vision_transformer import _cfg, Block 13 | from .ms_deform_attn_func import MSDeformAttnFunction 14 | 15 | class PatchEmbed(nn.Module): 16 | """ Image to Patch Embedding 17 | """ 18 | def __init__(self, img_size=224, patch_size=16, patch_count=14, in_chans=3, embed_dim=768, with_norm=False): 19 | super().__init__() 20 | patch_stride = img_size // patch_count 21 | patch_pad = (patch_stride * (patch_count - 1) + patch_size - img_size) // 2 22 | img_size = to_2tuple(img_size) 23 | patch_size = to_2tuple(patch_size) 24 | num_patches = patch_count * patch_count 25 | self.img_size = img_size 26 | self.patch_size = patch_size 27 | self.num_patches = num_patches 28 | 29 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_stride, padding=patch_pad) 30 | if with_norm: 31 | self.norm = nn.LayerNorm(embed_dim) 32 | 33 | def forward(self, x, **kwargs): 34 | B, C, H, W = x.shape 35 | # FIXME look at relaxing size constraints 36 | assert H == self.img_size[0] and W == self.img_size[1], \ 37 | f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
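# conv projection: (B, C, H, W) -> (B, embed_dim, patch_count, patch_count), then flatten the spatial dims and transpose to (B, num_patches, embed_dim)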
38 | x = self.proj(x).flatten(2).transpose(1, 2) 39 | if hasattr(self, "norm"): 40 | x = self.norm(x) 41 | assert x.shape[1] == self.num_patches 42 | return x 43 | 44 | 45 | class Simple_Patch(nn.Module): 46 | def __init__(self, offset_embed, img_size=224, patch_size=16, patch_pixel=16, patch_count=14, 47 | in_chans=3, embed_dim=192, another_linear=False, use_GE=False, local_feature=False, with_norm=False): 48 | super().__init__() 49 | self.num_patches = patch_count * patch_count 50 | self.another_linear = another_linear 51 | if self.another_linear: 52 | self.patch_embed = PatchEmbed(img_size, 1 if local_feature else patch_size, patch_count, in_chans, embed_dim, with_norm=with_norm) 53 | self.act = nn.GELU() if use_GE else nn.Identity() 54 | self.offset_predictor = nn.Linear(embed_dim, offset_embed, bias=False) 55 | else: 56 | self.patch_embed = PatchEmbed(img_size, 1 if local_feature else patch_size, patch_count, in_chans, offset_embed) 57 | 58 | self.img_size, self.patch_size, self.patch_pixel, self.patch_count = img_size, patch_size, patch_pixel, patch_count 59 | self.in_chans, self.embed_dim = in_chans, embed_dim 60 | 61 | def reset_offset(self): 62 | if self.another_linear: 63 | nn.init.constant_(self.offset_predictor.weight, 0) 64 | if hasattr(self.offset_predictor, "bias") and self.offset_predictor.bias is not None: 65 | nn.init.constant_(self.offset_predictor.bias, 0) 66 | else: 67 | nn.init.constant_(self.patch_embed.proj.weight, 0) 68 | if hasattr(self.patch_embed.proj, "bias") and self.patch_embed.proj.bias is not None: 69 | nn.init.constant_(self.patch_embed.proj.bias, 0) 70 | print("Parameter for offsets reseted.") 71 | 72 | @torch.cuda.amp.autocast(enabled=False) 73 | def forward(self, x, model_offset=None): 74 | if x.dim() == 3: 75 | B, H, W = x.shape[0], self.img_size, self.img_size 76 | assert x.shape[1] == H * W 77 | x = x.view(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 78 | B, C, H, W = x.shape 79 | img = x 80 | x = self.patch_embed(x) 81 | if self.another_linear: 82 | pred_offset = self.offset_predictor(self.act(x)) 83 | else: 84 | pred_offset = x.contiguous() 85 | return self.get_output(img, pred_offset, model_offset), (self.patch_count, self.patch_count) 86 | 87 | class Simple_DePatch(Simple_Patch): 88 | def __init__(self, box_coder, show_dim=4, **kwargs): 89 | super().__init__(show_dim, **kwargs) 90 | self.box_coder = box_coder 91 | self.register_buffer("value_spatial_shapes", torch.as_tensor([[self.img_size, self.img_size]], dtype=torch.long)) 92 | self.register_buffer("value_level_start_index", torch.as_tensor([0], dtype=torch.long)) 93 | self.output_proj = nn.Linear(self.in_chans * self.patch_pixel * self.patch_pixel, self.embed_dim) 94 | self.num_sample_points = self.patch_pixel * self.patch_pixel * self.patch_count * self.patch_count 95 | if kwargs["with_norm"]: 96 | self.with_norm=True 97 | self.norm = nn.LayerNorm(self.embed_dim) 98 | else: 99 | self.with_norm=False 100 | 101 | def get_output(self, img, pred_offset, model_offset=None): 102 | #copyed 103 | B = img.shape[0] 104 | sample_location = self.box_coder(pred_offset, model_offset) 105 | sampling_locations = sample_location.view(B, self.num_sample_points,1,1,1,2).to(torch.float) 106 | attention_weights = torch.ones((B, self.num_sample_points, 1, 1, 1), device=img.device) 107 | x = img.view(B, self.in_chans, 1, -1).transpose(1, 3).contiguous() 108 | output = MSDeformAttnFunction.apply(x, self.value_spatial_shapes, self.value_level_start_index, sampling_locations, attention_weights, 1) 109 | # 
output_proj 110 | output = output.view(B, self.num_patches, self.in_chans*self.patch_pixel*self.patch_pixel) 111 | output = self.output_proj(output) 112 | if self.with_norm: 113 | output = self.norm(output) 114 | return output 115 | -------------------------------------------------------------------------------- /classification/models/dpt/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = 
attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /classification/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.7.0 2 | torchvision==0.8.1 3 | timm==0.3.2 4 | -------------------------------------------------------------------------------- /classification/run_with_submitit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | """ 4 | A script to run multinode training with submitit. 5 | """ 6 | import argparse 7 | import os 8 | import os.path as osp 9 | import uuid 10 | from pathlib import Path 11 | 12 | import main as classification 13 | import submitit 14 | 15 | 16 | def parse_args(): 17 | classification_parser = classification.get_args_parser() 18 | parser = argparse.ArgumentParser("Submitit for DeiT", parents=[classification_parser]) 19 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 20 | parser.add_argument("--nodes", default=2, type=int, help="Number of nodes to request") 21 | parser.add_argument("--timeout", default=2800, type=int, help="Duration of the job") 22 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 23 | 24 | parser.add_argument("--partition", default="learnfair", type=str, help="Partition where to submit") 25 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 26 | parser.add_argument('--comment', default="", type=str, 27 | help='Comment to pass to scheduler, e.g. priority message') 28 | return parser.parse_args() 29 | 30 | 31 | def get_shared_folder() -> Path: 32 | root = '/mnt/lustre/wangwenhai/workspace/PVT/' 33 | if Path(osp.join(root, 'checkpoints/')).is_dir(): 34 | p = Path(osp.join(root, 'checkpoints/experiments/')) 35 | p.mkdir(exist_ok=True) 36 | return p 37 | raise RuntimeError("No shared folder available") 38 | 39 | 40 | def get_init_file(): 41 | # Init file must not exist, but it's parent dir must exist. 
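# The per-job init file also serves as the torch.distributed rendezvous point: main() converts it to a file:// URI and stores it in args.dist_url, so every task of a job can initialize its process group against the same path.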
42 | os.makedirs(str(get_shared_folder()), exist_ok=True) 43 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 44 | if init_file.exists(): 45 | os.remove(str(init_file)) 46 | return init_file 47 | 48 | 49 | class Trainer(object): 50 | def __init__(self, args): 51 | self.args = args 52 | 53 | def __call__(self): 54 | import main as classification 55 | 56 | self._setup_gpu_args() 57 | classification.main(self.args) 58 | 59 | def checkpoint(self): 60 | import os 61 | import submitit 62 | 63 | self.args.dist_url = get_init_file().as_uri() 64 | checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth") 65 | if os.path.exists(checkpoint_file): 66 | self.args.resume = checkpoint_file 67 | print("Requeuing ", self.args) 68 | empty_trainer = type(self)(self.args) 69 | return submitit.helpers.DelayedSubmission(empty_trainer) 70 | 71 | def _setup_gpu_args(self): 72 | import submitit 73 | from pathlib import Path 74 | 75 | job_env = submitit.JobEnvironment() 76 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) 77 | self.args.gpu = job_env.local_rank 78 | self.args.rank = job_env.global_rank 79 | self.args.world_size = job_env.num_tasks 80 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 81 | 82 | 83 | def main(): 84 | args = parse_args() 85 | if args.job_dir == "": 86 | args.job_dir = get_shared_folder() / "%j" 87 | 88 | # Note that the folder will depend on the job_id, to easily track experiments 89 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 90 | 91 | num_gpus_per_node = args.ngpus 92 | nodes = args.nodes 93 | timeout_min = args.timeout 94 | 95 | partition = args.partition 96 | kwargs = {} 97 | if args.use_volta32: 98 | kwargs['slurm_constraint'] = 'volta32gb' 99 | if args.comment: 100 | kwargs['slurm_comment'] = args.comment 101 | 102 | executor.update_parameters( 103 | # mem_gb=40 * num_gpus_per_node, 104 | # gpus_per_node=num_gpus_per_node, 105 | tasks_per_node=num_gpus_per_node, # one task per GPU 106 | # cpus_per_task=10, 107 | nodes=nodes, 108 | timeout_min=60 * 24 * 10, # max is 60 * 72 109 | # Below are cluster dependent parameters 110 | slurm_gres="gpu:%d" % num_gpus_per_node, 111 | slurm_partition=partition, 112 | slurm_signal_delay_s=120, 113 | slurm_additional_parameters={ 114 | 'qos': 'non-preemptable', 115 | 'mpi': 'pmi2' 116 | }, 117 | **kwargs 118 | ) 119 | 120 | executor.update_parameters(name="deit") 121 | 122 | args.dist_url = get_init_file().as_uri() 123 | args.output_dir = args.job_dir 124 | 125 | trainer = Trainer(args) 126 | job = executor.submit(trainer) 127 | 128 | print("Submitted job_id:", job.job_id) 129 | 130 | 131 | if __name__ == "__main__": 132 | main() 133 | -------------------------------------------------------------------------------- /classification/samplers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | import torch 4 | import torch.distributed as dist 5 | import math 6 | 7 | 8 | class RASampler(torch.utils.data.Sampler): 9 | """Sampler that restricts data loading to a subset of the dataset for distributed, 10 | with repeated augmentation. 
11 | It ensures that different each augmented version of a sample will be visible to a 12 | different process (GPU) 13 | Heavily based on torch.utils.data.DistributedSampler 14 | """ 15 | 16 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 17 | if num_replicas is None: 18 | if not dist.is_available(): 19 | raise RuntimeError("Requires distributed package to be available") 20 | num_replicas = dist.get_world_size() 21 | if rank is None: 22 | if not dist.is_available(): 23 | raise RuntimeError("Requires distributed package to be available") 24 | rank = dist.get_rank() 25 | self.dataset = dataset 26 | self.num_replicas = num_replicas 27 | self.rank = rank 28 | self.epoch = 0 29 | self.num_samples = int(math.ceil(len(self.dataset) * 3.0 / self.num_replicas)) 30 | self.total_size = self.num_samples * self.num_replicas 31 | # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas)) 32 | self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) 33 | self.shuffle = shuffle 34 | 35 | def __iter__(self): 36 | # deterministically shuffle based on epoch 37 | g = torch.Generator() 38 | g.manual_seed(self.epoch) 39 | if self.shuffle: 40 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 41 | else: 42 | indices = list(range(len(self.dataset))) 43 | 44 | # add extra samples to make it evenly divisible 45 | indices = [ele for ele in indices for i in range(3)] 46 | indices += indices[:(self.total_size - len(indices))] 47 | assert len(indices) == self.total_size 48 | 49 | # subsample 50 | indices = indices[self.rank:self.total_size:self.num_replicas] 51 | assert len(indices) == self.num_samples 52 | 53 | return iter(indices[:self.num_selected_samples]) 54 | 55 | def __len__(self): 56 | return self.num_selected_samples 57 | 58 | def set_epoch(self, epoch): 59 | self.epoch = epoch 60 | -------------------------------------------------------------------------------- /classification/tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = F401,E402,F403,W503,W504 4 | -------------------------------------------------------------------------------- /classification/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | import io 9 | import os 10 | import time 11 | from collections import defaultdict, deque 12 | import datetime 13 | 14 | import torch 15 | import torch.distributed as dist 16 | 17 | 18 | class SmoothedValue(object): 19 | """Track a series of values and provide access to smoothed values over a 20 | window or the global series average. 21 | """ 22 | 23 | def __init__(self, window_size=20, fmt=None): 24 | if fmt is None: 25 | fmt = "{median:.4f} ({global_avg:.4f})" 26 | self.deque = deque(maxlen=window_size) 27 | self.total = 0.0 28 | self.count = 0 29 | self.fmt = fmt 30 | 31 | def update(self, value, n=1): 32 | self.deque.append(value) 33 | self.count += n 34 | self.total += value * n 35 | 36 | def synchronize_between_processes(self): 37 | """ 38 | Warning: does not synchronize the deque! 
39 | """ 40 | if not is_dist_avail_and_initialized(): 41 | return 42 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') 43 | dist.barrier() 44 | dist.all_reduce(t) 45 | t = t.tolist() 46 | self.count = int(t[0]) 47 | self.total = t[1] 48 | 49 | @property 50 | def median(self): 51 | d = torch.tensor(list(self.deque)) 52 | return d.median().item() 53 | 54 | @property 55 | def avg(self): 56 | d = torch.tensor(list(self.deque), dtype=torch.float32) 57 | return d.mean().item() 58 | 59 | @property 60 | def global_avg(self): 61 | return self.total / self.count 62 | 63 | @property 64 | def max(self): 65 | return max(self.deque) 66 | 67 | @property 68 | def value(self): 69 | return self.deque[-1] 70 | 71 | def __str__(self): 72 | return self.fmt.format( 73 | median=self.median, 74 | avg=self.avg, 75 | global_avg=self.global_avg, 76 | max=self.max, 77 | value=self.value) 78 | 79 | 80 | class MetricLogger(object): 81 | def __init__(self, delimiter="\t"): 82 | self.meters = defaultdict(SmoothedValue) 83 | self.delimiter = delimiter 84 | 85 | def update(self, **kwargs): 86 | for k, v in kwargs.items(): 87 | if isinstance(v, torch.Tensor): 88 | v = v.item() 89 | assert isinstance(v, (float, int)) 90 | self.meters[k].update(v) 91 | 92 | def __getattr__(self, attr): 93 | if attr in self.meters: 94 | return self.meters[attr] 95 | if attr in self.__dict__: 96 | return self.__dict__[attr] 97 | raise AttributeError("'{}' object has no attribute '{}'".format( 98 | type(self).__name__, attr)) 99 | 100 | def __str__(self): 101 | loss_str = [] 102 | for name, meter in self.meters.items(): 103 | loss_str.append( 104 | "{}: {}".format(name, str(meter)) 105 | ) 106 | return self.delimiter.join(loss_str) 107 | 108 | def synchronize_between_processes(self): 109 | for meter in self.meters.values(): 110 | meter.synchronize_between_processes() 111 | 112 | def add_meter(self, name, meter): 113 | self.meters[name] = meter 114 | 115 | def log_every(self, iterable, print_freq, header=None): 116 | i = 0 117 | if not header: 118 | header = '' 119 | start_time = time.time() 120 | end = time.time() 121 | iter_time = SmoothedValue(fmt='{avg:.4f}') 122 | data_time = SmoothedValue(fmt='{avg:.4f}') 123 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd' 124 | log_msg = [ 125 | header, 126 | '[{0' + space_fmt + '}/{1}]', 127 | 'eta: {eta}', 128 | '{meters}', 129 | 'time: {time}', 130 | 'data: {data}' 131 | ] 132 | if torch.cuda.is_available(): 133 | log_msg.append('max mem: {memory:.0f}') 134 | log_msg = self.delimiter.join(log_msg) 135 | MB = 1024.0 * 1024.0 136 | for obj in iterable: 137 | data_time.update(time.time() - end) 138 | yield obj 139 | iter_time.update(time.time() - end) 140 | if i % print_freq == 0 or i == len(iterable) - 1: 141 | eta_seconds = iter_time.global_avg * (len(iterable) - i) 142 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 143 | if torch.cuda.is_available(): 144 | print(log_msg.format( 145 | i, len(iterable), eta=eta_string, 146 | meters=str(self), 147 | time=str(iter_time), data=str(data_time), 148 | memory=torch.cuda.max_memory_allocated() / MB)) 149 | else: 150 | print(log_msg.format( 151 | i, len(iterable), eta=eta_string, 152 | meters=str(self), 153 | time=str(iter_time), data=str(data_time))) 154 | i += 1 155 | end = time.time() 156 | total_time = time.time() - start_time 157 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 158 | print('{} Total time: {} ({:.4f} s / it)'.format( 159 | header, total_time_str, total_time / 
len(iterable))) 160 | 161 | 162 | def _load_checkpoint_for_ema(model_ema, checkpoint): 163 | """ 164 | Workaround for ModelEma._load_checkpoint to accept an already-loaded object 165 | """ 166 | mem_file = io.BytesIO() 167 | torch.save(checkpoint, mem_file) 168 | mem_file.seek(0) 169 | model_ema._load_checkpoint(mem_file) 170 | 171 | 172 | def setup_for_distributed(is_master): 173 | """ 174 | This function disables printing when not in master process 175 | """ 176 | import builtins as __builtin__ 177 | builtin_print = __builtin__.print 178 | 179 | def print(*args, **kwargs): 180 | force = kwargs.pop('force', False) 181 | if is_master or force: 182 | builtin_print(*args, **kwargs) 183 | 184 | __builtin__.print = print 185 | 186 | 187 | def is_dist_avail_and_initialized(): 188 | if not dist.is_available(): 189 | return False 190 | if not dist.is_initialized(): 191 | return False 192 | return True 193 | 194 | 195 | def get_world_size(): 196 | if not is_dist_avail_and_initialized(): 197 | return 1 198 | return dist.get_world_size() 199 | 200 | 201 | def get_rank(): 202 | if not is_dist_avail_and_initialized(): 203 | return 0 204 | return dist.get_rank() 205 | 206 | 207 | def is_main_process(): 208 | return get_rank() == 0 209 | 210 | 211 | def save_on_master(*args, **kwargs): 212 | if is_main_process(): 213 | torch.save(*args, **kwargs) 214 | 215 | 216 | def init_distributed_mode(args): 217 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 218 | args.rank = int(os.environ["RANK"]) 219 | args.world_size = int(os.environ['WORLD_SIZE']) 220 | args.gpu = int(os.environ['LOCAL_RANK']) 221 | elif 'SLURM_PROCID' in os.environ: 222 | args.rank = int(os.environ['SLURM_PROCID']) 223 | args.gpu = args.rank % torch.cuda.device_count() 224 | else: 225 | print('Not using distributed mode') 226 | args.distributed = False 227 | return 228 | 229 | args.distributed = True 230 | 231 | torch.cuda.set_device(args.gpu) 232 | args.dist_backend = 'nccl' 233 | print('| distributed init (rank {}): {}'.format( 234 | args.rank, args.dist_url), flush=True) 235 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 236 | world_size=args.world_size, rank=args.rank) 237 | torch.distributed.barrier() 238 | setup_for_distributed(args.rank == 0) 239 | -------------------------------------------------------------------------------- /detection/README.md: -------------------------------------------------------------------------------- 1 | # DPT for Object Detection 2 | 3 | Here is our code for object detection on COCO. Please check [our paper](https://arxiv.org/abs/2107.14467) for detailed information. 4 | 5 | ## Instructions 6 | 7 | ### Preparations 8 | 9 | First, install PyTorch as for classification. 10 | ```bash 11 | conda install pytorch==1.7.1 torchvision==0.8.2 cudatoolkit=10.1 -c pytorch 12 | pip install timm==0.3.2 13 | ``` 14 | 15 | We develop our method with `mmcv==1.2.7` and `mmdet==2.8.0`. We recommend [this document](https://github.com/open-mmlab/mmdetection/blob/v2.8.0/docs/get_started.md) for detailed installation instructions. 
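One way to reproduce that environment on top of the PyTorch install above is sketched below. Only the `mmcv==1.2.7` and `mmdet==2.8.0` versions come from this README; the mmcv wheel index URL is an assumption, so adapt it to your CUDA/torch build.

```bash
pip install mmcv-full==1.2.7 -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
pip install mmdet==2.8.0
```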
16 | 17 | ### Evaluation 18 | 19 | To evaluate RetinaNet on COCO val2017 with 8 gpus run: 20 | ``` 21 | ./dist_test.sh /path/to/config/file /path/to/checkpoint_file 8 --eval bbox 22 | ``` 23 | 24 | For example, to evaluate RetinaNet with DPT-Tiny: 25 | 26 | ``` 27 | ./dist_test.sh configs/retinanet_dpt_t_fpn_1x_coco.py pretrained/detection/retinanet_dpt_t_1x.pth 8 --eval bbox 28 | ``` 29 | 30 | 31 | To evaluate Mask R-CNN on COCO val2017 with 8 gpus run: 32 | 33 | ``` 34 | ./dist_test.sh /path/to/config/file /path/to/checkpoint_file 8 --eval bbox segm 35 | ``` 36 | 37 | For example, to evaluate Mask R-CNN with DPT-Tiny: 38 | 39 | ``` 40 | ./dist_test.sh configs/mask_rcnn_dpt_t_fpn_1x_coco.py pretrained/detection/mrcnn_dpt_t_1x.pth 8 --eval bbox segm 41 | ``` 42 | 43 | ### Training 44 | 45 | Train with certain config file: 46 | 47 | ``` 48 | dist_train.sh /path/to/config/file $NUM_GPUS 49 | ``` 50 | 51 | 52 | For example, to train DPT-Small + Mask R-CNN on COCO train2017 for 12 epochs with 8 gpus: 53 | 54 | ``` 55 | dist_train.sh configs/mask_rcnn_dpt_s_fpn_1x_coco.py 8 56 | ``` 57 | 58 | 59 | ## Results and Models 60 | ### RetinaNet Results 61 | 62 | | Method | #Params (M) | Schedule | mAP | AP50 | AP75 | APs | APm | APl | Download | 63 | |------------|:-----------:|:--------:|:----:|:----:|:----:|:----:|:----:|:----:|:--------:| 64 | | DPT-Tiny | 24.9 | 1x | 39.5 | 60.4 | 41.8 | 23.7 | 43.2 | 52.2 |[Google Drive](https://drive.google.com/file/d/1S84hbeVxPjtcmjeOcae5Sn_XYqyagMn3/view?usp=sharing)| 65 | | DPT-Tiny | 24.9 | MS+3x | 41.2 | 62.0 | 44.0 | 25.7 | 44.6 | 53.9 |[Google Drive](https://drive.google.com/file/d/1OdMjRxjGdwqow124ZB-COgEh-AgV8TxH/view?usp=sharing)| 66 | | DPT-Small | 36.1 | 1x | 42.5 | 63.6 | 45.3 | 26.2 | 45.7 | 56.9 |[Google Drive](https://drive.google.com/file/d/1iVmK6MTdX8n2A7nS31GHUCaY0wcjhWzS/view?usp=sharing)| 67 | | DPT-Small | 36.1 | MS+3x | 43.3 | 64.0 | 46.5 | 27.8 | 46.3 | 58.5 |[Google Drive](https://drive.google.com/file/d/1PvoQYQC6UklSFavkhwqaSO-2zrYnJYkB/view?usp=sharing)| 68 | | DPT-Medium | 55.9 | 1x | 43.3 | 64.6 | 45.9 | 27.2 | 46.7 | 58.6 |[Google Drive](https://drive.google.com/file/d/1AWYLEEZN27sKmkCyV3WEy6mWbp2ar0gI/view?usp=sharing)| 69 | | DPT-Medium | 55.9 | MS+3x | 43.7 | 64.6 | 46.4 | 27.2 | 47.0 | 58.4 |[Google Drive](https://drive.google.com/file/d/1AeAq2nCSohMfKp1Q4WUROx0csYLOZaZG/view?usp=sharing)| 70 | 71 | ### Mask R-CNN Results 72 | 73 | | Method | #Params (M) | Schedule | box mAP | box AP50 | box AP75 | mask mAP | mask AP50 | mask AP75 | Download | 74 | |------------|:-----------:|:--------:|:-------:|:--------:|:--------:|:--------:|:---------:|:---------:|:--------:| 75 | | DPT-Tiny | 34.8 | 1x | 40.2 | 62.8 | 43.8 | 37.7 | 59.8 | 40.4 |[Google Drive](https://drive.google.com/file/d/1jCp5mYqHnNs1Uzrh8uopOyRSw0aVLDQp/view?usp=sharing)| 76 | | DPT-Tiny | 34.8 | MS+3x | 42.2 | 64.4 | 46.1 | 39.4 | 61.5 | 42.3 |[Google Drive](https://drive.google.com/file/d/1S3_ERb4Ak4ksWPXryXUTV1CEEVAP2BAj/view?usp=sharing)| 77 | | DPT-Small | 46.1 | 1x | 43.1 | 65.7 | 47.2 | 39.9 | 62.9 | 43.0 |[Google Drive](https://drive.google.com/file/d/17hg0oLhH96nFTc8H9GAoDpX1Qb7oMGGJ/view?usp=sharing)| 78 | | DPT-Small | 46.1 | MS+3x | 44.4 | 66.5 | 48.9 | 41.0 | 63.6 | 44.2 |[Google Drive](https://drive.google.com/file/d/1aOLw_rVs-LGCKbcMXZN4ogGIOp0rc-UY/view?usp=sharing)| 79 | | DPT-Medium | 65.8 | 1x | 43.8 | 66.2 | 48.3 | 40.3 | 63.1 | 43.4 |[Google Drive](https://drive.google.com/file/d/1pl8W7WW_MN9N9TxgNZB87FuwEw6pM_n_/view?usp=sharing)| 80 | | 
DPT-Medium | 65.8 | MS+3x | 44.3 | 65.6 | 48.8 | 40.7 | 63.1 | 44.1 |[Google Drive](https://drive.google.com/file/d/1_m4Huy1sNiwBDKamPhrvo6cLE6JNJ_kY/view?usp=sharing)| 81 | 82 | ### Other links 83 | 84 | These models can also be obtained from [BaiduNetdisk](https://pan.baidu.com/s/19nJXoOAK_mljV4BPx1sUSQ). Password for extraction is **DPTs**. 85 | Our result is pretrained on the ImageNet1k dataset. ImageNet1k-pretrained models can be found [here](../classification/README.md). 86 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_detection.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'CocoDataset' 2 | data_root = '/gdata/MSCOCO2017/' 3 | img_norm_cfg = dict( 4 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', with_bbox=True), 8 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 9 | dict(type='RandomFlip', flip_ratio=0.5), 10 | dict(type='Normalize', **img_norm_cfg), 11 | dict(type='Pad', size_divisor=32), 12 | dict(type='DefaultFormatBundle'), 13 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 14 | ] 15 | test_pipeline = [ 16 | dict(type='LoadImageFromFile'), 17 | dict( 18 | type='MultiScaleFlipAug', 19 | img_scale=(1333, 800), 20 | flip=False, 21 | transforms=[ 22 | dict(type='Resize', keep_ratio=True), 23 | dict(type='RandomFlip'), 24 | dict(type='Normalize', **img_norm_cfg), 25 | dict(type='Pad', size_divisor=32), 26 | dict(type='ImageToTensor', keys=['img']), 27 | dict(type='Collect', keys=['img']), 28 | ]) 29 | ] 30 | data = dict( 31 | samples_per_gpu=2, 32 | workers_per_gpu=2, 33 | train=dict( 34 | type=dataset_type, 35 | ann_file=data_root + 'annotations/instances_train2017.json', 36 | img_prefix=data_root + 'train2017/', 37 | pipeline=train_pipeline), 38 | val=dict( 39 | type=dataset_type, 40 | ann_file=data_root + 'annotations/instances_val2017.json', 41 | img_prefix=data_root + 'val2017/', 42 | pipeline=test_pipeline), 43 | test=dict( 44 | type=dataset_type, 45 | ann_file=data_root + 'annotations/instances_val2017.json', 46 | img_prefix=data_root + 'val2017/', 47 | pipeline=test_pipeline)) 48 | evaluation = dict(interval=1, metric='bbox') 49 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_instance.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'CocoDataset' 2 | data_root = '/gdata/MSCOCO2017/' 3 | img_norm_cfg = dict( 4 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 8 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 9 | dict(type='RandomFlip', flip_ratio=0.5), 10 | dict(type='Normalize', **img_norm_cfg), 11 | dict(type='Pad', size_divisor=32), 12 | dict(type='DefaultFormatBundle'), 13 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 14 | ] 15 | test_pipeline = [ 16 | dict(type='LoadImageFromFile'), 17 | dict( 18 | type='MultiScaleFlipAug', 19 | img_scale=(1333, 800), 20 | flip=False, 21 | transforms=[ 22 | dict(type='Resize', keep_ratio=True), 23 | dict(type='RandomFlip'), 24 | dict(type='Normalize', **img_norm_cfg), 25 | dict(type='Pad', size_divisor=32), 26 | dict(type='ImageToTensor', 
keys=['img']), 27 | dict(type='Collect', keys=['img']), 28 | ]) 29 | ] 30 | data = dict( 31 | samples_per_gpu=2, 32 | workers_per_gpu=2, 33 | train=dict( 34 | type=dataset_type, 35 | ann_file=data_root + 'annotations/instances_train2017.json', 36 | img_prefix=data_root + 'train2017/', 37 | pipeline=train_pipeline), 38 | val=dict( 39 | type=dataset_type, 40 | ann_file=data_root + 'annotations/instances_val2017.json', 41 | img_prefix=data_root + 'val2017/', 42 | pipeline=test_pipeline), 43 | test=dict( 44 | type=dataset_type, 45 | ann_file=data_root + 'annotations/instances_val2017.json', 46 | img_prefix=data_root + 'val2017/', 47 | pipeline=test_pipeline)) 48 | evaluation = dict(metric=['bbox', 'segm']) 49 | -------------------------------------------------------------------------------- /detection/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | checkpoint_config = dict(interval=1) 2 | # yapf:disable 3 | log_config = dict( 4 | interval=50, 5 | hooks=[ 6 | dict(type='TextLoggerHook'), 7 | # dict(type='TensorboardLoggerHook') 8 | ]) 9 | # yapf:enable 10 | dist_params = dict(backend='nccl') 11 | log_level = 'INFO' 12 | load_from = None 13 | resume_from = None 14 | workflow = [('train', 1)] 15 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/mask_rcnn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='MaskRCNN', 4 | pretrained='torchvision://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | norm_cfg=dict(type='BN', requires_grad=True), 12 | norm_eval=True, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=256, 22 | feat_channels=256, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[8], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[4, 8, 16, 32, 64]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | roi_head=dict( 36 | type='StandardRoIHead', 37 | bbox_roi_extractor=dict( 38 | type='SingleRoIExtractor', 39 | roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), 40 | out_channels=256, 41 | featmap_strides=[4, 8, 16, 32]), 42 | bbox_head=dict( 43 | type='Shared2FCBBoxHead', 44 | in_channels=256, 45 | fc_out_channels=1024, 46 | roi_feat_size=7, 47 | num_classes=80, 48 | bbox_coder=dict( 49 | type='DeltaXYWHBBoxCoder', 50 | target_means=[0., 0., 0., 0.], 51 | target_stds=[0.1, 0.1, 0.2, 0.2]), 52 | reg_class_agnostic=False, 53 | loss_cls=dict( 54 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 55 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 56 | mask_roi_extractor=dict( 57 | type='SingleRoIExtractor', 58 | roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), 59 | out_channels=256, 60 | featmap_strides=[4, 8, 16, 32]), 61 | mask_head=dict( 62 | type='FCNMaskHead', 63 | num_convs=4, 64 | in_channels=256, 65 | conv_out_channels=256, 66 | num_classes=80, 67 | loss_mask=dict( 68 | type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))) 69 | # model training 
and testing settings 70 | train_cfg = dict( 71 | rpn=dict( 72 | assigner=dict( 73 | type='MaxIoUAssigner', 74 | pos_iou_thr=0.7, 75 | neg_iou_thr=0.3, 76 | min_pos_iou=0.3, 77 | match_low_quality=True, 78 | ignore_iof_thr=-1), 79 | sampler=dict( 80 | type='RandomSampler', 81 | num=256, 82 | pos_fraction=0.5, 83 | neg_pos_ub=-1, 84 | add_gt_as_proposals=False), 85 | allowed_border=-1, 86 | pos_weight=-1, 87 | debug=False), 88 | rpn_proposal=dict( 89 | nms_across_levels=False, 90 | nms_pre=2000, 91 | nms_post=1000, 92 | max_num=1000, 93 | nms_thr=0.7, 94 | min_bbox_size=0), 95 | rcnn=dict( 96 | assigner=dict( 97 | type='MaxIoUAssigner', 98 | pos_iou_thr=0.5, 99 | neg_iou_thr=0.5, 100 | min_pos_iou=0.5, 101 | match_low_quality=True, 102 | ignore_iof_thr=-1), 103 | sampler=dict( 104 | type='RandomSampler', 105 | num=512, 106 | pos_fraction=0.25, 107 | neg_pos_ub=-1, 108 | add_gt_as_proposals=True), 109 | mask_size=28, 110 | pos_weight=-1, 111 | debug=False)) 112 | test_cfg = dict( 113 | rpn=dict( 114 | nms_across_levels=False, 115 | nms_pre=1000, 116 | nms_post=1000, 117 | max_num=1000, 118 | nms_thr=0.7, 119 | min_bbox_size=0), 120 | rcnn=dict( 121 | score_thr=0.05, 122 | nms=dict(type='nms', iou_threshold=0.5), 123 | max_per_img=100, 124 | mask_thr_binary=0.5)) 125 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/retinanet_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | pretrained='torchvision://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | norm_cfg=dict(type='BN', requires_grad=True), 12 | norm_eval=True, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | start_level=1, 19 | add_extra_convs='on_input', 20 | num_outs=5), 21 | bbox_head=dict( 22 | type='RetinaHead', 23 | num_classes=80, 24 | in_channels=256, 25 | stacked_convs=4, 26 | feat_channels=256, 27 | anchor_generator=dict( 28 | type='AnchorGenerator', 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | ratios=[0.5, 1.0, 2.0], 32 | strides=[8, 16, 32, 64, 128]), 33 | bbox_coder=dict( 34 | type='DeltaXYWHBBoxCoder', 35 | target_means=[.0, .0, .0, .0], 36 | target_stds=[1.0, 1.0, 1.0, 1.0]), 37 | loss_cls=dict( 38 | type='FocalLoss', 39 | use_sigmoid=True, 40 | gamma=2.0, 41 | alpha=0.25, 42 | loss_weight=1.0), 43 | loss_bbox=dict(type='L1Loss', loss_weight=1.0))) 44 | # training and testing settings 45 | train_cfg = dict( 46 | assigner=dict( 47 | type='MaxIoUAssigner', 48 | pos_iou_thr=0.5, 49 | neg_iou_thr=0.4, 50 | min_pos_iou=0, 51 | ignore_iof_thr=-1), 52 | allowed_border=-1, 53 | pos_weight=-1, 54 | debug=False) 55 | test_cfg = dict( 56 | nms_pre=1000, 57 | min_bbox_size=0, 58 | score_thr=0.05, 59 | nms=dict(type='nms', iou_threshold=0.5), 60 | max_per_img=100) 61 | -------------------------------------------------------------------------------- /detection/configs/detr_dpt_s_8x2_50ep_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/datasets/coco_detection.py', 3 | '_base_/default_runtime.py' 4 | ] 5 | model = dict( 6 | type='DETR', 7 | pretrained='pretrained/dpt_small.pth', 8 | backbone=dict( 9 | type='dpt_small_f4', 10 | style='pytorch'), 11 | bbox_head=dict( 12 | type='TransformerHead', 13 | num_classes=80, 14 | 
in_channels=512, 15 | num_fcs=2, 16 | transformer=dict( 17 | type='Transformer', 18 | embed_dims=256, 19 | num_heads=8, 20 | num_encoder_layers=6, 21 | num_decoder_layers=6, 22 | feedforward_channels=2048, 23 | dropout=0.1, 24 | act_cfg=dict(type='ReLU', inplace=True), 25 | norm_cfg=dict(type='LN'), 26 | num_fcs=2, 27 | pre_norm=False, 28 | return_intermediate_dec=True), 29 | positional_encoding=dict( 30 | type='SinePositionalEncoding', num_feats=128, normalize=True), 31 | loss_cls=dict( 32 | type='CrossEntropyLoss', 33 | bg_cls_weight=0.1, 34 | use_sigmoid=False, 35 | loss_weight=1.0, 36 | class_weight=1.0), 37 | loss_bbox=dict(type='L1Loss', loss_weight=5.0), 38 | loss_iou=dict(type='GIoULoss', loss_weight=2.0))) 39 | # training and testing settings 40 | train_cfg = dict( 41 | assigner=dict( 42 | type='HungarianAssigner', 43 | cls_cost=dict(type='ClassificationCost', weight=1.), 44 | reg_cost=dict(type='BBoxL1Cost', weight=5.0), 45 | iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))) 46 | test_cfg = dict(max_per_img=100) 47 | img_norm_cfg = dict( 48 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 49 | # train_pipeline, NOTE the img_scale and the Pad's size_divisor is different 50 | # from the default setting in mmdet. 51 | train_pipeline = [ 52 | dict(type='LoadImageFromFile'), 53 | dict(type='LoadAnnotations', with_bbox=True), 54 | dict( 55 | type='Resize', 56 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 57 | (1333, 768), (1333, 800)], 58 | multiscale_mode='value', 59 | keep_ratio=True), 60 | dict(type='RandomFlip', flip_ratio=0.5), 61 | dict(type='Normalize', **img_norm_cfg), 62 | dict(type='Pad', size_divisor=32), 63 | dict(type='DefaultFormatBundle'), 64 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) 65 | ] 66 | # test_pipeline, NOTE the Pad's size_divisor is different from the default 67 | # setting (size_divisor=32). While there is little effect on the performance 68 | # whether we use the default setting or use size_divisor=1. 
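# (Note: in this config the Pad transform below keeps the default size_divisor=32; it is the ResNet-50 DETR baseline config that actually pads test images with size_divisor=1.)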
69 | test_pipeline = [ 70 | dict(type='LoadImageFromFile'), 71 | dict( 72 | type='MultiScaleFlipAug', 73 | img_scale=(1333, 800), 74 | flip=False, 75 | transforms=[ 76 | dict(type='Resize', keep_ratio=True), 77 | dict(type='RandomFlip'), 78 | dict(type='Normalize', **img_norm_cfg), 79 | dict(type='Pad', size_divisor=32), 80 | dict(type='ImageToTensor', keys=['img']), 81 | dict(type='Collect', keys=['img']) 82 | ]) 83 | ] 84 | data = dict( 85 | samples_per_gpu=2, 86 | workers_per_gpu=2, 87 | train=dict(pipeline=train_pipeline), 88 | val=dict(pipeline=test_pipeline), 89 | test=dict(pipeline=test_pipeline)) 90 | # optimizer 91 | optimizer = dict( 92 | type='AdamW', 93 | lr=0.0001, 94 | weight_decay=0.0001, 95 | paramwise_cfg=dict( 96 | custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) 97 | optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) 98 | # learning policy 99 | lr_config = dict(policy='step', step=[33]) 100 | total_epochs = 50 101 | -------------------------------------------------------------------------------- /detection/configs/detr_pvt_s_8x2_50ep_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/datasets/coco_detection.py', 3 | '_base_/default_runtime.py' 4 | ] 5 | model = dict( 6 | type='DETR', 7 | pretrained='pretrained/pvt_small.pth', 8 | backbone=dict( 9 | type='pvt_small_f4', 10 | style='pytorch'), 11 | bbox_head=dict( 12 | type='TransformerHead', 13 | num_classes=80, 14 | in_channels=512, 15 | num_fcs=2, 16 | transformer=dict( 17 | type='Transformer', 18 | embed_dims=256, 19 | num_heads=8, 20 | num_encoder_layers=6, 21 | num_decoder_layers=6, 22 | feedforward_channels=2048, 23 | dropout=0.1, 24 | act_cfg=dict(type='ReLU', inplace=True), 25 | norm_cfg=dict(type='LN'), 26 | num_fcs=2, 27 | pre_norm=False, 28 | return_intermediate_dec=True), 29 | positional_encoding=dict( 30 | type='SinePositionalEncoding', num_feats=128, normalize=True), 31 | loss_cls=dict( 32 | type='CrossEntropyLoss', 33 | bg_cls_weight=0.1, 34 | use_sigmoid=False, 35 | loss_weight=1.0, 36 | class_weight=1.0), 37 | loss_bbox=dict(type='L1Loss', loss_weight=5.0), 38 | loss_iou=dict(type='GIoULoss', loss_weight=2.0))) 39 | # training and testing settings 40 | train_cfg = dict( 41 | assigner=dict( 42 | type='HungarianAssigner', 43 | cls_cost=dict(type='ClassificationCost', weight=1.), 44 | reg_cost=dict(type='BBoxL1Cost', weight=5.0), 45 | iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))) 46 | test_cfg = dict(max_per_img=100) 47 | img_norm_cfg = dict( 48 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 49 | # train_pipeline, NOTE the img_scale and the Pad's size_divisor is different 50 | # from the default setting in mmdet. 51 | train_pipeline = [ 52 | dict(type='LoadImageFromFile'), 53 | dict(type='LoadAnnotations', with_bbox=True), 54 | dict( 55 | type='Resize', 56 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 57 | (1333, 768), (1333, 800)], 58 | multiscale_mode='value', 59 | keep_ratio=True), 60 | dict(type='RandomFlip', flip_ratio=0.5), 61 | dict(type='Normalize', **img_norm_cfg), 62 | dict(type='Pad', size_divisor=32), 63 | dict(type='DefaultFormatBundle'), 64 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) 65 | ] 66 | # test_pipeline, NOTE the Pad's size_divisor is different from the default 67 | # setting (size_divisor=32). While there is little effect on the performance 68 | # whether we use the default setting or use size_divisor=1. 
69 | test_pipeline = [ 70 | dict(type='LoadImageFromFile'), 71 | dict( 72 | type='MultiScaleFlipAug', 73 | img_scale=(1333, 800), 74 | flip=False, 75 | transforms=[ 76 | dict(type='Resize', keep_ratio=True), 77 | dict(type='RandomFlip'), 78 | dict(type='Normalize', **img_norm_cfg), 79 | dict(type='Pad', size_divisor=32), 80 | dict(type='ImageToTensor', keys=['img']), 81 | dict(type='Collect', keys=['img']) 82 | ]) 83 | ] 84 | data = dict( 85 | samples_per_gpu=2, 86 | workers_per_gpu=2, 87 | train=dict(pipeline=train_pipeline), 88 | val=dict(pipeline=test_pipeline), 89 | test=dict(pipeline=test_pipeline)) 90 | # optimizer 91 | optimizer = dict( 92 | type='AdamW', 93 | lr=0.0001, 94 | weight_decay=0.0001, 95 | paramwise_cfg=dict( 96 | custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) 97 | optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) 98 | # learning policy 99 | lr_config = dict(policy='step', step=[33]) 100 | total_epochs = 50 101 | -------------------------------------------------------------------------------- /detection/configs/detr_r50_8x2_50ep_coco_baseline.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/datasets/coco_detection.py', 3 | '_base_/default_runtime.py' 4 | ] 5 | model = dict( 6 | type='DETR', 7 | # pretrained='torchvision://resnet50', 8 | pretrained='pretrained/resnet50-19c8e357.pth', 9 | backbone=dict( 10 | type='ResNet', 11 | depth=50, 12 | num_stages=4, 13 | out_indices=(3,), 14 | frozen_stages=1, 15 | norm_cfg=dict(type='BN', requires_grad=False), 16 | norm_eval=True, 17 | style='pytorch'), 18 | bbox_head=dict( 19 | type='TransformerHead', 20 | num_classes=80, 21 | in_channels=2048, 22 | num_fcs=2, 23 | transformer=dict( 24 | type='Transformer', 25 | embed_dims=256, 26 | num_heads=8, 27 | num_encoder_layers=6, 28 | num_decoder_layers=6, 29 | feedforward_channels=2048, 30 | dropout=0.1, 31 | act_cfg=dict(type='ReLU', inplace=True), 32 | norm_cfg=dict(type='LN'), 33 | num_fcs=2, 34 | pre_norm=False, 35 | return_intermediate_dec=True), 36 | positional_encoding=dict( 37 | type='SinePositionalEncoding', num_feats=128, normalize=True), 38 | loss_cls=dict( 39 | type='CrossEntropyLoss', 40 | bg_cls_weight=0.1, 41 | use_sigmoid=False, 42 | loss_weight=1.0, 43 | class_weight=1.0), 44 | loss_bbox=dict(type='L1Loss', loss_weight=5.0), 45 | loss_iou=dict(type='GIoULoss', loss_weight=2.0))) 46 | # training and testing settings 47 | train_cfg = dict( 48 | assigner=dict( 49 | type='HungarianAssigner', 50 | cls_cost=dict(type='ClassificationCost', weight=1.), 51 | reg_cost=dict(type='BBoxL1Cost', weight=5.0), 52 | iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))) 53 | test_cfg = dict(max_per_img=100) 54 | img_norm_cfg = dict( 55 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 56 | # train_pipeline, NOTE the img_scale and the Pad's size_divisor is different 57 | # from the default setting in mmdet. 
58 | train_pipeline = [ 59 | dict(type='LoadImageFromFile'), 60 | dict(type='LoadAnnotations', with_bbox=True), 61 | dict( 62 | type='Resize', 63 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 64 | (1333, 768), (1333, 800)], 65 | multiscale_mode='value', 66 | keep_ratio=True), 67 | dict(type='RandomFlip', flip_ratio=0.5), 68 | dict(type='Normalize', **img_norm_cfg), 69 | dict(type='Pad', size_divisor=32), 70 | dict(type='DefaultFormatBundle'), 71 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) 72 | ] 73 | # test_pipeline, NOTE the Pad's size_divisor is different from the default 74 | # setting (size_divisor=32). While there is little effect on the performance 75 | # whether we use the default setting or use size_divisor=1. 76 | test_pipeline = [ 77 | dict(type='LoadImageFromFile'), 78 | dict( 79 | type='MultiScaleFlipAug', 80 | img_scale=(1333, 800), 81 | flip=False, 82 | transforms=[ 83 | dict(type='Resize', keep_ratio=True), 84 | dict(type='RandomFlip'), 85 | dict(type='Normalize', **img_norm_cfg), 86 | dict(type='Pad', size_divisor=1), 87 | dict(type='ImageToTensor', keys=['img']), 88 | dict(type='Collect', keys=['img']) 89 | ]) 90 | ] 91 | data = dict( 92 | samples_per_gpu=2, 93 | workers_per_gpu=2, 94 | train=dict(pipeline=train_pipeline), 95 | val=dict(pipeline=test_pipeline), 96 | test=dict(pipeline=test_pipeline)) 97 | # optimizer 98 | optimizer = dict( 99 | type='AdamW', 100 | lr=0.0001, 101 | weight_decay=0.0001, 102 | paramwise_cfg=dict( 103 | custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) 104 | optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) 105 | # learning policy 106 | lr_config = dict(policy='step', step=[33]) 107 | total_epochs = 50 108 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_m_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_medium.pth', 9 | backbone=dict( 10 | type='dpt_medium', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_m_fpn_mstrain-poly_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_medium.pth', 9 | backbone=dict( 10 | type='dpt_medium', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # multi-scale 18 | img_norm_cfg = dict( 19 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 20 | 
train_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='LoadAnnotations', 24 | with_bbox=True, 25 | with_mask=True, 26 | poly2mask=False), 27 | dict( 28 | type='Resize', 29 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 30 | (1333, 768), (1333, 800)], 31 | multiscale_mode='value', 32 | keep_ratio=True), 33 | dict(type='RandomFlip', flip_ratio=0.5), 34 | dict(type='Normalize', **img_norm_cfg), 35 | dict(type='Pad', size_divisor=32), 36 | dict(type='DefaultFormatBundle'), 37 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 38 | ] 39 | test_pipeline = [ 40 | dict(type='LoadImageFromFile'), 41 | dict( 42 | type='MultiScaleFlipAug', 43 | img_scale=(1333, 800), 44 | flip=False, 45 | transforms=[ 46 | dict(type='Resize', keep_ratio=True), 47 | dict(type='RandomFlip'), 48 | dict(type='Normalize', **img_norm_cfg), 49 | dict(type='Pad', size_divisor=32), 50 | dict(type='ImageToTensor', keys=['img']), 51 | dict(type='Collect', keys=['img']), 52 | ]) 53 | ] 54 | data = dict( 55 | train=dict(pipeline=train_pipeline), 56 | val=dict(pipeline=test_pipeline), 57 | test=dict(pipeline=test_pipeline)) 58 | # optimizer 59 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 60 | optimizer_config = dict(grad_clip=None) 61 | # learning policy 62 | lr_config = dict( 63 | policy='step', 64 | warmup='linear', 65 | warmup_iters=500, 66 | warmup_ratio=0.001, 67 | step=[28, 34]) 68 | total_epochs = 36 69 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_s_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_small.pth', 9 | backbone=dict( 10 | type='dpt_small', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_s_fpn_mstrain-poly_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_small.pth', 9 | backbone=dict( 10 | type='dpt_small', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # multi-scale 18 | img_norm_cfg = dict( 19 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 20 | train_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='LoadAnnotations', 24 | with_bbox=True, 25 | with_mask=True, 26 | poly2mask=False), 27 | dict( 28 | type='Resize', 29 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 30 | (1333, 768), 
(1333, 800)], 31 | multiscale_mode='value', 32 | keep_ratio=True), 33 | dict(type='RandomFlip', flip_ratio=0.5), 34 | dict(type='Normalize', **img_norm_cfg), 35 | dict(type='Pad', size_divisor=32), 36 | dict(type='DefaultFormatBundle'), 37 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 38 | ] 39 | test_pipeline = [ 40 | dict(type='LoadImageFromFile'), 41 | dict( 42 | type='MultiScaleFlipAug', 43 | img_scale=(1333, 800), 44 | flip=False, 45 | transforms=[ 46 | dict(type='Resize', keep_ratio=True), 47 | dict(type='RandomFlip'), 48 | dict(type='Normalize', **img_norm_cfg), 49 | dict(type='Pad', size_divisor=32), 50 | dict(type='ImageToTensor', keys=['img']), 51 | dict(type='Collect', keys=['img']), 52 | ]) 53 | ] 54 | data = dict( 55 | train=dict(pipeline=train_pipeline), 56 | val=dict(pipeline=test_pipeline), 57 | test=dict(pipeline=test_pipeline)) 58 | # optimizer 59 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 60 | optimizer_config = dict(grad_clip=None) 61 | # learning policy 62 | lr_config = dict( 63 | policy='step', 64 | warmup='linear', 65 | warmup_iters=500, 66 | warmup_ratio=0.001, 67 | step=[28, 34]) 68 | total_epochs = 36 69 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_t_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_tiny.pth', 9 | backbone=dict( 10 | type='dpt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_dpt_t_fpn_mstrain-poly_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_tiny.pth', 9 | backbone=dict( 10 | type='dpt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # multi-scale 18 | img_norm_cfg = dict( 19 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 20 | train_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='LoadAnnotations', 24 | with_bbox=True, 25 | with_mask=True, 26 | poly2mask=False), 27 | dict( 28 | type='Resize', 29 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 30 | (1333, 768), (1333, 800)], 31 | multiscale_mode='value', 32 | keep_ratio=True), 33 | dict(type='RandomFlip', flip_ratio=0.5), 34 | dict(type='Normalize', **img_norm_cfg), 35 | dict(type='Pad', size_divisor=32), 36 | dict(type='DefaultFormatBundle'), 37 | dict(type='Collect', keys=['img', 
'gt_bboxes', 'gt_labels', 'gt_masks']), 38 | ] 39 | test_pipeline = [ 40 | dict(type='LoadImageFromFile'), 41 | dict( 42 | type='MultiScaleFlipAug', 43 | img_scale=(1333, 800), 44 | flip=False, 45 | transforms=[ 46 | dict(type='Resize', keep_ratio=True), 47 | dict(type='RandomFlip'), 48 | dict(type='Normalize', **img_norm_cfg), 49 | dict(type='Pad', size_divisor=32), 50 | dict(type='ImageToTensor', keys=['img']), 51 | dict(type='Collect', keys=['img']), 52 | ]) 53 | ] 54 | data = dict( 55 | train=dict(pipeline=train_pipeline), 56 | val=dict(pipeline=test_pipeline), 57 | test=dict(pipeline=test_pipeline)) 58 | # optimizer 59 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 60 | optimizer_config = dict(grad_clip=None) 61 | # learning policy 62 | lr_config = dict( 63 | policy='step', 64 | warmup='linear', 65 | warmup_iters=500, 66 | warmup_ratio=0.001, 67 | step=[28, 34]) 68 | total_epochs = 36 69 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_pvt_s_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/pvt_small.pth', 9 | backbone=dict( 10 | type='pvt_small', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn_pvt_t_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/mask_rcnn_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_instance.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/pvt_tiny.pth', 9 | backbone=dict( 10 | type='pvt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | -------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_m_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/retinanet_r50_fpn.py', 3 | '_base_/datasets/coco_detection.py', 4 | '_base_/default_runtime.py' 5 | ] 6 | model = dict( 7 | pretrained='pretrained/dpt_medium.pth', 8 | backbone=dict( 9 | type='dpt_medium', 10 | style='pytorch'), 11 | neck=dict( 12 | type='FPN', 13 | in_channels=[64, 128, 320, 512], 14 | out_channels=256, 15 | start_level=1, 16 | add_extra_convs='on_input', 17 | num_outs=5)) 18 | # optimizer 19 | 
optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 20 | optimizer_config = dict(grad_clip=None) 21 | # learning policy 22 | lr_config = dict( 23 | policy='step', 24 | warmup='linear', 25 | warmup_iters=500, 26 | warmup_ratio=0.001, 27 | step=[8, 11]) 28 | total_epochs = 12 29 | -------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_m_fpn_mstrain_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_detection.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_medium.pth', 9 | backbone=dict( 10 | type='dpt_medium', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs='on_input', 18 | num_outs=5)) 19 | img_norm_cfg = dict( 20 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 21 | train_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict(type='LoadAnnotations', with_bbox=True), 24 | dict( 25 | type='Resize', 26 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 27 | (1333, 768), (1333, 800)], 28 | multiscale_mode='value', 29 | keep_ratio=True), 30 | dict(type='RandomFlip', flip_ratio=0.5), 31 | dict(type='Normalize', **img_norm_cfg), 32 | dict(type='Pad', size_divisor=32), 33 | dict(type='DefaultFormatBundle'), 34 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 35 | ] 36 | test_pipeline = [ 37 | dict(type='LoadImageFromFile'), 38 | dict( 39 | type='MultiScaleFlipAug', 40 | img_scale=(1333, 800), 41 | flip=False, 42 | transforms=[ 43 | dict(type='Resize', keep_ratio=True), 44 | dict(type='RandomFlip'), 45 | dict(type='Normalize', **img_norm_cfg), 46 | dict(type='Pad', size_divisor=32), 47 | dict(type='ImageToTensor', keys=['img']), 48 | dict(type='Collect', keys=['img']), 49 | ]) 50 | ] 51 | data = dict( 52 | train=dict(pipeline=train_pipeline), 53 | val=dict(pipeline=test_pipeline), 54 | test=dict(pipeline=test_pipeline)) 55 | # optimizer 56 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 57 | optimizer_config = dict(grad_clip=None) 58 | # learning policy 59 | lr_config = dict( 60 | policy='step', 61 | warmup='linear', 62 | warmup_iters=500, 63 | warmup_ratio=0.001, 64 | step=[28, 34]) 65 | total_epochs = 36 66 | -------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_s_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/retinanet_r50_fpn.py', 3 | '_base_/datasets/coco_detection.py', 4 | '_base_/default_runtime.py' 5 | ] 6 | model = dict( 7 | pretrained='pretrained/dpt_small.pth', 8 | backbone=dict( 9 | type='dpt_small', 10 | style='pytorch'), 11 | neck=dict( 12 | type='FPN', 13 | in_channels=[64, 128, 320, 512], 14 | out_channels=256, 15 | start_level=1, 16 | add_extra_convs='on_input', 17 | num_outs=5)) 18 | # optimizer 19 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 20 | optimizer_config = dict(grad_clip=None) 21 | # learning policy 22 | lr_config = dict( 23 | policy='step', 24 | warmup='linear', 25 | warmup_iters=500, 26 | warmup_ratio=0.001, 27 | step=[8, 11]) 28 | total_epochs = 12 29 | 
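The DPT/PVT detection configs above all follow the same recipe: inherit the `_base_` model and dataset files, swap the backbone `type` and `pretrained` checkpoint, set the FPN `in_channels` to the transformer stage widths `[64, 128, 320, 512]`, and use AdamW as the optimizer. A minimal sketch of inspecting one of them with mmcv's `Config` API, assuming `mmcv==1.2.7` is installed and the working directory is `detection/`:

```python
# Sketch only: load a DPT RetinaNet config and print the pieces that differ from the ResNet-50 base.
from mmcv import Config

cfg = Config.fromfile('configs/retinanet_dpt_s_fpn_1x_coco.py')
print(cfg.model.backbone.type)     # 'dpt_small'
print(cfg.model.neck.in_channels)  # [64, 128, 320, 512]
print(cfg.optimizer)               # AdamW, lr=1e-4, weight_decay=1e-4
print(cfg.total_epochs)            # 12
```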
-------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_s_fpn_mstrain_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_detection.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_small.pth', 9 | backbone=dict( 10 | type='dpt_small', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs='on_input', 18 | num_outs=5)) 19 | img_norm_cfg = dict( 20 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 21 | train_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict(type='LoadAnnotations', with_bbox=True), 24 | dict( 25 | type='Resize', 26 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 27 | (1333, 768), (1333, 800)], 28 | multiscale_mode='value', 29 | keep_ratio=True), 30 | dict(type='RandomFlip', flip_ratio=0.5), 31 | dict(type='Normalize', **img_norm_cfg), 32 | dict(type='Pad', size_divisor=32), 33 | dict(type='DefaultFormatBundle'), 34 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 35 | ] 36 | test_pipeline = [ 37 | dict(type='LoadImageFromFile'), 38 | dict( 39 | type='MultiScaleFlipAug', 40 | img_scale=(1333, 800), 41 | flip=False, 42 | transforms=[ 43 | dict(type='Resize', keep_ratio=True), 44 | dict(type='RandomFlip'), 45 | dict(type='Normalize', **img_norm_cfg), 46 | dict(type='Pad', size_divisor=32), 47 | dict(type='ImageToTensor', keys=['img']), 48 | dict(type='Collect', keys=['img']), 49 | ]) 50 | ] 51 | data = dict( 52 | train=dict(pipeline=train_pipeline), 53 | val=dict(pipeline=test_pipeline), 54 | test=dict(pipeline=test_pipeline)) 55 | # optimizer 56 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 57 | optimizer_config = dict(grad_clip=None) 58 | # learning policy 59 | lr_config = dict( 60 | policy='step', 61 | warmup='linear', 62 | warmup_iters=500, 63 | warmup_ratio=0.001, 64 | step=[28, 34]) 65 | total_epochs = 36 66 | -------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_t_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_detection.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_tiny.pth', 9 | backbone=dict( 10 | type='dpt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs='on_input', 18 | num_outs=5)) 19 | # optimizer 20 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 21 | optimizer_config = dict(grad_clip=None) 22 | # learning policy 23 | lr_config = dict( 24 | policy='step', 25 | warmup='linear', 26 | warmup_iters=500, 27 | warmup_ratio=0.001, 28 | step=[8, 11]) 29 | total_epochs = 12 30 | -------------------------------------------------------------------------------- /detection/configs/retinanet_dpt_t_fpn_mstrain_3x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | 
'../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_detection.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/dpt_tiny.pth', 9 | backbone=dict( 10 | type='dpt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs='on_input', 18 | num_outs=5)) 19 | img_norm_cfg = dict( 20 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 21 | train_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict(type='LoadAnnotations', with_bbox=True), 24 | dict( 25 | type='Resize', 26 | img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), 27 | (1333, 768), (1333, 800)], 28 | multiscale_mode='value', 29 | keep_ratio=True), 30 | dict(type='RandomFlip', flip_ratio=0.5), 31 | dict(type='Normalize', **img_norm_cfg), 32 | dict(type='Pad', size_divisor=32), 33 | dict(type='DefaultFormatBundle'), 34 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 35 | ] 36 | test_pipeline = [ 37 | dict(type='LoadImageFromFile'), 38 | dict( 39 | type='MultiScaleFlipAug', 40 | img_scale=(1333, 800), 41 | flip=False, 42 | transforms=[ 43 | dict(type='Resize', keep_ratio=True), 44 | dict(type='RandomFlip'), 45 | dict(type='Normalize', **img_norm_cfg), 46 | dict(type='Pad', size_divisor=32), 47 | dict(type='ImageToTensor', keys=['img']), 48 | dict(type='Collect', keys=['img']), 49 | ]) 50 | ] 51 | data = dict( 52 | train=dict(pipeline=train_pipeline), 53 | val=dict(pipeline=test_pipeline), 54 | test=dict(pipeline=test_pipeline)) 55 | # optimizer 56 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 57 | optimizer_config = dict(grad_clip=None) 58 | # learning policy 59 | lr_config = dict( 60 | policy='step', 61 | warmup='linear', 62 | warmup_iters=500, 63 | warmup_ratio=0.001, 64 | step=[28, 34]) 65 | total_epochs = 36 66 | -------------------------------------------------------------------------------- /detection/configs/retinanet_pvt_s_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/retinanet_r50_fpn.py', 3 | '_base_/datasets/coco_detection.py', 4 | '_base_/default_runtime.py' 5 | ] 6 | model = dict( 7 | pretrained='pretrained/pvt_small.pth', 8 | backbone=dict( 9 | type='pvt_small', 10 | style='pytorch'), 11 | neck=dict( 12 | type='FPN', 13 | in_channels=[64, 128, 320, 512], 14 | out_channels=256, 15 | start_level=1, 16 | add_extra_convs='on_input', 17 | num_outs=5)) 18 | # optimizer 19 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 20 | optimizer_config = dict(grad_clip=None) 21 | # learning policy 22 | lr_config = dict( 23 | policy='step', 24 | warmup='linear', 25 | warmup_iters=500, 26 | warmup_ratio=0.001, 27 | step=[8, 11]) 28 | total_epochs = 12 29 | -------------------------------------------------------------------------------- /detection/configs/retinanet_pvt_s_fpn_1x_coco_640.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/default_runtime.py' 4 | ] 5 | model = dict( 6 | pretrained='pretrained/pvt_small.pth', 7 | backbone=dict( 8 | type='pvt_small', 9 | style='pytorch'), 10 | neck=dict( 11 | type='FPN', 12 | in_channels=[64, 128, 320, 512], 13 | out_channels=256, 14 | start_level=1, 15 | 
add_extra_convs='on_input', 16 | num_outs=5)) 17 | # optimizer 18 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 19 | optimizer_config = dict(grad_clip=None) 20 | # learning policy 21 | lr_config = dict( 22 | policy='step', 23 | warmup='linear', 24 | warmup_iters=500, 25 | warmup_ratio=0.001, 26 | step=[8, 11]) 27 | total_epochs = 12 28 | # dataset 29 | dataset_type = 'CocoDataset' 30 | data_root = 'data/coco/' 31 | img_norm_cfg = dict( 32 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 33 | train_pipeline = [ 34 | dict(type='LoadImageFromFile'), 35 | dict(type='LoadAnnotations', with_bbox=True), 36 | dict(type='Resize', img_scale=(1066, 640), keep_ratio=True), 37 | dict(type='RandomFlip', flip_ratio=0.5), 38 | dict(type='Normalize', **img_norm_cfg), 39 | dict(type='Pad', size_divisor=32), 40 | dict(type='DefaultFormatBundle'), 41 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 42 | ] 43 | test_pipeline = [ 44 | dict(type='LoadImageFromFile'), 45 | dict( 46 | type='MultiScaleFlipAug', 47 | img_scale=(1066, 640), 48 | flip=False, 49 | transforms=[ 50 | dict(type='Resize', keep_ratio=True), 51 | dict(type='RandomFlip'), 52 | dict(type='Normalize', **img_norm_cfg), 53 | dict(type='Pad', size_divisor=32), 54 | dict(type='ImageToTensor', keys=['img']), 55 | dict(type='Collect', keys=['img']), 56 | ]) 57 | ] 58 | data = dict( 59 | samples_per_gpu=2, 60 | workers_per_gpu=2, 61 | train=dict( 62 | type=dataset_type, 63 | ann_file=data_root + 'annotations/instances_train2017.json', 64 | img_prefix=data_root + 'train2017/', 65 | pipeline=train_pipeline), 66 | val=dict( 67 | type=dataset_type, 68 | ann_file=data_root + 'annotations/instances_val2017.json', 69 | img_prefix=data_root + 'val2017/', 70 | pipeline=test_pipeline), 71 | test=dict( 72 | type=dataset_type, 73 | ann_file=data_root + 'annotations/instances_val2017.json', 74 | img_prefix=data_root + 'val2017/', 75 | pipeline=test_pipeline)) 76 | evaluation = dict(interval=1, metric='bbox') 77 | -------------------------------------------------------------------------------- /detection/configs/retinanet_pvt_t_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../configs/_base_/models/retinanet_r50_fpn.py', 3 | '../configs/_base_/datasets/coco_detection.py', 4 | # '../configs/_base_/schedules/schedule_1x.py', 5 | '../configs/_base_/default_runtime.py' 6 | ] 7 | model = dict( 8 | pretrained='pretrained/pvt_tiny.pth', 9 | backbone=dict( 10 | type='pvt_tiny', 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[64, 128, 320, 512], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs='on_input', 18 | num_outs=5)) 19 | # optimizer 20 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0001) 21 | optimizer_config = dict(grad_clip=None) 22 | # learning policy 23 | lr_config = dict( 24 | policy='step', 25 | warmup='linear', 26 | warmup_iters=500, 27 | warmup_ratio=0.001, 28 | step=[8, 11]) 29 | total_epochs = 12 -------------------------------------------------------------------------------- /detection/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29500} 7 | 8 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 9 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch 
${@:4} 11 | -------------------------------------------------------------------------------- /detection/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29500} 6 | 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} 10 | -------------------------------------------------------------------------------- /detection/dpt_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpt import * 2 | -------------------------------------------------------------------------------- /detection/dpt_models/box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | 7 | __all__ = ["pointCoder", "pointwhCoder"] 8 | 9 | 10 | class pointCoder(nn.Module): 11 | def __init__(self, input_size, patch_count, weights=(1., 1.), tanh=True): 12 | super().__init__() 13 | self.input_size = input_size 14 | self.patch_count = patch_count 15 | self.weights = weights 16 | #self._generate_anchor() 17 | self.tanh = tanh 18 | 19 | def _generate_anchor(self, device="cpu"): 20 | anchors = [] 21 | patch_stride_y, patch_stride_x = 1. / self.patch_count[0], 1. / self.patch_count[1] 22 | for i in range(self.patch_count[0]): 23 | for j in range(self.patch_count[1]): 24 | y = (0.5+i)*patch_stride_y 25 | x = (0.5+j)*patch_stride_x 26 | anchors.append([x, y]) 27 | anchors = torch.as_tensor(anchors) 28 | self.anchor = torch.as_tensor(anchors, device=device) 29 | #self.register_buffer("anchor", anchors) 30 | 31 | @torch.cuda.amp.autocast(enabled=False) 32 | def forward(self, pts, model_offset=None): 33 | assert model_offset is None 34 | self.boxes = self.decode(pts) 35 | return self.boxes 36 | 37 | def decode(self, rel_codes): 38 | # print ('xyxy decoding') 39 | boxes = self.anchor 40 | pixel = 1./self.patch_count 41 | wx, wy = self.weights 42 | 43 | dx = F.tanh(rel_codes[:, :, 0]/wx) * pixel if self.tanh else rel_codes[:, :, 0]*pixel / wx 44 | dy = F.tanh(rel_codes[:, :, 1]/wy) * pixel if self.tanh else rel_codes[:, :, 1]*pixel / wy 45 | 46 | pred_boxes = torch.zeros_like(rel_codes) 47 | 48 | ref_x = boxes[:,0].unsqueeze(0) 49 | ref_y = boxes[:,1].unsqueeze(0) 50 | 51 | pred_boxes[:, :, 0] = dx + ref_x 52 | pred_boxes[:, :, 1] = dy + ref_y 53 | pred_boxes = pred_boxes.clamp_(min=0., max=1.)
54 | 55 | return pred_boxes 56 | 57 | def get_offsets(self): 58 | return (self.boxes - self.anchor) * self.input_size 59 | 60 | 61 | class pointwhCoder(pointCoder): 62 | def __init__(self, input_size, patch_count, weights=(1., 1.), pts=1, tanh=True, wh_bias=None): 63 | super().__init__(input_size=input_size, patch_count=patch_count, weights=weights, tanh=tanh) 64 | self.patch_pixel = pts 65 | self.wh_bias = None 66 | if wh_bias is not None: 67 | self.wh_bias = nn.Parameter(torch.zeros(2) + wh_bias) 68 | 69 | @torch.cuda.amp.autocast(enabled=False) 70 | def forward(self, boxes, img_size, output_size): 71 | self.input_size = img_size 72 | self.patch_count = output_size 73 | self._generate_anchor(device=boxes.device) 74 | if self.wh_bias is not None: 75 | boxes[:, :, 2:] = boxes[:, :, 2:] + self.wh_bias 76 | self.boxes = self.decode(boxes) 77 | points = self.meshgrid(self.boxes) 78 | return points 79 | 80 | def decode(self, rel_codes): 81 | # print ('xyxy decoding') 82 | boxes = self.anchor 83 | pixel_x, pixel_y = 1./self.patch_count[1], 1./self.patch_count[0] 84 | wx, wy, wh, ww = self.weights 85 | 86 | dx = F.tanh(rel_codes[:, :, 0]/wx) * pixel_x if self.tanh else rel_codes[:, :, 0]*pixel_x / wx 87 | dy = F.tanh(rel_codes[:, :, 1]/wy) * pixel_y if self.tanh else rel_codes[:, :, 1]*pixel_y / wy 88 | 89 | dw = F.relu(F.tanh(rel_codes[:, :, 2]/ww)) * pixel_x 90 | dh = F.relu(F.tanh(rel_codes[:, :, 3]/wh)) * pixel_y 91 | 92 | pred_boxes = torch.zeros_like(rel_codes) 93 | 94 | ref_x = boxes[:,0].unsqueeze(0) 95 | ref_y = boxes[:,1].unsqueeze(0) 96 | 97 | pred_boxes[:, :, 0] = dx + ref_x - dw 98 | pred_boxes[:, :, 1] = dy + ref_y - dh 99 | pred_boxes[:, :, 2] = dx + ref_x + dw 100 | pred_boxes[:, :, 3] = dy + ref_y + dh 101 | pred_boxes = pred_boxes.clamp_(min=0., max=1.) 
102 | 103 | return pred_boxes 104 | 105 | def get_offsets(self): 106 | return (self.boxes - self.anchor.repeat(1,2)) * self.input_size 107 | 108 | def get_scales(self): 109 | return (self.boxes[:, :, 2:] - self.boxes[:, :, :2]) * self.input_size 110 | 111 | def meshgrid(self, boxes): 112 | B = boxes.shape[0] 113 | xs, ys = boxes[:, :, 0::2], boxes[: , :, 1::2] 114 | xs = torch.nn.functional.interpolate(xs, size=self.patch_pixel, mode='linear', align_corners=True) 115 | ys = torch.nn.functional.interpolate(ys, size=self.patch_pixel, mode='linear', align_corners=True) 116 | xs, ys = xs.unsqueeze(3).repeat_interleave(self.patch_pixel, dim=3), ys.unsqueeze(2).repeat_interleave(self.patch_pixel, dim=2) 117 | results = torch.stack([xs, ys], dim = -1) 118 | results = results.reshape(B, self.patch_count[0]*self.patch_count[1]*self.patch_pixel*self.patch_pixel, 2) 119 | return results 120 | -------------------------------------------------------------------------------- /detection/dpt_models/depatch_embed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from functools import partial 4 | 5 | from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD 6 | from timm.models.helpers import load_pretrained 7 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 8 | from timm.models.resnet import resnet26d, resnet50d 9 | from timm.models.registry import register_model 10 | 11 | from timm.models import create_model 12 | from timm.models.vision_transformer import _cfg, Block 13 | from .ms_deform_attn_func import MSDeformAttnFunction 14 | 15 | class PatchEmbed(nn.Module): 16 | """ Image to Patch Embedding 17 | """ 18 | def __init__(self, img_size=224, patch_size=16, patch_count=14, in_chans=3, embed_dim=768, with_norm=False): 19 | super().__init__() 20 | patch_stride = img_size // patch_count 21 | patch_pad = (patch_stride * (patch_count - 1) + patch_size - img_size) // 2 22 | img_size = to_2tuple(img_size) 23 | patch_size = to_2tuple(patch_size) 24 | num_patches = patch_count * patch_count 25 | self.img_size = img_size 26 | self.patch_size = patch_size 27 | self.num_patches = num_patches 28 | 29 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_stride, padding=patch_pad) 30 | if with_norm: 31 | self.norm = nn.LayerNorm(embed_dim) 32 | 33 | def forward(self, x, **kwargs): 34 | B, C, H, W = x.shape 35 | # FIXME look at relaxing size constraints 36 | #assert H == self.img_size[0] and W == self.img_size[1], \ 37 | # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
38 | x = self.proj(x).flatten(2).transpose(1, 2) 39 | if hasattr(self, "norm"): 40 | x = self.norm(x) 41 | #assert x.shape[1] == self.num_patches 42 | return x 43 | 44 | 45 | class Simple_Patch(nn.Module): 46 | def __init__(self, offset_embed, img_size=224, patch_size=16, patch_pixel=16, patch_count=14, 47 | in_chans=3, embed_dim=192, another_linear=False, use_GE=False, local_feature=False, with_norm=False): 48 | super().__init__() 49 | self.H, self.W = patch_count, patch_count 50 | self.num_patches = patch_count * patch_count 51 | self.another_linear = another_linear 52 | if self.another_linear: 53 | self.patch_embed = PatchEmbed(img_size, 1 if local_feature else patch_size, patch_count, in_chans, embed_dim, with_norm=with_norm) 54 | self.act = nn.GELU() if use_GE else nn.Identity() 55 | self.offset_predictor = nn.Linear(embed_dim, offset_embed, bias=False) 56 | else: 57 | self.patch_embed = PatchEmbed(img_size, 1 if local_feature else patch_size, patch_count, in_chans, offset_embed) 58 | 59 | self.img_size, self.patch_size, self.patch_pixel, self.patch_count = img_size, patch_size, patch_pixel, patch_count 60 | self.in_chans, self.embed_dim = in_chans, embed_dim 61 | 62 | def reset_offset(self): 63 | if self.another_linear: 64 | nn.init.constant_(self.offset_predictor.weight, 0) 65 | if hasattr(self.offset_predictor, "bias") and self.offset_predictor.bias is not None: 66 | nn.init.constant_(self.offset_predictor.bias, 0) 67 | else: 68 | nn.init.constant_(self.patch_embed.proj.weight, 0) 69 | if hasattr(self.patch_embed.proj, "bias") and self.patch_embed.proj.bias is not None: 70 | nn.init.constant_(self.patch_embed.proj.bias, 0) 71 | print("Parameter for offsets reseted.") 72 | 73 | @torch.cuda.amp.autocast(enabled=False) 74 | def forward(self, x): 75 | #if x.dim() == 3: 76 | # B, H, W = x.shape[0], self.img_size, self.img_size 77 | # assert x.shape[1] == H * W 78 | # x = x.view(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 79 | B, C, H, W = x.shape 80 | img = x 81 | x = self.patch_embed(x) 82 | if self.another_linear: 83 | pred_offset = self.offset_predictor(self.act(x)) 84 | else: 85 | pred_offset = x.contiguous() 86 | output_size = (H // self.patch_size, W // self.patch_size) 87 | return self.get_output(img, pred_offset, img_size=(H, W), output_size=output_size), output_size 88 | 89 | class Simple_DePatch(Simple_Patch): 90 | def __init__(self, box_coder, show_dim=4, **kwargs): 91 | super().__init__(show_dim, **kwargs) 92 | self.box_coder = box_coder 93 | #self.register_buffer("value_spatial_shapes", torch.as_tensor([[self.img_size, self.img_size]], dtype=torch.long)) 94 | self.register_buffer("value_level_start_index", torch.as_tensor([0], dtype=torch.long)) 95 | self.output_proj = nn.Linear(self.in_chans * self.patch_pixel * self.patch_pixel, self.embed_dim) 96 | if kwargs["with_norm"]: 97 | self.with_norm=True 98 | self.norm = nn.LayerNorm(self.embed_dim) 99 | else: 100 | self.with_norm=False 101 | 102 | def get_output(self, img, pred_offset, img_size, output_size): 103 | #copyed 104 | B = img.shape[0] 105 | value_spatial_shapes = torch.as_tensor(img_size, dtype=torch.long, device=pred_offset.device).view(1, 2) 106 | num_sample_points = self.patch_pixel * self.patch_pixel * output_size[0] * output_size[1] 107 | 108 | sample_location = self.box_coder(pred_offset, img_size=img_size, output_size=output_size) 109 | sampling_locations = sample_location.view(B, num_sample_points,1,1,1,2).to(torch.float) 110 | attention_weights = torch.ones((B, num_sample_points, 1, 1, 1), 
device=img.device) 111 | x = img.view(B, self.in_chans, 1, -1).transpose(1, 3).contiguous() 112 | output = MSDeformAttnFunction.apply(x, value_spatial_shapes, self.value_level_start_index, sampling_locations, attention_weights, 1) 113 | # output_proj 114 | output = output.view(B, output_size[0]*output_size[1], self.in_chans*self.patch_pixel*self.patch_pixel) 115 | output = self.output_proj(output) 116 | if self.with_norm: 117 | output = self.norm(output) 118 | return output 119 | -------------------------------------------------------------------------------- /detection/dpt_models/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, 
sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /detection/pvt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from functools import partial 5 | 6 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 7 | from timm.models.registry import register_model 8 | from timm.models.vision_transformer import _cfg 9 | from mmdet.models.builder import BACKBONES 10 | from mmdet.utils import get_root_logger 11 | from mmcv.runner import load_checkpoint 12 | 13 | 14 | class Mlp(nn.Module): 15 | def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): 16 | super().__init__() 17 | out_features = out_features or in_features 18 | hidden_features = hidden_features or in_features 19 | self.fc1 = nn.Linear(in_features, hidden_features) 20 | self.act = act_layer() 21 | self.fc2 = nn.Linear(hidden_features, out_features) 22 | self.drop = nn.Dropout(drop) 23 | 24 | def forward(self, x): 25 | x = self.fc1(x) 26 | x = self.act(x) 27 | x = self.drop(x) 28 | x = self.fc2(x) 29 | x = self.drop(x) 30 | return x 31 | 32 | 33 | class Attention(nn.Module): 34 | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1): 35 | super().__init__() 36 | assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
37 | 38 | self.dim = dim 39 | self.num_heads = num_heads 40 | head_dim = dim // num_heads 41 | self.scale = qk_scale or head_dim ** -0.5 42 | 43 | self.q = nn.Linear(dim, dim, bias=qkv_bias) 44 | self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) 45 | self.attn_drop = nn.Dropout(attn_drop) 46 | self.proj = nn.Linear(dim, dim) 47 | self.proj_drop = nn.Dropout(proj_drop) 48 | 49 | self.sr_ratio = sr_ratio 50 | if sr_ratio > 1: 51 | self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) 52 | self.norm = nn.LayerNorm(dim) 53 | 54 | def forward(self, x, H, W): 55 | B, N, C = x.shape 56 | q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) 57 | 58 | if self.sr_ratio > 1: 59 | x_ = x.permute(0, 2, 1).reshape(B, C, H, W) 60 | x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) 61 | x_ = self.norm(x_) 62 | kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 63 | else: 64 | kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 65 | k, v = kv[0], kv[1] 66 | 67 | attn = (q @ k.transpose(-2, -1)) * self.scale 68 | attn = attn.softmax(dim=-1) 69 | attn = self.attn_drop(attn) 70 | 71 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 72 | x = self.proj(x) 73 | x = self.proj_drop(x) 74 | 75 | return x 76 | 77 | 78 | class Block(nn.Module): 79 | 80 | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., 81 | drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1): 82 | super().__init__() 83 | self.norm1 = norm_layer(dim) 84 | self.attn = Attention( 85 | dim, 86 | num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, 87 | attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio) 88 | # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here 89 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 90 | self.norm2 = norm_layer(dim) 91 | mlp_hidden_dim = int(dim * mlp_ratio) 92 | self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) 93 | 94 | def forward(self, x, H, W): 95 | x = x + self.drop_path(self.attn(self.norm1(x), H, W)) 96 | x = x + self.drop_path(self.mlp(self.norm2(x))) 97 | 98 | return x 99 | 100 | 101 | class PatchEmbed(nn.Module): 102 | """ Image to Patch Embedding 103 | """ 104 | 105 | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): 106 | super().__init__() 107 | img_size = to_2tuple(img_size) 108 | patch_size = to_2tuple(patch_size) 109 | 110 | self.img_size = img_size 111 | self.patch_size = patch_size 112 | assert img_size[0] % patch_size[0] == 0 and img_size[1] % patch_size[1] == 0, \ 113 | f"img_size {img_size} should be divided by patch_size {patch_size}." 
114 | self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1] 115 | self.num_patches = self.H * self.W 116 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) 117 | self.norm = nn.LayerNorm(embed_dim) 118 | 119 | def forward(self, x): 120 | B, C, H, W = x.shape 121 | 122 | x = self.proj(x).flatten(2).transpose(1, 2) 123 | x = self.norm(x) 124 | H, W = H // self.patch_size[0], W // self.patch_size[1] 125 | 126 | return x, (H, W) 127 | 128 | 129 | class PyramidVisionTransformer(nn.Module): 130 | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512], 131 | num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., 132 | attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, 133 | depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], F4=False): 134 | super().__init__() 135 | self.num_classes = num_classes 136 | self.depths = depths 137 | self.F4 = F4 138 | 139 | # patch_embed 140 | self.patch_embed1 = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=in_chans, 141 | embed_dim=embed_dims[0]) 142 | self.patch_embed2 = PatchEmbed(img_size=img_size // 4, patch_size=2, in_chans=embed_dims[0], 143 | embed_dim=embed_dims[1]) 144 | self.patch_embed3 = PatchEmbed(img_size=img_size // 8, patch_size=2, in_chans=embed_dims[1], 145 | embed_dim=embed_dims[2]) 146 | self.patch_embed4 = PatchEmbed(img_size=img_size // 16, patch_size=2, in_chans=embed_dims[2], 147 | embed_dim=embed_dims[3]) 148 | 149 | # pos_embed 150 | self.pos_embed1 = nn.Parameter(torch.zeros(1, self.patch_embed1.num_patches, embed_dims[0])) 151 | self.pos_drop1 = nn.Dropout(p=drop_rate) 152 | self.pos_embed2 = nn.Parameter(torch.zeros(1, self.patch_embed2.num_patches, embed_dims[1])) 153 | self.pos_drop2 = nn.Dropout(p=drop_rate) 154 | self.pos_embed3 = nn.Parameter(torch.zeros(1, self.patch_embed3.num_patches, embed_dims[2])) 155 | self.pos_drop3 = nn.Dropout(p=drop_rate) 156 | self.pos_embed4 = nn.Parameter(torch.zeros(1, self.patch_embed4.num_patches + 1, embed_dims[3])) 157 | self.pos_drop4 = nn.Dropout(p=drop_rate) 158 | 159 | # transformer encoder 160 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule 161 | cur = 0 162 | self.block1 = nn.ModuleList([Block( 163 | dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale, 164 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, 165 | sr_ratio=sr_ratios[0]) 166 | for i in range(depths[0])]) 167 | 168 | cur += depths[0] 169 | self.block2 = nn.ModuleList([Block( 170 | dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale, 171 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, 172 | sr_ratio=sr_ratios[1]) 173 | for i in range(depths[1])]) 174 | 175 | cur += depths[1] 176 | self.block3 = nn.ModuleList([Block( 177 | dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale, 178 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, 179 | sr_ratio=sr_ratios[2]) 180 | for i in range(depths[2])]) 181 | 182 | cur += depths[2] 183 | self.block4 = nn.ModuleList([Block( 184 | dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale, 185 | drop=drop_rate, attn_drop=attn_drop_rate, 
drop_path=dpr[cur + i], norm_layer=norm_layer, 186 | sr_ratio=sr_ratios[3]) 187 | for i in range(depths[3])]) 188 | 189 | # init weights 190 | trunc_normal_(self.pos_embed1, std=.02) 191 | trunc_normal_(self.pos_embed2, std=.02) 192 | trunc_normal_(self.pos_embed3, std=.02) 193 | trunc_normal_(self.pos_embed4, std=.02) 194 | self.apply(self._init_weights) 195 | 196 | def init_weights(self, pretrained=None): 197 | if isinstance(pretrained, str): 198 | logger = get_root_logger() 199 | load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger) 200 | 201 | def reset_drop_path(self, drop_path_rate): 202 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))] 203 | cur = 0 204 | for i in range(self.depths[0]): 205 | self.block1[i].drop_path.drop_prob = dpr[cur + i] 206 | 207 | cur += self.depths[0] 208 | for i in range(self.depths[1]): 209 | self.block2[i].drop_path.drop_prob = dpr[cur + i] 210 | 211 | cur += self.depths[1] 212 | for i in range(self.depths[2]): 213 | self.block3[i].drop_path.drop_prob = dpr[cur + i] 214 | 215 | cur += self.depths[2] 216 | for i in range(self.depths[3]): 217 | self.block4[i].drop_path.drop_prob = dpr[cur + i] 218 | 219 | def _init_weights(self, m): 220 | if isinstance(m, nn.Linear): 221 | trunc_normal_(m.weight, std=.02) 222 | if isinstance(m, nn.Linear) and m.bias is not None: 223 | nn.init.constant_(m.bias, 0) 224 | elif isinstance(m, nn.LayerNorm): 225 | nn.init.constant_(m.bias, 0) 226 | nn.init.constant_(m.weight, 1.0) 227 | 228 | def _get_pos_embed(self, pos_embed, patch_embed, H, W): 229 | if H * W == self.patch_embed1.num_patches: 230 | return pos_embed 231 | else: 232 | return F.interpolate( 233 | pos_embed.reshape(1, patch_embed.H, patch_embed.W, -1).permute(0, 3, 1, 2), 234 | size=(H, W), mode="bilinear").reshape(1, -1, H * W).permute(0, 2, 1) 235 | 236 | def forward_features(self, x): 237 | outs = [] 238 | 239 | B = x.shape[0] 240 | 241 | # stage 1 242 | x, (H, W) = self.patch_embed1(x) 243 | pos_embed1 = self._get_pos_embed(self.pos_embed1, self.patch_embed1, H, W) 244 | x = x + pos_embed1 245 | x = self.pos_drop1(x) 246 | for blk in self.block1: 247 | x = blk(x, H, W) 248 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 249 | outs.append(x) 250 | 251 | # stage 2 252 | x, (H, W) = self.patch_embed2(x) 253 | pos_embed2 = self._get_pos_embed(self.pos_embed2, self.patch_embed2, H, W) 254 | x = x + pos_embed2 255 | x = self.pos_drop2(x) 256 | for blk in self.block2: 257 | x = blk(x, H, W) 258 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 259 | outs.append(x) 260 | 261 | # stage 3 262 | x, (H, W) = self.patch_embed3(x) 263 | pos_embed3 = self._get_pos_embed(self.pos_embed3, self.patch_embed3, H, W) 264 | x = x + pos_embed3 265 | x = self.pos_drop3(x) 266 | for blk in self.block3: 267 | x = blk(x, H, W) 268 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 269 | outs.append(x) 270 | 271 | # stage 4 272 | x, (H, W) = self.patch_embed4(x) 273 | pos_embed4 = self._get_pos_embed(self.pos_embed4[:, 1:], self.patch_embed4, H, W) 274 | x = x + pos_embed4 275 | x = self.pos_drop4(x) 276 | for blk in self.block4: 277 | x = blk(x, H, W) 278 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 279 | outs.append(x) 280 | 281 | return outs 282 | 283 | def forward(self, x): 284 | x = self.forward_features(x) 285 | 286 | if self.F4: 287 | x = x[3:4] 288 | 289 | return x 290 | 291 | 292 | def _conv_filter(state_dict, patch_size=16): 293 | """ convert patch embedding weight 
from manual patchify + linear proj to conv""" 294 | out_dict = {} 295 | for k, v in state_dict.items(): 296 | if 'patch_embed.proj.weight' in k: 297 | v = v.reshape((v.shape[0], 3, patch_size, patch_size)) 298 | out_dict[k] = v 299 | 300 | return out_dict 301 | 302 | 303 | @BACKBONES.register_module() 304 | class pvt_tiny(PyramidVisionTransformer): 305 | def __init__(self, **kwargs): 306 | super(pvt_tiny, self).__init__( 307 | patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], 308 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], 309 | sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1) 310 | 311 | 312 | @BACKBONES.register_module() 313 | class pvt_small(PyramidVisionTransformer): 314 | def __init__(self, **kwargs): 315 | super(pvt_small, self).__init__( 316 | patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], 317 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], 318 | sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1) 319 | 320 | 321 | @BACKBONES.register_module() 322 | class pvt_small_f4(PyramidVisionTransformer): 323 | def __init__(self, **kwargs): 324 | super(pvt_small_f4, self).__init__( 325 | patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], 326 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], 327 | sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1, F4=True) 328 | -------------------------------------------------------------------------------- /detection/test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import warnings 4 | 5 | import mmcv 6 | import torch 7 | from mmcv import Config, DictAction 8 | from mmcv.cnn import fuse_conv_bn 9 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 10 | from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, 11 | wrap_fp16_model) 12 | 13 | from mmdet.apis import multi_gpu_test, single_gpu_test 14 | from mmdet.datasets import (build_dataloader, build_dataset, 15 | replace_ImageToTensor) 16 | from mmdet.models import build_detector 17 | import pvt 18 | import dpt_models 19 | 20 | 21 | def parse_args(): 22 | parser = argparse.ArgumentParser( 23 | description='MMDet test (and eval) a model') 24 | parser.add_argument('config', help='test config file path') 25 | parser.add_argument('checkpoint', help='checkpoint file') 26 | parser.add_argument('--out', help='output result file in pickle format') 27 | parser.add_argument( 28 | '--fuse-conv-bn', 29 | action='store_true', 30 | help='Whether to fuse conv and bn, this will slightly increase' 31 | 'the inference speed') 32 | parser.add_argument( 33 | '--format-only', 34 | action='store_true', 35 | help='Format the output results without perform evaluation. 
It is' 36 | 'useful when you want to format the result to a specific format and ' 37 | 'submit it to the test server') 38 | parser.add_argument( 39 | '--eval', 40 | type=str, 41 | nargs='+', 42 | help='evaluation metrics, which depends on the dataset, e.g., "bbox",' 43 | ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') 44 | parser.add_argument('--show', action='store_true', help='show results') 45 | parser.add_argument( 46 | '--show-dir', help='directory where painted images will be saved') 47 | parser.add_argument( 48 | '--show-score-thr', 49 | type=float, 50 | default=0.3, 51 | help='score threshold (default: 0.3)') 52 | parser.add_argument( 53 | '--gpu-collect', 54 | action='store_true', 55 | help='whether to use gpu to collect results.') 56 | parser.add_argument( 57 | '--tmpdir', 58 | help='tmp directory used for collecting results from multiple ' 59 | 'workers, available when gpu-collect is not specified') 60 | parser.add_argument( 61 | '--cfg-options', 62 | nargs='+', 63 | action=DictAction, 64 | help='override some settings in the used config, the key-value pair ' 65 | 'in xxx=yyy format will be merged into config file. If the value to ' 66 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 67 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 68 | 'Note that the quotation marks are necessary and that no white space ' 69 | 'is allowed.') 70 | parser.add_argument( 71 | '--options', 72 | nargs='+', 73 | action=DictAction, 74 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 75 | 'format will be kwargs for dataset.evaluate() function (deprecate), ' 76 | 'change to --eval-options instead.') 77 | parser.add_argument( 78 | '--eval-options', 79 | nargs='+', 80 | action=DictAction, 81 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 82 | 'format will be kwargs for dataset.evaluate() function') 83 | parser.add_argument( 84 | '--launcher', 85 | choices=['none', 'pytorch', 'slurm', 'mpi'], 86 | default='none', 87 | help='job launcher') 88 | parser.add_argument('--local_rank', type=int, default=0) 89 | args = parser.parse_args() 90 | if 'LOCAL_RANK' not in os.environ: 91 | os.environ['LOCAL_RANK'] = str(args.local_rank) 92 | 93 | if args.options and args.eval_options: 94 | raise ValueError( 95 | '--options and --eval-options cannot be both ' 96 | 'specified, --options is deprecated in favor of --eval-options') 97 | if args.options: 98 | warnings.warn('--options is deprecated in favor of --eval-options') 99 | args.eval_options = args.options 100 | return args 101 | 102 | 103 | def main(): 104 | args = parse_args() 105 | 106 | assert args.out or args.eval or args.format_only or args.show \ 107 | or args.show_dir, \ 108 | ('Please specify at least one operation (save/eval/format/show the ' 109 | 'results / save the results) with the argument "--out", "--eval"' 110 | ', "--format-only", "--show" or "--show-dir"') 111 | 112 | if args.eval and args.format_only: 113 | raise ValueError('--eval and --format_only cannot be both specified') 114 | 115 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): 116 | raise ValueError('The output file must be a pkl file.') 117 | 118 | cfg = Config.fromfile(args.config) 119 | if args.cfg_options is not None: 120 | cfg.merge_from_dict(args.cfg_options) 121 | # import modules from string list. 
122 | if cfg.get('custom_imports', None): 123 | from mmcv.utils import import_modules_from_strings 124 | import_modules_from_strings(**cfg['custom_imports']) 125 | # set cudnn_benchmark 126 | if cfg.get('cudnn_benchmark', False): 127 | torch.backends.cudnn.benchmark = True 128 | cfg.model.pretrained = None 129 | if cfg.model.get('neck'): 130 | if isinstance(cfg.model.neck, list): 131 | for neck_cfg in cfg.model.neck: 132 | if neck_cfg.get('rfp_backbone'): 133 | if neck_cfg.rfp_backbone.get('pretrained'): 134 | neck_cfg.rfp_backbone.pretrained = None 135 | elif cfg.model.neck.get('rfp_backbone'): 136 | if cfg.model.neck.rfp_backbone.get('pretrained'): 137 | cfg.model.neck.rfp_backbone.pretrained = None 138 | 139 | # in case the test dataset is concatenated 140 | if isinstance(cfg.data.test, dict): 141 | cfg.data.test.test_mode = True 142 | elif isinstance(cfg.data.test, list): 143 | for ds_cfg in cfg.data.test: 144 | ds_cfg.test_mode = True 145 | 146 | # init distributed env first, since logger depends on the dist info. 147 | if args.launcher == 'none': 148 | distributed = False 149 | else: 150 | distributed = True 151 | init_dist(args.launcher, **cfg.dist_params) 152 | 153 | # build the dataloader 154 | samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) 155 | if samples_per_gpu > 1: 156 | # Replace 'ImageToTensor' to 'DefaultFormatBundle' 157 | cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) 158 | dataset = build_dataset(cfg.data.test) 159 | data_loader = build_dataloader( 160 | dataset, 161 | samples_per_gpu=samples_per_gpu, 162 | workers_per_gpu=cfg.data.workers_per_gpu, 163 | dist=distributed, 164 | shuffle=False) 165 | 166 | # build the model and load checkpoint 167 | model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) 168 | fp16_cfg = cfg.get('fp16', None) 169 | if fp16_cfg is not None: 170 | wrap_fp16_model(model) 171 | checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') 172 | if args.fuse_conv_bn: 173 | model = fuse_conv_bn(model) 174 | # old versions did not save class info in checkpoints, this walkaround is 175 | # for backward compatibility 176 | if 'meta' in checkpoint and 'CLASSES' in checkpoint['meta']: 177 | model.CLASSES = checkpoint['meta']['CLASSES'] 178 | else: 179 | model.CLASSES = dataset.CLASSES 180 | 181 | if not distributed: 182 | model = MMDataParallel(model, device_ids=[0]) 183 | outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, 184 | args.show_score_thr) 185 | else: 186 | model = MMDistributedDataParallel( 187 | model.cuda(), 188 | device_ids=[torch.cuda.current_device()], 189 | broadcast_buffers=False) 190 | outputs = multi_gpu_test(model, data_loader, args.tmpdir, 191 | args.gpu_collect) 192 | 193 | rank, _ = get_dist_info() 194 | if rank == 0: 195 | if args.out: 196 | print(f'\nwriting results to {args.out}') 197 | mmcv.dump(outputs, args.out) 198 | kwargs = {} if args.eval_options is None else args.eval_options 199 | if args.format_only: 200 | dataset.format_results(outputs, **kwargs) 201 | if args.eval: 202 | eval_kwargs = cfg.get('evaluation', {}).copy() 203 | # hard-code way to remove EvalHook args 204 | for key in [ 205 | 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 206 | 'rule' 207 | ]: 208 | eval_kwargs.pop(key, None) 209 | eval_kwargs.update(dict(metric=args.eval, **kwargs)) 210 | print(dataset.evaluate(outputs, **eval_kwargs)) 211 | 212 | 213 | if __name__ == '__main__': 214 | main() 215 | 
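The --cfg-options flag parsed above merges dotted key=value pairs into the loaded config before the dataloader and detector are built, which is how settings such as samples_per_gpu or the pretrained path can be overridden at test time without editing the config file. A minimal sketch of the equivalent programmatic call; the config path and override values here are illustrative:

from mmcv import Config

cfg = Config.fromfile('configs/retinanet_dpt_s_fpn_1x_coco.py')
# same effect as passing: --cfg-options data.samples_per_gpu=1 model.pretrained=None
cfg.merge_from_dict({'data.samples_per_gpu': 1, 'model.pretrained': None})
print(cfg.data.samples_per_gpu, cfg.model.pretrained)  # 1 None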
-------------------------------------------------------------------------------- /detection/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import os 4 | import os.path as osp 5 | import time 6 | import warnings 7 | 8 | import mmcv 9 | import torch 10 | from mmcv import Config, DictAction 11 | from mmcv.runner import get_dist_info, init_dist 12 | from mmcv.utils import get_git_hash 13 | 14 | from mmdet import __version__ 15 | from mmdet.apis import set_random_seed, train_detector 16 | from mmdet.datasets import build_dataset 17 | from mmdet.models import build_detector 18 | from mmdet.utils import collect_env, get_root_logger 19 | import pvt 20 | import dpt_models 21 | 22 | 23 | def parse_args(): 24 | parser = argparse.ArgumentParser(description='Train a detector') 25 | parser.add_argument('config', help='train config file path') 26 | parser.add_argument('--work-dir', help='the dir to save logs and models') 27 | parser.add_argument( 28 | '--resume-from', help='the checkpoint file to resume from') 29 | parser.add_argument( 30 | '--no-validate', 31 | action='store_true', 32 | help='whether not to evaluate the checkpoint during training') 33 | group_gpus = parser.add_mutually_exclusive_group() 34 | group_gpus.add_argument( 35 | '--gpus', 36 | type=int, 37 | help='number of gpus to use ' 38 | '(only applicable to non-distributed training)') 39 | group_gpus.add_argument( 40 | '--gpu-ids', 41 | type=int, 42 | nargs='+', 43 | help='ids of gpus to use ' 44 | '(only applicable to non-distributed training)') 45 | parser.add_argument('--seed', type=int, default=None, help='random seed') 46 | parser.add_argument( 47 | '--deterministic', 48 | action='store_true', 49 | help='whether to set deterministic options for CUDNN backend.') 50 | parser.add_argument( 51 | '--options', 52 | nargs='+', 53 | action=DictAction, 54 | help='override some settings in the used config, the key-value pair ' 55 | 'in xxx=yyy format will be merged into config file (deprecate), ' 56 | 'change to --cfg-options instead.') 57 | parser.add_argument( 58 | '--cfg-options', 59 | nargs='+', 60 | action=DictAction, 61 | help='override some settings in the used config, the key-value pair ' 62 | 'in xxx=yyy format will be merged into config file. If the value to ' 63 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 64 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 65 | 'Note that the quotation marks are necessary and that no white space ' 66 | 'is allowed.') 67 | parser.add_argument( 68 | '--launcher', 69 | choices=['none', 'pytorch', 'slurm', 'mpi'], 70 | default='none', 71 | help='job launcher') 72 | parser.add_argument('--local_rank', type=int, default=0) 73 | args = parser.parse_args() 74 | if 'LOCAL_RANK' not in os.environ: 75 | os.environ['LOCAL_RANK'] = str(args.local_rank) 76 | 77 | if args.options and args.cfg_options: 78 | raise ValueError( 79 | '--options and --cfg-options cannot be both ' 80 | 'specified, --options is deprecated in favor of --cfg-options') 81 | if args.options: 82 | warnings.warn('--options is deprecated in favor of --cfg-options') 83 | args.cfg_options = args.options 84 | 85 | return args 86 | 87 | 88 | def main(): 89 | args = parse_args() 90 | 91 | cfg = Config.fromfile(args.config) 92 | if args.cfg_options is not None: 93 | cfg.merge_from_dict(args.cfg_options) 94 | # import modules from string list. 
95 | if cfg.get('custom_imports', None): 96 | from mmcv.utils import import_modules_from_strings 97 | import_modules_from_strings(**cfg['custom_imports']) 98 | # set cudnn_benchmark 99 | if cfg.get('cudnn_benchmark', False): 100 | torch.backends.cudnn.benchmark = True 101 | 102 | # work_dir is determined in this priority: CLI > segment in file > filename 103 | if args.work_dir is not None: 104 | # update configs according to CLI args if args.work_dir is not None 105 | cfg.work_dir = args.work_dir 106 | elif cfg.get('work_dir', None) is None: 107 | # use config filename as default work_dir if cfg.work_dir is None 108 | cfg.work_dir = osp.join('./work_dirs', 109 | osp.splitext(osp.basename(args.config))[0]) 110 | if args.resume_from is not None: 111 | cfg.resume_from = args.resume_from 112 | if args.gpu_ids is not None: 113 | cfg.gpu_ids = args.gpu_ids 114 | else: 115 | cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) 116 | 117 | # init distributed env first, since logger depends on the dist info. 118 | if args.launcher == 'none': 119 | distributed = False 120 | else: 121 | distributed = True 122 | init_dist(args.launcher, **cfg.dist_params) 123 | # re-set gpu_ids with distributed training mode 124 | _, world_size = get_dist_info() 125 | cfg.gpu_ids = range(world_size) 126 | 127 | # create work_dir 128 | mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) 129 | # dump config 130 | cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) 131 | # init the logger before other steps 132 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 133 | log_file = osp.join(cfg.work_dir, f'{timestamp}.log') 134 | logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) 135 | 136 | # init the meta dict to record some important information such as 137 | # environment info and seed, which will be logged 138 | meta = dict() 139 | # log env info 140 | env_info_dict = collect_env() 141 | env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) 142 | dash_line = '-' * 60 + '\n' 143 | logger.info('Environment info:\n' + dash_line + env_info + '\n' + 144 | dash_line) 145 | meta['env_info'] = env_info 146 | meta['config'] = cfg.pretty_text 147 | # log some basic info 148 | logger.info(f'Distributed training: {distributed}') 149 | logger.info(f'Config:\n{cfg.pretty_text}') 150 | 151 | # set random seeds 152 | if args.seed is not None: 153 | logger.info(f'Set random seed to {args.seed}, ' 154 | f'deterministic: {args.deterministic}') 155 | set_random_seed(args.seed, deterministic=args.deterministic) 156 | cfg.seed = args.seed 157 | meta['seed'] = args.seed 158 | meta['exp_name'] = osp.basename(args.config) 159 | 160 | model = build_detector( 161 | cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) 162 | 163 | datasets = [build_dataset(cfg.data.train)] 164 | if len(cfg.workflow) == 2: 165 | val_dataset = copy.deepcopy(cfg.data.val) 166 | val_dataset.pipeline = cfg.data.train.pipeline 167 | datasets.append(build_dataset(val_dataset)) 168 | if cfg.checkpoint_config is not None: 169 | # save mmdet version, config file content and class names in 170 | # checkpoints as meta data 171 | cfg.checkpoint_config.meta = dict( 172 | mmdet_version=__version__ + get_git_hash()[:7], 173 | CLASSES=datasets[0].CLASSES) 174 | # add an attribute for visualization convenience 175 | model.CLASSES = datasets[0].CLASSES 176 | train_detector( 177 | model, 178 | datasets, 179 | cfg, 180 | distributed=distributed, 181 | validate=(not args.no_validate), 182 | timestamp=timestamp, 183 | 
meta=meta) 184 | 185 | 186 | if __name__ == '__main__': 187 | main() 188 | -------------------------------------------------------------------------------- /ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in 
enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python setup.py build install 11 | -------------------------------------------------------------------------------- /ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | -------------------------------------------------------------------------------- /ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import warnings 14 | import math 15 | 16 | import torch 17 | from torch import nn 18 | import torch.nn.functional as F 19 | from torch.nn.init import xavier_uniform_, constant_ 20 | 21 | from ..functions import MSDeformAttnFunction 22 | 23 | 24 | def _is_power_of_2(n): 25 | if (not isinstance(n, int)) or (n < 0): 26 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 27 | return (n & (n-1) == 0) and n != 0 28 | 29 | 30 | class MSDeformAttn(nn.Module): 31 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 32 | """ 33 | Multi-Scale Deformable Attention Module 34 | :param d_model hidden dimension 35 | :param n_levels number of feature levels 36 | :param n_heads number of attention heads 37 | :param n_points number of sampling points per attention head per feature level 38 | """ 39 | super().__init__() 40 | if d_model % n_heads != 0: 41 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 42 | _d_per_head = d_model // n_heads 43 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 44 | if not _is_power_of_2(_d_per_head): 45 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 46 | "which is more efficient in our CUDA implementation.") 47 | 48 | self.im2col_step = 64 49 | 50 | self.d_model = d_model 51 | self.n_levels = n_levels 52 | self.n_heads = n_heads 53 | self.n_points = n_points 54 | 55 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 56 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 57 | self.value_proj = nn.Linear(d_model, d_model) 58 | self.output_proj = nn.Linear(d_model, d_model) 59 | 60 | self._reset_parameters() 61 | 62 | def _reset_parameters(self): 63 | constant_(self.sampling_offsets.weight.data, 0.) 64 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 65 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 66 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 67 | for i in range(self.n_points): 68 | grid_init[:, :, i, :] *= i + 1 69 | with torch.no_grad(): 70 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 71 | constant_(self.attention_weights.weight.data, 0.) 72 | constant_(self.attention_weights.bias.data, 0.) 73 | xavier_uniform_(self.value_proj.weight.data) 74 | constant_(self.value_proj.bias.data, 0.) 75 | xavier_uniform_(self.output_proj.weight.data) 76 | constant_(self.output_proj.bias.data, 0.) 
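A minimal usage sketch for MSDeformAttn, assuming the extension has been built with ops/make.sh and a CUDA device is available; the import path and the sizes below are illustrative assumptions, and the tensor shapes follow the forward() docstring that comes next.

import torch
from ops.modules import MSDeformAttn  # assumed import path; adjust to however the repo exposes the module

attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4).cuda()

# two feature levels of 32x32 and 16x16, flattened and concatenated along dim 1
spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long).cuda()   # (n_levels, 2)
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)),
                               spatial_shapes.prod(1).cumsum(0)[:-1]))            # (n_levels,)
len_in = int(spatial_shapes.prod(1).sum())                                        # sum of H_l * W_l

query = torch.rand(2, 100, 256).cuda()                # (N, Len_q, C)
reference_points = torch.rand(2, 100, 2, 2).cuda()    # (N, Len_q, n_levels, 2), normalized to [0, 1]
input_flatten = torch.rand(2, len_in, 256).cuda()     # (N, sum H_l*W_l, C)

output = attn(query, reference_points, input_flatten, spatial_shapes, level_start_index)
print(output.shape)                                   # torch.Size([2, 100, 256]) = (N, Len_q, C)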
77 | 78 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 79 | """ 80 | :param query (N, Length_{query}, C) 81 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 82 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 83 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 84 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 85 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 86 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 87 | 88 | :return output (N, Length_{query}, C) 89 | """ 90 | N, Len_q, _ = query.shape 91 | N, Len_in, _ = input_flatten.shape 92 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 93 | 94 | value = self.value_proj(input_flatten) 95 | if input_padding_mask is not None: 96 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 97 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 98 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 99 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 100 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 101 | # N, Len_q, n_heads, n_levels, n_points, 2 102 | if reference_points.shape[-1] == 2: 103 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 104 | sampling_locations = reference_points[:, :, None, :, None, :] \ 105 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 106 | elif reference_points.shape[-1] == 4: 107 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 108 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 109 | else: 110 | raise ValueError( 111 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 112 | output = MSDeformAttnFunction.apply( 113 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 114 | output = self.output_proj(output) 115 | return output 116 | -------------------------------------------------------------------------------- /ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('CUDA is not available') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | 13 | #include <ATen/ATen.h> 14 | #include <ATen/cuda/CUDAContext.h> 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implemented on the CPU"); 27 | } 28 | 29 | std::vector<at::Tensor> 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector<at::Tensor> 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include <ATen/ATen.h> 15 | #include <ATen/cuda/CUDAContext.h> 16 | #include <cuda.h> 17 | #include <cuda_runtime.h> 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data<int64_t>(), 68 | level_start_index.data<int64_t>(), 69 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data<scalar_t>()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 |
const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | grad_output_g.data<scalar_t>(), 137 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data<int64_t>(), 139 | level_start_index.data<int64_t>(), 140 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*!
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector<at::Tensor> 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*!
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | 
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | --------------------------------------------------------------------------------
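A minimal sketch of calling the pure-PyTorch reference path, ms_deform_attn_core_pytorch, on CPU with the same toy shapes used in ops/test.py; this is an illustrative usage assumption, and note that ms_deform_attn_func.py imports the compiled MultiScaleDeformableAttention module at import time, so the extension still has to be installed even though this path never launches a CUDA kernel.

import torch
# mirrors ops/test.py's import style, i.e. run from inside ops/
from functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

N, M, D = 1, 2, 2            # batch, attention heads, channels per head
Lq, L, P = 2, 2, 2           # queries, feature levels, sampling points per level
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)   # (H, W) of each level
S = int(shapes.prod(1).sum())                                  # total number of value locations

value = torch.rand(N, S, M, D) * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2)             # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

output = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(output.shape)                                            # torch.Size([1, 2, 4]) = (N, Lq, M*D)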