├── LICENSE ├── README.md ├── config ├── Config_Crnn.yaml ├── Config_DB.yaml ├── Config_ICT.yaml ├── Config_Maskformer.yaml ├── Config_ReBiSe.yaml ├── Config_Seg.yaml ├── Config_Solo.yaml └── Config_Yolox.yaml ├── data ├── __init__.py ├── boxes.py ├── cityscapes │ └── cityscapes_info.json ├── coco │ ├── coco_classes.py │ └── coco_stuff_10k_classes.py ├── data_utils.py ├── dataloader.py └── dataset.py ├── model ├── __init__.py ├── backbone.py ├── head.py ├── model_factory.py ├── models.py ├── neck.py └── utils │ ├── __init__.py │ ├── csp_utils.py │ ├── maskformer_utils.py │ ├── mobilenetv3_utils.py │ ├── mobilevit_utils.py │ ├── ops.py │ ├── rebise_utils.py │ ├── res_utils.py │ ├── swin_utils.py │ └── transformer_utils.py ├── requirements.txt ├── setup.py ├── tools ├── __init__.py ├── augmentation.py ├── boxes.py ├── evaluation_tools.py ├── loss │ ├── SigmoidFocalLoss_cuda.cpython-38-x86_64-linux-gnu.so │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── detr_criterion.cpython-38.pyc │ │ ├── detr_matcher.cpython-38.pyc │ │ ├── loss.cpython-38.pyc │ │ └── loss_utils.cpython-38.pyc │ ├── detr_criterion.py │ ├── detr_matcher.py │ ├── loss.py │ ├── loss_utils.py │ ├── sigmoid_focal_loss_cuda.cpython-38-x86_64-linux-gnu.so │ └── src │ │ ├── SigmoidFocalLoss.cpp │ │ ├── SigmoidFocalLoss_cuda.cu │ │ ├── sigmoid_focal_loss.cpp │ │ └── sigmoid_focal_loss_cuda.cu ├── misc.py ├── nms.py └── nninit.py ├── train_ddp.py ├── trainer_ddp.py └── utils ├── __init__.py ├── chars_v1_p.txt ├── common.py ├── standard_tools.py └── visualize.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pytorch-Devkit 2 | 3 | This repository contains related algorithms in many fields: 4 | * object detection 5 | * segmentation 6 | * OCR 7 | * image translation 8 | 9 | ## Table of Contents 10 | - [Pytorch-Devkit](#pytorch-devkit) 11 | - [Table of Contents](#table-of-contents) 12 | - [About Details](#about-details) 13 | - [How to Use](#how-to-use) 14 | - [Requirements](#requirements) 15 | 16 | ## About Details 17 | - Support distributed training 18 | - Support mixed precision training 19 | - Support multiple augments 20 | - Backbone: Mobilenetv3 MobileViT Resnet Swintransformer DarkNet StdcNet 21 | - NECK:FPN、PAFPN 22 | - Character Recognition:CRNN 23 | - Character detection:DBNET 24 | - Object detection:YOLOX 25 | - Segmentation:ReBiSegNet MaskFormer SOLOV2 26 | - Image translation:ICT 27 | ## How to Use 28 | 29 | ```bash 30 | $ python setup.py develop 31 | $ python train_ddp.py -f ./config/Config_Yolox.yaml 32 | ``` 33 | 34 | For details about how to configure related algorithms, see examples. 35 | 36 | 37 | ## Requirements 38 | 39 | * `requirements.txt` -------------------------------------------------------------------------------- /config/Config_Crnn.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: CRNN 2 | config_name: 'basic' 3 | backbone: 'basic' 4 | 5 | optimizer: 'adamW' 6 | scheduler: 'OneCycleLR' 7 | 8 | lr: 0.001 9 | batch_size: 2 10 | epoch: 10 11 | num_workers: 0 12 | 13 | aug: True 14 | augmentation_type: 'cls' 15 | aug_dicts: 16 | affine: 17 | translate_percent: [-0.02, 0.02] 18 | scale: [0.8, 1.2] 19 | rotate: [-3, 3] 20 | mode: 'constant' 21 | cval: [200, 255] 22 | addgaussiannoise: 23 | loc: 0 24 | scale: [0.0, 8.0] 25 | per_channel: 0.5 26 | multiply: 27 | mul: [0.4, 1.5] 28 | grayscale: 29 | alpha: [0.5, 1.0] 30 | gaussianblur: 31 | sigma: [0.6,1.4] 32 | 33 | input_size: (32, 256) 34 | num_classes: 6624 35 | 36 | dataset_name: 'OcrRec' 37 | train_list: ['/home/shaoran/github_source/OCR/text_renderer-master/example_data/output/chn_data/images', '/home/shaoran/github_source/OCR/text_renderer-master/example_data/output/chn_data/chn_data.txt','/home/shaoran/git/Pytorch_Projectization_Tools/utils/chars_v1_p.txt'] 38 | test_list: ['/home/shaoran/github_source/OCR/text_renderer-master/example_data/output/chn_data/images', '/home/shaoran/github_source/OCR/text_renderer-master/example_data/output/chn_data/chn_data.txt','/home/shaoran/git/Pytorch_Projectization_Tools/utils/chars_v1_p.txt'] -------------------------------------------------------------------------------- /config/Config_DB.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: DB 2 | config_name: 'mobilevit_xxs-PAFPN-640x640' 3 | backbone: 'mobilevit_xxs' 4 | neck: 'PAFPN' 5 | 6 | optimizer: 'adamW' 7 | scheduler: 'OneCycleLR' 8 | 9 | lr: 0.001 10 | batch_size: 2 11 | epoch: 10 12 | num_workers: 1 13 | 14 | aug: True 15 | augmentation_type: 'polygon' 16 | aug_dicts: 17 | affine: 18 | translate_percent: [-0.05, 0.05] 19 | scale: [0.5, 1.5] 20 | rotate: [-5, 5] 21 | mode: 'constant' 22 | cval: [200, 255] 23 | fliplr: 24 | p: 0.5 25 | flipud: 26 | p: 0.5 27 | addgaussiannoise: 28 | loc: 0 29 | scale: [0.0, 8.0] 30 | per_channel: 0.5 31 | multiply: 32 | mul: [0.4, 1.5] 33 | grayscale: 34 | alpha: [0.5, 1.0] 35 | gaussianblur: 
36 | sigma: [0.6,1.4] 37 | 38 | 39 | input_size: (640, 640) 40 | 41 | dataset_name: 'OcrDet' 42 | train_list: ['/home/shaoran/datasets/OCR/DET/datasets/test/img', '/home/shaoran/datasets/OCR/DET/datasets/test/gt'] 43 | test_list: ['/home/shaoran/datasets/OCR/DET/datasets/test/img', '/home/shaoran/datasets/OCR/DET/datasets/test/gt'] -------------------------------------------------------------------------------- /config/Config_ICT.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: ICT 2 | backbone: 'res_18' 3 | d_model: 512 4 | num_heads: 8 5 | dff: 1024 6 | num_layers: 2 7 | target_vocab_size: 1426 8 | max_length: 50 9 | pad_index: 1 10 | 11 | optimizer: 'adamW' 12 | scheduler: 'OneCycleLR' 13 | 14 | lr: 0.001 15 | batch_size: 2 16 | epoch: 10 17 | num_workers: 0 18 | 19 | aug: False 20 | augmentation_type: 'det' 21 | 22 | input_size: (64, 320) 23 | num_channel: 3 24 | 25 | dataset_name: 'IC' 26 | 27 | images_dir: '/home/shaoran/datasets/starsee/formula/rec/files/latex10k_test_v1_songti/images' 28 | labels_dir: '/home/shaoran/datasets/starsee/formula/rec/files/latex10k_test_v1_songti/matching.txt' 29 | chars_file: '/home/shaoran/datasets/starsee/formula/rec/files/vocab.txt' 30 | 31 | images_dir_val: '/home/shaoran/datasets/starsee/formula/rec/files/latex10k_test_v1_songti/images' 32 | labels_dir_val: '/home/shaoran/datasets/starsee/formula/rec/files/latex10k_test_v1_songti/matching.txt' 33 | 34 | 35 | -------------------------------------------------------------------------------- /config/Config_Maskformer.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: MaskFormer 2 | config_name: 'mobilenetv3_s' 3 | backbone: 'mobilenetv3_s' 4 | 5 | optimizer: 'adamW' 6 | scheduler: 'OneCycleLR' 7 | 8 | lr: 0.0001 9 | batch_size: 2 10 | epoch: 10 11 | num_workers: 2 12 | 13 | aug: True 14 | augmentation_type: 'seg' 15 | aug_dicts: 16 | affine: 17 | translate_percent: [-0.2, 0.2] 18 | scale: [0.5, 1.5] 19 | rotate: [-5, 5] 20 | mode: 'constant' 21 | cval: [200, 255] 22 | fliplr: 23 | p: 0.5 24 | flipud: 25 | p: 0.5 26 | multiply: 27 | mul: [0.9, 1.1] 28 | gaussianblur: 29 | sigma: [0.9,1.1] 30 | 31 | 32 | input_size: (512, 512) 33 | 34 | dataset_name: 'Coco_stff_10k' 35 | data_dir: '/home/shaoran/datasets/coco/coco_stuff_10k' 36 | num_classes: 171 37 | 38 | loss_criteria: True -------------------------------------------------------------------------------- /config/Config_ReBiSe.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: ReBiSe 2 | backbone: 'stdc_l' 3 | 4 | optimizer: 'adamW' 5 | scheduler: 'OneCycleLR' 6 | 7 | lr: 0.001 8 | batch_size: 4 9 | epoch: 10 10 | num_workers: 1 11 | 12 | aug: False 13 | augmentation_type: 'seg' 14 | input_size: (512, 1024) 15 | ignore_lb: 255 16 | 17 | dataset_name: 'Cityscapes' 18 | num_classes: 19 19 | images_dir: '/home/shaoran/datasets/cityscapes' 20 | json_file: '/home/shaoran/git/Pytorch_Projectization_Tools/data/cityscapes/cityscapes_info.json' 21 | 22 | name: 'val' 23 | 24 | name_val: 'val' 25 | loss_criteria: False -------------------------------------------------------------------------------- /config/Config_Seg.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: Seg 2 | encoder: 'se_resnext50_32x4d' 3 | weights: 'imagenet' 4 | activation: 'sigmoid' 5 | 6 | optimizer: 'adamW' 7 | scheduler: 'OneCycleLR' 8 | 9 | lr: 0.001 10 | 
batch_size: 2 11 | epoch: 10 12 | num_workers: 2 13 | 14 | aug: True 15 | augmentation_type: 'seg' 16 | input_size: (512, 512) 17 | 18 | classes: ['visible_row', 'visible_column', 'unvisible_row', 'unvisible_column'] 19 | dataset_name: 'test_seg' 20 | train_list: ['./torch_data/seg_test/images', './torch_data/seg_test/masks_vr', './torch_data/seg_test/masks_vc', './torch_data/seg_test/masks_ur', './torch_data/seg_test/masks_uc'] 21 | test_list: ['./torch_data/seg_test/images', './torch_data/seg_test/masks_vr', './torch_data/seg_test/masks_vc', './torch_data/seg_test/masks_ur', './torch_data/seg_test/masks_uc'] -------------------------------------------------------------------------------- /config/Config_Solo.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: SOLO 2 | config_name: 'res_18-768x512' 3 | backbone: 'res_18' 4 | 5 | optimizer: 'adamW' 6 | scheduler: 'OneCycleLR' 7 | 8 | lr: 0.0001 9 | batch_size: 1 10 | epoch: 50 11 | num_workers: 0 12 | 13 | aug: False 14 | augmentation_type: 'seg' 15 | aug_dicts: 16 | affine: 17 | translate_percent: [-0.1, 0.1] 18 | scale: [0.5, 1.5] 19 | rotate: [-5, 5] 20 | mode: 'constant' 21 | cval: [200, 255] 22 | fliplr: 23 | p: 0.5 24 | flipud: 25 | p: 0.5 26 | addgaussiannoise: 27 | loc: 0 28 | scale: [0.0, 8.0] 29 | per_channel: 0.5 30 | multiply: 31 | mul: [0.8, 1.2] 32 | 33 | 34 | # (768, 512):scale ((1, 56), (28, 112), (56, 224), (112, 448), (224, 896)) 35 | # (852, 512)): scale ((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)), 36 | # (1333, 800):scale ((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)) 37 | input_size: (768, 512) 38 | 39 | dataset_name: 'Coco' 40 | training_mission: 'instance' 41 | num_classes: 80 42 | data_dir: '/home/shaoran/datasets/coco2017' 43 | # 'val_annotation.json' 'instances_val2017.json' 44 | json_file: 'instances_val2017.json' 45 | # 'val2017' '' 46 | name: 'val2017' 47 | 48 | # 'val_annotation.json' 'instances_val2017.json' 49 | json_file_val: 'instances_val2017.json' 50 | # 'val2017' '' 51 | name_val: 'val2017' -------------------------------------------------------------------------------- /config/Config_Yolox.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: YOLOX 2 | config_name: 'dark_s-FPN' 3 | backbone: 'dark_s' 4 | neck: 'FPN' 5 | act: 'lrelu' 6 | 7 | optimizer: 'adamW' 8 | scheduler: 'OneCycleLR' 9 | 10 | lr: 0.0001 11 | batch_size: 2 12 | epoch: 10 13 | num_workers: 1 14 | 15 | aug: True 16 | augmentation_type: 'det' 17 | aug_dicts: 18 | affine: 19 | translate_percent: [-0.05, 0.05] 20 | scale: [0.5, 1.5] 21 | rotate: [-5, 5] 22 | mode: 'constant' 23 | cval: [200, 255] 24 | fliplr: 25 | p: 0.5 26 | flipud: 27 | p: 0.5 28 | addgaussiannoise: 29 | loc: 0 30 | scale: [0.0, 8.0] 31 | per_channel: 0.5 32 | multiply: 33 | mul: [0.4, 1.5] 34 | cutout: 35 | nb_iterations: [2, 6] 36 | size: 0.05 37 | squared: False 38 | fill_mode: "constant" 39 | cval: [0, 255] 40 | fill_per_channel: 0.5 41 | add: 42 | value: [-40, 40] 43 | per_channel: 0.5 44 | grayscale: 45 | alpha: [0.5, 1.0] 46 | gaussianblur: 47 | sigma: [0.6,1.4] 48 | 49 | input_size: (512, 512) 50 | 51 | dataset_name: 'Coco' 52 | training_mission: 'det' 53 | num_classes: 80 54 | data_dir: '/home/shaoran/Datasets/coco2017/' 55 | 56 | json_file: 'instances_val2017.json' 57 | name: 'val2017' 58 | 59 | json_file_val: 'instances_val2017.json' 60 | name_val: 'val2017' 61 | loss_criteria: True 
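The YAML configs above are consumed as plain dictionaries: `data/dataloader.py` (further down in this dump) reads keys such as `config['batch_size']` and turns the string-valued `input_size` into a tuple with `eval`. Below is a minimal sketch of loading one of these files, assuming PyYAML; `train_ddp.py` itself is not reproduced in this section, so the `-f` argument handling is only an illustration based on the README usage, not the repository's actual parsing code.

```python
# Hedged sketch: loading a config such as ./config/Config_Yolox.yaml.
# Assumption: train_ddp.py is not shown here, so its real argument parsing may differ;
# the dict-style access and the eval() of 'input_size' mirror data/dataloader.py.
import argparse
import yaml

parser = argparse.ArgumentParser()
parser.add_argument("-f", "--config_file", default="./config/Config_Yolox.yaml")
args = parser.parse_args()

with open(args.config_file, "r") as fp:
    config = yaml.safe_load(fp)              # plain dict, e.g. config['backbone'] == 'dark_s'

input_size = eval(config["input_size"])      # YAML keeps "(512, 512)" as a string; eval -> (512, 512)
print(config["experiment_name"], config["backbone"], input_size, config["batch_size"])
```

Note that `input_size` is written as a tuple-like string (e.g. `(512, 512)`), which YAML parses as a plain scalar; the code base relies on `eval` to convert it into a Python tuple before building datasets.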
-------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/data/__init__.py -------------------------------------------------------------------------------- /data/boxes.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | 11 | import numpy as np 12 | 13 | import torch 14 | import torchvision 15 | 16 | __all__ = [ 17 | "filter_box", 18 | "postprocess", 19 | "bboxes_iou", 20 | "matrix_iou", 21 | "adjust_box_anns", 22 | "xyxy2xywh", 23 | "xyxy2cxcywh", 24 | ] 25 | 26 | 27 | def filter_box(output, scale_range): 28 | """ 29 | output: (N, 5+class) shape 30 | """ 31 | min_scale, max_scale = scale_range 32 | w = output[:, 2] - output[:, 0] 33 | h = output[:, 3] - output[:, 1] 34 | keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale) 35 | return output[keep] 36 | 37 | 38 | def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45): 39 | box_corner = prediction.new(prediction.shape) 40 | box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 41 | box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 42 | box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 43 | box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 44 | prediction[:, :, :4] = box_corner[:, :, :4] 45 | 46 | output = [None for _ in range(len(prediction))] 47 | for i, image_pred in enumerate(prediction): 48 | 49 | if not image_pred.size(0): 50 | continue 51 | class_conf, class_pred = torch.max( 52 | image_pred[:, 5 : 5 + num_classes], 1, keepdim=True 53 | ) 54 | 55 | conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() 56 | detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1) 57 | detections = detections[conf_mask] 58 | if not detections.size(0): 59 | continue 60 | 61 | nms_out_index = torchvision.ops.batched_nms( 62 | detections[:, :4], 63 | detections[:, 4] * detections[:, 5], 64 | detections[:, 6], 65 | nms_thre, 66 | ) 67 | detections = detections[nms_out_index] 68 | if output[i] is None: 69 | output[i] = detections 70 | else: 71 | output[i] = torch.cat((output[i], detections)) 72 | 73 | return output 74 | 75 | 76 | def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): 77 | if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: 78 | raise IndexError 79 | 80 | if xyxy: 81 | tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2]) 82 | br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) 83 | area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) 84 | area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) 85 | else: 86 | tl = torch.max( 87 | (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), 88 | (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2), 89 | ) 90 | br = torch.min( 91 | (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), 92 | (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2), 93 | ) 94 | 95 | area_a = torch.prod(bboxes_a[:, 2:], 1) 96 | area_b = torch.prod(bboxes_b[:, 2:], 1) 97 | en = (tl < br).type(tl.type()).prod(dim=2) 98 | area_i = torch.prod(br - tl, 2) * en 99 | return area_i / (area_a[:, None] + area_b - area_i) 100 | 101 | 102 | def matrix_iou(a, b): 103 | """ 104 | return iou of a and b, numpy version for data 
augenmentation 105 | """ 106 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 107 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 108 | 109 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 110 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 111 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 112 | return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12) 113 | 114 | 115 | def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max): 116 | bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max) 117 | bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max) 118 | return bbox 119 | 120 | 121 | def xyxy2xywh(bboxes): 122 | bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] 123 | bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] 124 | return bboxes 125 | 126 | 127 | def xyxy2cxcywh(bboxes): 128 | bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] 129 | bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] 130 | bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5 131 | bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5 132 | return bboxes 133 | -------------------------------------------------------------------------------- /data/cityscapes/cityscapes_info.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "hasInstances": false, 4 | "category": "void", 5 | "catid": 0, 6 | "name": "unlabeled", 7 | "ignoreInEval": true, 8 | "id": 0, 9 | "color": [ 10 | 0, 11 | 0, 12 | 0 13 | ], 14 | "trainId": 255 15 | }, 16 | { 17 | "hasInstances": false, 18 | "category": "void", 19 | "catid": 0, 20 | "name": "ego vehicle", 21 | "ignoreInEval": true, 22 | "id": 1, 23 | "color": [ 24 | 0, 25 | 0, 26 | 0 27 | ], 28 | "trainId": 255 29 | }, 30 | { 31 | "hasInstances": false, 32 | "category": "void", 33 | "catid": 0, 34 | "name": "rectification border", 35 | "ignoreInEval": true, 36 | "id": 2, 37 | "color": [ 38 | 0, 39 | 0, 40 | 0 41 | ], 42 | "trainId": 255 43 | }, 44 | { 45 | "hasInstances": false, 46 | "category": "void", 47 | "catid": 0, 48 | "name": "out of roi", 49 | "ignoreInEval": true, 50 | "id": 3, 51 | "color": [ 52 | 0, 53 | 0, 54 | 0 55 | ], 56 | "trainId": 255 57 | }, 58 | { 59 | "hasInstances": false, 60 | "category": "void", 61 | "catid": 0, 62 | "name": "static", 63 | "ignoreInEval": true, 64 | "id": 4, 65 | "color": [ 66 | 0, 67 | 0, 68 | 0 69 | ], 70 | "trainId": 255 71 | }, 72 | { 73 | "hasInstances": false, 74 | "category": "void", 75 | "catid": 0, 76 | "name": "dynamic", 77 | "ignoreInEval": true, 78 | "id": 5, 79 | "color": [ 80 | 111, 81 | 74, 82 | 0 83 | ], 84 | "trainId": 255 85 | }, 86 | { 87 | "hasInstances": false, 88 | "category": "void", 89 | "catid": 0, 90 | "name": "ground", 91 | "ignoreInEval": true, 92 | "id": 6, 93 | "color": [ 94 | 81, 95 | 0, 96 | 81 97 | ], 98 | "trainId": 255 99 | }, 100 | { 101 | "hasInstances": false, 102 | "category": "flat", 103 | "catid": 1, 104 | "name": "road", 105 | "ignoreInEval": false, 106 | "id": 7, 107 | "color": [ 108 | 128, 109 | 64, 110 | 128 111 | ], 112 | "trainId": 0 113 | }, 114 | { 115 | "hasInstances": false, 116 | "category": "flat", 117 | "catid": 1, 118 | "name": "sidewalk", 119 | "ignoreInEval": false, 120 | "id": 8, 121 | "color": [ 122 | 244, 123 | 35, 124 | 232 125 | ], 126 | "trainId": 1 127 | }, 128 | { 129 | "hasInstances": false, 130 | "category": "flat", 131 | "catid": 1, 132 | "name": "parking", 133 | "ignoreInEval": true, 134 | "id": 9, 135 | "color": [ 136 | 250, 137 | 170, 138 | 160 139 | ], 140 | "trainId": 255 141 | }, 142 | { 143 | "hasInstances": false, 
144 | "category": "flat", 145 | "catid": 1, 146 | "name": "rail track", 147 | "ignoreInEval": true, 148 | "id": 10, 149 | "color": [ 150 | 230, 151 | 150, 152 | 140 153 | ], 154 | "trainId": 255 155 | }, 156 | { 157 | "hasInstances": false, 158 | "category": "construction", 159 | "catid": 2, 160 | "name": "building", 161 | "ignoreInEval": false, 162 | "id": 11, 163 | "color": [ 164 | 70, 165 | 70, 166 | 70 167 | ], 168 | "trainId": 2 169 | }, 170 | { 171 | "hasInstances": false, 172 | "category": "construction", 173 | "catid": 2, 174 | "name": "wall", 175 | "ignoreInEval": false, 176 | "id": 12, 177 | "color": [ 178 | 102, 179 | 102, 180 | 156 181 | ], 182 | "trainId": 3 183 | }, 184 | { 185 | "hasInstances": false, 186 | "category": "construction", 187 | "catid": 2, 188 | "name": "fence", 189 | "ignoreInEval": false, 190 | "id": 13, 191 | "color": [ 192 | 190, 193 | 153, 194 | 153 195 | ], 196 | "trainId": 4 197 | }, 198 | { 199 | "hasInstances": false, 200 | "category": "construction", 201 | "catid": 2, 202 | "name": "guard rail", 203 | "ignoreInEval": true, 204 | "id": 14, 205 | "color": [ 206 | 180, 207 | 165, 208 | 180 209 | ], 210 | "trainId": 255 211 | }, 212 | { 213 | "hasInstances": false, 214 | "category": "construction", 215 | "catid": 2, 216 | "name": "bridge", 217 | "ignoreInEval": true, 218 | "id": 15, 219 | "color": [ 220 | 150, 221 | 100, 222 | 100 223 | ], 224 | "trainId": 255 225 | }, 226 | { 227 | "hasInstances": false, 228 | "category": "construction", 229 | "catid": 2, 230 | "name": "tunnel", 231 | "ignoreInEval": true, 232 | "id": 16, 233 | "color": [ 234 | 150, 235 | 120, 236 | 90 237 | ], 238 | "trainId": 255 239 | }, 240 | { 241 | "hasInstances": false, 242 | "category": "object", 243 | "catid": 3, 244 | "name": "pole", 245 | "ignoreInEval": false, 246 | "id": 17, 247 | "color": [ 248 | 153, 249 | 153, 250 | 153 251 | ], 252 | "trainId": 5 253 | }, 254 | { 255 | "hasInstances": false, 256 | "category": "object", 257 | "catid": 3, 258 | "name": "polegroup", 259 | "ignoreInEval": true, 260 | "id": 18, 261 | "color": [ 262 | 153, 263 | 153, 264 | 153 265 | ], 266 | "trainId": 255 267 | }, 268 | { 269 | "hasInstances": false, 270 | "category": "object", 271 | "catid": 3, 272 | "name": "traffic light", 273 | "ignoreInEval": false, 274 | "id": 19, 275 | "color": [ 276 | 250, 277 | 170, 278 | 30 279 | ], 280 | "trainId": 6 281 | }, 282 | { 283 | "hasInstances": false, 284 | "category": "object", 285 | "catid": 3, 286 | "name": "traffic sign", 287 | "ignoreInEval": false, 288 | "id": 20, 289 | "color": [ 290 | 220, 291 | 220, 292 | 0 293 | ], 294 | "trainId": 7 295 | }, 296 | { 297 | "hasInstances": false, 298 | "category": "nature", 299 | "catid": 4, 300 | "name": "vegetation", 301 | "ignoreInEval": false, 302 | "id": 21, 303 | "color": [ 304 | 107, 305 | 142, 306 | 35 307 | ], 308 | "trainId": 8 309 | }, 310 | { 311 | "hasInstances": false, 312 | "category": "nature", 313 | "catid": 4, 314 | "name": "terrain", 315 | "ignoreInEval": false, 316 | "id": 22, 317 | "color": [ 318 | 152, 319 | 251, 320 | 152 321 | ], 322 | "trainId": 9 323 | }, 324 | { 325 | "hasInstances": false, 326 | "category": "sky", 327 | "catid": 5, 328 | "name": "sky", 329 | "ignoreInEval": false, 330 | "id": 23, 331 | "color": [ 332 | 70, 333 | 130, 334 | 180 335 | ], 336 | "trainId": 10 337 | }, 338 | { 339 | "hasInstances": true, 340 | "category": "human", 341 | "catid": 6, 342 | "name": "person", 343 | "ignoreInEval": false, 344 | "id": 24, 345 | "color": [ 346 | 220, 347 | 20, 348 | 60 349 | ], 
350 | "trainId": 11 351 | }, 352 | { 353 | "hasInstances": true, 354 | "category": "human", 355 | "catid": 6, 356 | "name": "rider", 357 | "ignoreInEval": false, 358 | "id": 25, 359 | "color": [ 360 | 255, 361 | 0, 362 | 0 363 | ], 364 | "trainId": 12 365 | }, 366 | { 367 | "hasInstances": true, 368 | "category": "vehicle", 369 | "catid": 7, 370 | "name": "car", 371 | "ignoreInEval": false, 372 | "id": 26, 373 | "color": [ 374 | 0, 375 | 0, 376 | 142 377 | ], 378 | "trainId": 13 379 | }, 380 | { 381 | "hasInstances": true, 382 | "category": "vehicle", 383 | "catid": 7, 384 | "name": "truck", 385 | "ignoreInEval": false, 386 | "id": 27, 387 | "color": [ 388 | 0, 389 | 0, 390 | 70 391 | ], 392 | "trainId": 14 393 | }, 394 | { 395 | "hasInstances": true, 396 | "category": "vehicle", 397 | "catid": 7, 398 | "name": "bus", 399 | "ignoreInEval": false, 400 | "id": 28, 401 | "color": [ 402 | 0, 403 | 60, 404 | 100 405 | ], 406 | "trainId": 15 407 | }, 408 | { 409 | "hasInstances": true, 410 | "category": "vehicle", 411 | "catid": 7, 412 | "name": "caravan", 413 | "ignoreInEval": true, 414 | "id": 29, 415 | "color": [ 416 | 0, 417 | 0, 418 | 90 419 | ], 420 | "trainId": 255 421 | }, 422 | { 423 | "hasInstances": true, 424 | "category": "vehicle", 425 | "catid": 7, 426 | "name": "trailer", 427 | "ignoreInEval": true, 428 | "id": 30, 429 | "color": [ 430 | 0, 431 | 0, 432 | 110 433 | ], 434 | "trainId": 255 435 | }, 436 | { 437 | "hasInstances": true, 438 | "category": "vehicle", 439 | "catid": 7, 440 | "name": "train", 441 | "ignoreInEval": false, 442 | "id": 31, 443 | "color": [ 444 | 0, 445 | 80, 446 | 100 447 | ], 448 | "trainId": 16 449 | }, 450 | { 451 | "hasInstances": true, 452 | "category": "vehicle", 453 | "catid": 7, 454 | "name": "motorcycle", 455 | "ignoreInEval": false, 456 | "id": 32, 457 | "color": [ 458 | 0, 459 | 0, 460 | 230 461 | ], 462 | "trainId": 17 463 | }, 464 | { 465 | "hasInstances": true, 466 | "category": "vehicle", 467 | "catid": 7, 468 | "name": "bicycle", 469 | "ignoreInEval": false, 470 | "id": 33, 471 | "color": [ 472 | 119, 473 | 11, 474 | 32 475 | ], 476 | "trainId": 18 477 | }, 478 | { 479 | "hasInstances": false, 480 | "category": "vehicle", 481 | "catid": 7, 482 | "name": "license plate", 483 | "ignoreInEval": true, 484 | "id": -1, 485 | "color": [ 486 | 0, 487 | 0, 488 | 142 489 | ], 490 | "trainId": -1 491 | } 492 | ] -------------------------------------------------------------------------------- /data/coco/coco_classes.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | # coding: utf-8 11 | 12 | COCO_CLASSES = ( 13 | "person", 14 | "bicycle", 15 | "car", 16 | "motorcycle", 17 | "airplane", 18 | "bus", 19 | "train", 20 | "truck", 21 | "boat", 22 | "traffic light", 23 | "fire hydrant", 24 | "stop sign", 25 | "parking meter", 26 | "bench", 27 | "bird", 28 | "cat", 29 | "dog", 30 | "horse", 31 | "sheep", 32 | "cow", 33 | "elephant", 34 | "bear", 35 | "zebra", 36 | "giraffe", 37 | "backpack", 38 | "umbrella", 39 | "handbag", 40 | "tie", 41 | "suitcase", 42 | "frisbee", 43 | "skis", 44 | "snowboard", 45 | "sports ball", 46 | "kite", 47 | "baseball bat", 48 | "baseball glove", 49 | "skateboard", 50 | "surfboard", 51 | "tennis racket", 52 | "bottle", 53 | "wine glass", 54 | "cup", 55 | "fork", 56 | "knife", 57 | "spoon", 58 | "bowl", 59 | "banana", 60 | "apple", 61 | "sandwich", 62 | 
"orange", 63 | "broccoli", 64 | "carrot", 65 | "hot dog", 66 | "pizza", 67 | "donut", 68 | "cake", 69 | "chair", 70 | "couch", 71 | "potted plant", 72 | "bed", 73 | "dining table", 74 | "toilet", 75 | "tv", 76 | "laptop", 77 | "mouse", 78 | "remote", 79 | "keyboard", 80 | "cell phone", 81 | "microwave", 82 | "oven", 83 | "toaster", 84 | "sink", 85 | "refrigerator", 86 | "book", 87 | "clock", 88 | "vase", 89 | "scissors", 90 | "teddy bear", 91 | "hair drier", 92 | "toothbrush", 93 | ) 94 | 95 | COCO_LABEL = [1, 2, 3, 4, 5, 6, 7, 8, 96 | 9, 10, 11, 13, 14, 15, 16, 17, 97 | 18, 19, 20, 21, 22, 23, 24, 25, 98 | 27, 28, 31, 32, 33, 34, 35, 36, 99 | 37, 38, 39, 40, 41, 42, 43, 44, 100 | 46, 47, 48, 49, 50, 51, 52, 53, 101 | 54, 55, 56, 57, 58, 59, 60, 61, 102 | 62, 63, 64, 65, 67, 70, 72, 73, 103 | 74, 75, 76, 77, 78, 79, 80, 81, 104 | 82, 84, 85, 86, 87, 88, 89, 90] 105 | 106 | COCO_LABEL_MAP = { 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 107 | 9: 9, 10: 10, 11: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 108 | 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 109 | 27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, 36: 32, 110 | 37: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 111 | 46: 41, 47: 42, 48: 43, 49: 44, 50: 45, 51: 46, 52: 47, 53: 48, 112 | 54: 49, 55: 50, 56: 51, 57: 52, 58: 53, 59: 54, 60: 55, 61: 56, 113 | 62: 57, 63: 58, 64: 59, 65: 60, 67: 61, 70: 62, 72: 63, 73: 64, 114 | 74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, 80: 71, 81: 72, 115 | 82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80} -------------------------------------------------------------------------------- /data/coco/coco_stuff_10k_classes.py: -------------------------------------------------------------------------------- 1 | COCO_STUFF_10k_CATEGORIES = [ 2 | {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, 3 | {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, 4 | {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, 5 | {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, 6 | {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, 7 | {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, 8 | {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, 9 | {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, 10 | {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, 11 | {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, 12 | {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, 13 | {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, 14 | {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, 15 | {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, 16 | {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, 17 | {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, 18 | {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, 19 | {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, 20 | {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, 21 | {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, 22 | {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, 23 | {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, 24 | {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, 25 | {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, 26 | 
{"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, 27 | {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, 28 | {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, 29 | {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, 30 | {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, 31 | {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, 32 | {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, 33 | {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, 34 | {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, 35 | {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, 36 | {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, 37 | {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, 38 | {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, 39 | {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, 40 | {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, 41 | {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, 42 | {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, 43 | {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, 44 | {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, 45 | {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, 46 | {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, 47 | {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, 48 | {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, 49 | {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, 50 | {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, 51 | {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, 52 | {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, 53 | {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, 54 | {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, 55 | {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, 56 | {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, 57 | {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, 58 | {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, 59 | {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, 60 | {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, 61 | {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, 62 | {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, 63 | {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, 64 | {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, 65 | {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, 66 | {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, 67 | {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, 68 | {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, 69 | {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, 70 | {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, 71 | {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, 72 | {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, 73 | {"color": [127, 167, 115], "isthing": 1, "id": 81, 
"name": "sink"}, 74 | {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, 75 | {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, 76 | {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, 77 | {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, 78 | {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, 79 | {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, 80 | {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, 81 | {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, 82 | {"id": 92, "name": "banner", "supercategory": "textile"}, 83 | {"id": 93, "name": "blanket", "supercategory": "textile"}, 84 | {"id": 94, "name": "branch", "supercategory": "plant"}, 85 | {"id": 95, "name": "bridge", "supercategory": "building"}, 86 | {"id": 96, "name": "building-other", "supercategory": "building"}, 87 | {"id": 97, "name": "bush", "supercategory": "plant"}, 88 | {"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"}, 89 | {"id": 99, "name": "cage", "supercategory": "structural"}, 90 | {"id": 100, "name": "cardboard", "supercategory": "raw-material"}, 91 | {"id": 101, "name": "carpet", "supercategory": "floor"}, 92 | {"id": 102, "name": "ceiling-other", "supercategory": "ceiling"}, 93 | {"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"}, 94 | {"id": 104, "name": "cloth", "supercategory": "textile"}, 95 | {"id": 105, "name": "clothes", "supercategory": "textile"}, 96 | {"id": 106, "name": "clouds", "supercategory": "sky"}, 97 | {"id": 107, "name": "counter", "supercategory": "furniture-stuff"}, 98 | {"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"}, 99 | {"id": 109, "name": "curtain", "supercategory": "textile"}, 100 | {"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"}, 101 | {"id": 111, "name": "dirt", "supercategory": "ground"}, 102 | {"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"}, 103 | {"id": 113, "name": "fence", "supercategory": "structural"}, 104 | {"id": 114, "name": "floor-marble", "supercategory": "floor"}, 105 | {"id": 115, "name": "floor-other", "supercategory": "floor"}, 106 | {"id": 116, "name": "floor-stone", "supercategory": "floor"}, 107 | {"id": 117, "name": "floor-tile", "supercategory": "floor"}, 108 | {"id": 118, "name": "floor-wood", "supercategory": "floor"}, 109 | {"id": 119, "name": "flower", "supercategory": "plant"}, 110 | {"id": 120, "name": "fog", "supercategory": "water"}, 111 | {"id": 121, "name": "food-other", "supercategory": "food-stuff"}, 112 | {"id": 122, "name": "fruit", "supercategory": "food-stuff"}, 113 | {"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"}, 114 | {"id": 124, "name": "grass", "supercategory": "plant"}, 115 | {"id": 125, "name": "gravel", "supercategory": "ground"}, 116 | {"id": 126, "name": "ground-other", "supercategory": "ground"}, 117 | {"id": 127, "name": "hill", "supercategory": "solid"}, 118 | {"id": 128, "name": "house", "supercategory": "building"}, 119 | {"id": 129, "name": "leaves", "supercategory": "plant"}, 120 | {"id": 130, "name": "light", "supercategory": "furniture-stuff"}, 121 | {"id": 131, "name": "mat", "supercategory": "textile"}, 122 | {"id": 132, "name": "metal", "supercategory": "raw-material"}, 123 | {"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"}, 124 | {"id": 134, "name": "moss", "supercategory": "plant"}, 125 | {"id": 135, "name": "mountain", 
"supercategory": "solid"}, 126 | {"id": 136, "name": "mud", "supercategory": "ground"}, 127 | {"id": 137, "name": "napkin", "supercategory": "textile"}, 128 | {"id": 138, "name": "net", "supercategory": "structural"}, 129 | {"id": 139, "name": "paper", "supercategory": "raw-material"}, 130 | {"id": 140, "name": "pavement", "supercategory": "ground"}, 131 | {"id": 141, "name": "pillow", "supercategory": "textile"}, 132 | {"id": 142, "name": "plant-other", "supercategory": "plant"}, 133 | {"id": 143, "name": "plastic", "supercategory": "raw-material"}, 134 | {"id": 144, "name": "platform", "supercategory": "ground"}, 135 | {"id": 145, "name": "playingfield", "supercategory": "ground"}, 136 | {"id": 146, "name": "railing", "supercategory": "structural"}, 137 | {"id": 147, "name": "railroad", "supercategory": "ground"}, 138 | {"id": 148, "name": "river", "supercategory": "water"}, 139 | {"id": 149, "name": "road", "supercategory": "ground"}, 140 | {"id": 150, "name": "rock", "supercategory": "solid"}, 141 | {"id": 151, "name": "roof", "supercategory": "building"}, 142 | {"id": 152, "name": "rug", "supercategory": "textile"}, 143 | {"id": 153, "name": "salad", "supercategory": "food-stuff"}, 144 | {"id": 154, "name": "sand", "supercategory": "ground"}, 145 | {"id": 155, "name": "sea", "supercategory": "water"}, 146 | {"id": 156, "name": "shelf", "supercategory": "furniture-stuff"}, 147 | {"id": 157, "name": "sky-other", "supercategory": "sky"}, 148 | {"id": 158, "name": "skyscraper", "supercategory": "building"}, 149 | {"id": 159, "name": "snow", "supercategory": "ground"}, 150 | {"id": 160, "name": "solid-other", "supercategory": "solid"}, 151 | {"id": 161, "name": "stairs", "supercategory": "furniture-stuff"}, 152 | {"id": 162, "name": "stone", "supercategory": "solid"}, 153 | {"id": 163, "name": "straw", "supercategory": "plant"}, 154 | {"id": 164, "name": "structural-other", "supercategory": "structural"}, 155 | {"id": 165, "name": "table", "supercategory": "furniture-stuff"}, 156 | {"id": 166, "name": "tent", "supercategory": "building"}, 157 | {"id": 167, "name": "textile-other", "supercategory": "textile"}, 158 | {"id": 168, "name": "towel", "supercategory": "textile"}, 159 | {"id": 169, "name": "tree", "supercategory": "plant"}, 160 | {"id": 170, "name": "vegetable", "supercategory": "food-stuff"}, 161 | {"id": 171, "name": "wall-brick", "supercategory": "wall"}, 162 | {"id": 172, "name": "wall-concrete", "supercategory": "wall"}, 163 | {"id": 173, "name": "wall-other", "supercategory": "wall"}, 164 | {"id": 174, "name": "wall-panel", "supercategory": "wall"}, 165 | {"id": 175, "name": "wall-stone", "supercategory": "wall"}, 166 | {"id": 176, "name": "wall-tile", "supercategory": "wall"}, 167 | {"id": 177, "name": "wall-wood", "supercategory": "wall"}, 168 | {"id": 178, "name": "water-other", "supercategory": "water"}, 169 | {"id": 179, "name": "waterdrops", "supercategory": "water"}, 170 | {"id": 180, "name": "window-blind", "supercategory": "window"}, 171 | {"id": 181, "name": "window-other", "supercategory": "window"}, 172 | {"id": 182, "name": "wood", "supercategory": "solid"}, 173 | ] 174 | -------------------------------------------------------------------------------- /data/dataloader.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | import torch 11 | from torch.utils.data import DataLoader 12 | 13 | 
import torchvision 14 | from torchvision import datasets 15 | from torchvision.transforms import ToTensor, Resize 16 | 17 | from loguru import logger 18 | import os 19 | import sys 20 | __dir__ = os.path.dirname(os.path.abspath(__file__)) 21 | sys.path.append(__dir__) 22 | sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) 23 | 24 | from tools.augmentation import Augmentation, BaseAugmentation 25 | from data.dataset import CustomDataset, CustomDataset_seg, OcrDetDataset, OcrRecDataset, CocoDataset,\ 26 | ImageCaptionDataset, CityscapesDataset, CocoStuff_10kDataset 27 | from data.data_utils import TrainTransform, TrainTransform_Instance 28 | 29 | trans = torchvision.transforms.Compose([ ToTensor(), Resize(size=(224, 224)) ]) 30 | 31 | class Data_loader(): 32 | def __init__(self, config, args): 33 | self.is_main_process = True if args.rank== 0 else False 34 | self.config = config 35 | self.train_list = config.get('train_list', None) 36 | self.test_list = config.get('test_list', None) 37 | self.dataset_name = self.config['dataset_name'] 38 | augmentation_type = self.config.get('augmentation_type', None) 39 | aug_dicts = config.get('aug_dicts', None) 40 | if aug_dicts is not None: 41 | aug=BaseAugmentation(config['aug_dicts'])() 42 | if self.is_main_process: 43 | print('Using config augmentation') 44 | else: 45 | aug = None 46 | if self.is_main_process: 47 | print('Using default augmentation') 48 | self.augmentation = Augmentation(task_type='cls' if augmentation_type is None else augmentation_type, aug=aug) 49 | 50 | 51 | def get_train(self, distributed=False, nprocs=1): 52 | if self.is_main_process: 53 | logger.info("Trian Dataset name: {}".format(self.dataset_name)) 54 | if self.dataset_name == 'custom': 55 | train_dataset = CustomDataset(lists=self.train_list, shape=eval(self.config['input_size']), 56 | augmentation=self.augmentation if self.config['aug'] else None, dtype='train' 57 | ) 58 | elif self.dataset_name == 'test_cls': 59 | train_dataset = datasets.CIFAR10( 60 | root="../torch_data", 61 | train=True, 62 | download=True, 63 | transform=trans 64 | ) 65 | elif self.dataset_name == 'test_seg': 66 | train_dataset = CustomDataset_seg(lists=self.train_list, shape=eval(self.config['input_size']), 67 | augmentation=self.augmentation if self.config['aug'] else None, dtype='train' 68 | ) 69 | elif self.dataset_name == 'OcrDet': 70 | train_dataset = OcrDetDataset(lists=self.train_list, shape=eval(self.config['input_size']), 71 | augmentation=self.augmentation if self.config['aug'] else None, dtype='train' 72 | ) 73 | elif self.dataset_name == 'OcrRec': 74 | train_dataset = OcrRecDataset( lists=self.train_list, shape=eval(self.config['input_size']), 75 | augmentation=self.augmentation if self.config['aug'] else None, dtype='train' 76 | ) 77 | elif self.dataset_name == 'Coco': 78 | train_dataset = CocoDataset(data_dir=self.config['data_dir'], json_file=self.config['json_file'], name=self.config['name'], 79 | shape=eval(self.config['input_size']), augmentation=self.augmentation if self.config['aug'] else None, 80 | preproc=TrainTransform(rgb_means=(0.485, 0.456, 0.406), 81 | std=(0.229, 0.224, 0.225), 82 | max_labels=50 83 | ) if self.config.get('training_mission', 'det') != 'instance' else TrainTransform_Instance(rgb_means=(0.485, 0.456, 0.406), 84 | std=(0.229, 0.224, 0.225)), 85 | mode= self.config.get('training_mission', 'det'), 86 | dtype='train' 87 | ) 88 | elif self.dataset_name == 'IC': 89 | train_dataset = ImageCaptionDataset(images_dir=self.config['images_dir'], 
labels_dir=self.config['labels_dir'], chars_file=self.config['chars_file'], 90 | shape=eval(self.config['input_size']), augmentation=self.augmentation if self.config['aug'] else None, 91 | max_length = int(self.config.get('max_length', 50)), 92 | num_channel= int(self.config.get('num_channel', 3)), 93 | dtype='train' 94 | ) 95 | elif self.dataset_name == 'Cityscapes': 96 | train_dataset = CityscapesDataset(data_dir=self.config['images_dir'], json_file=self.config.get('json_file', None), shape=eval(self.config['input_size']), 97 | augmentation=self.augmentation if self.config['aug'] else None, name=self.config['name'], 98 | dtype='train', ignore_lb=int(self.config.get('ignore_lb', 255)), 99 | scales=(0.125, 0.25, 0.375, 0.5, 0.675, 0.75, 0.875, 1.0, 1.25, 1.5) 100 | ) 101 | 102 | elif self.dataset_name == 'Coco_stff_10k': 103 | train_dataset = CocoStuff_10kDataset(data_dir=self.config['data_dir'], shape=eval(self.config['input_size']), 104 | dtype='train', mean=None, std=None 105 | ) 106 | 107 | else: 108 | raise NotImplementedError('{} dataset_name not supported.'.format(self.dataset_name)) 109 | 110 | if self.is_main_process: 111 | logger.info("Train Dataset samples: {}".format(len(train_dataset))) 112 | 113 | if not distributed: 114 | return DataLoader(train_dataset, batch_size=self.config['batch_size'], shuffle=self.config.get('shuffle', True), 115 | num_workers=self.config['num_workers'], collate_fn=train_dataset.get_collate_fn()) 116 | 117 | else: 118 | assert self.config['batch_size'] % nprocs == 0 119 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) 120 | sampler_flag = self.config.get('sampler', True) 121 | train_loader = DataLoader(train_dataset, 122 | batch_size=self.config['batch_size'] // nprocs, 123 | num_workers=max(self.config['num_workers'] // nprocs, 1), 124 | pin_memory=True, 125 | sampler=train_sampler if sampler_flag else None, 126 | collate_fn=train_dataset.get_collate_fn()) 127 | return train_loader, train_sampler 128 | 129 | 130 | def get_test(self, distributed=False, nprocs=1): 131 | if self.is_main_process: 132 | logger.info("Test Dataset name: {}".format(self.dataset_name)) 133 | 134 | if self.dataset_name == 'custom': 135 | test_dataset = CustomDataset(lists=self.test_list, shape=eval(self.config['input_size']), 136 | augmentation=None, dtype='val' 137 | ) 138 | elif self.dataset_name == 'test_cls': 139 | test_dataset = datasets.CIFAR10( 140 | root="../torch_data", 141 | train=False, 142 | download=True, 143 | transform=trans 144 | ) 145 | elif self.dataset_name == 'test_seg': 146 | test_dataset = CustomDataset_seg(lists=self.test_list, shape=eval(self.config['input_size']), 147 | augmentation=None, dtype='val' 148 | ) 149 | elif self.dataset_name == 'OcrDet': 150 | test_dataset = OcrDetDataset(lists=self.test_list, shape=eval(self.config['input_size']), 151 | augmentation=None, dtype='val' 152 | ) 153 | elif self.dataset_name == 'OcrRec': 154 | test_dataset = OcrRecDataset( lists=self.test_list, shape=eval(self.config['input_size']), 155 | augmentation=None, dtype='val' 156 | ) 157 | elif self.dataset_name == 'Coco': 158 | test_dataset = CocoDataset(data_dir=self.config['data_dir'], json_file=self.config['json_file_val'], name=self.config['name_val'], 159 | shape=eval(self.config['input_size']), augmentation=None, 160 | preproc=TrainTransform(rgb_means=(0.485, 0.456, 0.406), 161 | std=(0.229, 0.224, 0.225), 162 | max_labels=50 163 | ) if self.config.get('training_mission', 'det') != 'instance' else 
TrainTransform_Instance(rgb_means=(0.485, 0.456, 0.406), 164 | std=(0.229, 0.224, 0.225), with_box=True), 165 | mode= self.config.get('training_mission', 'det'), 166 | dtype='val' 167 | ) 168 | elif self.dataset_name == 'IC': 169 | test_dataset = ImageCaptionDataset(images_dir=self.config['images_dir_val'], labels_dir=self.config['labels_dir_val'], chars_file=self.config['chars_file'], 170 | shape=eval(self.config['input_size']), augmentation= None, 171 | max_length = int(self.config.get('max_length', 50)), 172 | num_channel= int(self.config.get('num_channel', 3)), 173 | dtype='val' 174 | ) 175 | 176 | elif self.dataset_name == 'Cityscapes': 177 | test_dataset = CityscapesDataset(data_dir=self.config['images_dir'], json_file=self.config.get('json_file', None), shape=eval(self.config['input_size']), 178 | augmentation=None, name=self.config['name_val'], 179 | dtype='val', ignore_lb=int(self.config.get('ignore_lb', 255)), 180 | scales=(0.125, 0.25, 0.375, 0.5, 0.675, 0.75, 0.875, 1.0, 1.25, 1.5) 181 | ) 182 | elif self.dataset_name == 'Coco_stff_10k': 183 | test_dataset = CocoStuff_10kDataset(data_dir=self.config['data_dir'], shape=eval(self.config['input_size']), 184 | dtype='test', mean=self.config.get('mean', None), std=self.config.get('std', None) 185 | ) 186 | 187 | else: 188 | raise NotImplementedError('{} dataset_name not supported.'.format(self.dataset_name)) 189 | if self.is_main_process: 190 | logger.info("Test Dataset samples: {}".format(len(test_dataset))) 191 | 192 | if not distributed: 193 | return DataLoader(test_dataset, batch_size=self.config['batch_size'], shuffle=False, 194 | num_workers=self.config['num_workers'], collate_fn=test_dataset.get_collate_fn()) 195 | else: 196 | test_sampler = torch.utils.data.distributed.DistributedSampler(test_dataset) 197 | sampler_flag = self.config.get('sampler', True) 198 | test_loader = DataLoader(test_dataset, 199 | batch_size=self.config['batch_size'] // nprocs, 200 | num_workers=max(self.config['num_workers'] // nprocs, 1), 201 | pin_memory=True, 202 | sampler=test_sampler if sampler_flag else None, 203 | collate_fn=test_dataset.get_collate_fn() 204 | ) 205 | return test_loader, test_sampler 206 | 207 | 208 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | # __all__ = ['DB_Model', 'Segmentation_Model', 'Classify_Model', 'Yolox_Model', 'Crnn_Model', 'Solo_Model', 'ICTransformer', 'ReBiSeNet_Model', 'MaskFormer_Model'] -------------------------------------------------------------------------------- /model/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/model/utils/__init__.py -------------------------------------------------------------------------------- /model/utils/csp_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import numpy as np 15 | 16 | import os 17 | import sys 18 | 19 | __dir__ = os.path.dirname(os.path.abspath(__file__)) 20 | sys.path.append(__dir__) 21 | sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) 22 | 23 | from model.utils.ops import CBA, 
get_activation 24 | 25 | class ResLayer(nn.Module): 26 | "Residual layer with `in_channels` inputs." 27 | 28 | def __init__(self, in_channels: int): 29 | super().__init__() 30 | mid_channels = in_channels // 2 31 | self.layer1 = CBA( 32 | in_channels, mid_channels, ksize=1, stride=1, act="lrelu" 33 | ) 34 | self.layer2 = CBA( 35 | mid_channels, in_channels, ksize=3, stride=1, act="lrelu" 36 | ) 37 | 38 | def forward(self, x): 39 | out = self.layer2(self.layer1(x)) 40 | return x + out 41 | 42 | class Focus(nn.Module): 43 | """Focus width and height information into channel space.""" 44 | 45 | def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): 46 | super().__init__() 47 | self.conv = CBA(in_channels * 4, out_channels, ksize, stride, act=act) 48 | 49 | def forward(self, x): 50 | patch_top_left = x[..., ::2, ::2] 51 | patch_top_right = x[..., ::2, 1::2] 52 | patch_bot_left = x[..., 1::2, ::2] 53 | patch_bot_right = x[..., 1::2, 1::2] 54 | x = torch.cat( 55 | ( 56 | patch_top_left, 57 | patch_bot_left, 58 | patch_top_right, 59 | patch_bot_right, 60 | ), 61 | dim=1, 62 | ) 63 | return self.conv(x) 64 | 65 | class Bottleneck(nn.Module): 66 | def __init__( 67 | self, 68 | in_channels, 69 | out_channels, 70 | shortcut=True, 71 | expansion=0.5, 72 | act="silu", 73 | ): 74 | super().__init__() 75 | hidden_channels = int(out_channels * expansion) 76 | self.conv1 = CBA(in_channels, hidden_channels, 1, stride=1, act=act) 77 | self.conv2 = CBA(hidden_channels, out_channels, 3, stride=1, act=act) 78 | self.use_add = shortcut and in_channels == out_channels 79 | 80 | def forward(self, x): 81 | y = self.conv2(self.conv1(x)) 82 | if self.use_add: 83 | y = y + x 84 | return y 85 | 86 | class CSPLayer(nn.Module): 87 | """C3 in yolov5, CSP Bottleneck with 3 convolutions 88 | x: c,w,h 89 | CBA(): o//2,w,h CBA():o//2,w,h 90 | Bottelneck():o//2,w,h 91 | cat(): o,w,h 92 | CBA():o,w,h 93 | 94 | """ 95 | 96 | def __init__( 97 | self, 98 | in_channels, 99 | out_channels, 100 | n=1, 101 | shortcut=False, 102 | expansion=0.5, 103 | act="silu", 104 | ): 105 | """ 106 | Args: 107 | in_channels (int): input channels. 108 | out_channels (int): output channels. 109 | n (int): number of Bottlenecks. Default value: 1. 
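                shortcut (bool): whether the stacked Bottlenecks use residual additions. Default value: False.
                expansion (float): ratio of hidden channels to out_channels. Default value: 0.5.
                act (str): activation passed to CBA ("silu", "relu" or "lrelu"). Default value: "silu".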
110 | """ 111 | super().__init__() 112 | hidden_channels = int(out_channels * expansion) 113 | self.conv1 = CBA(in_channels, hidden_channels, 1, stride=1, act=act) 114 | self.conv2 = CBA(in_channels, hidden_channels, 1, stride=1, act=act) 115 | self.conv3 = CBA(2 * hidden_channels, out_channels, 1, stride=1, act=act) 116 | module_list = [ 117 | Bottleneck( 118 | hidden_channels, hidden_channels, shortcut, 1.0, act=act 119 | ) 120 | for _ in range(n) 121 | ] 122 | self.m = nn.Sequential(*module_list) 123 | 124 | def forward(self, x): 125 | x_1 = self.conv1(x) 126 | x_2 = self.conv2(x) 127 | x_1 = self.m(x_1) 128 | x = torch.cat((x_1, x_2), dim=1) 129 | return self.conv3(x) 130 | 131 | class SPPBottleneck(nn.Module): 132 | """Spatial pyramid pooling layer used in YOLOv3-SPP 133 | 134 | x:c,w,h 135 | CBA():c//2,w,h 136 | maxpool2d(5):c//2,w,h *:c//2,w,h maxpool2d(9):c//2,w,h maxpool2d(13):c//2,w,h 137 | cat(): c*2, w, h 138 | CBA():o,w,h 139 | 140 | """ 141 | 142 | def __init__( 143 | self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" 144 | ): 145 | super().__init__() 146 | hidden_channels = in_channels // 2 147 | self.conv1 = CBA(in_channels, hidden_channels, 1, stride=1, act=activation) 148 | self.m = nn.ModuleList( 149 | [ 150 | nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) 151 | for ks in kernel_sizes 152 | ] 153 | ) 154 | conv2_channels = hidden_channels * (len(kernel_sizes) + 1) 155 | self.conv2 = CBA(conv2_channels, out_channels, 1, stride=1, act=activation) 156 | 157 | def forward(self, x): 158 | x = self.conv1(x) 159 | x = torch.cat([x] + [m(x) for m in self.m], dim=1) 160 | x = self.conv2(x) 161 | return x 162 | 163 | class SPPBottleneck_1D(nn.Module): 164 | """Spatial pyramid pooling layer used in YOLOv3-SPP 165 | 166 | x:c,w 167 | CBA():c//2,w 168 | maxpool2d(5):c//2,w *:c//2,w maxpool2d(9):c//2,w maxpool2d(13):c//2,w 169 | cat(): c*2, w 170 | CBA():o,w 171 | 172 | """ 173 | 174 | def __init__( 175 | self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" 176 | ): 177 | super().__init__() 178 | hidden_channels = in_channels // 2 179 | self.conv1 = nn.Sequential( 180 | nn.Conv1d( 181 | in_channels, 182 | hidden_channels, 183 | kernel_size=3, 184 | stride=1, 185 | padding=1, 186 | groups=1, 187 | bias=False, 188 | ), 189 | nn.BatchNorm1d(hidden_channels), 190 | get_activation(activation, inplace=True) 191 | ) 192 | self.m = nn.ModuleList( 193 | [ 194 | nn.MaxPool1d(kernel_size=ks, stride=1, padding=ks // 2) 195 | for ks in kernel_sizes 196 | ] 197 | ) 198 | conv2_channels = hidden_channels * (len(kernel_sizes) + 1) 199 | self.conv2 = nn.Sequential( 200 | nn.Conv1d( 201 | conv2_channels, 202 | out_channels, 203 | kernel_size=3, 204 | stride=1, 205 | padding=1, 206 | groups=1, 207 | bias=False, 208 | ), 209 | nn.BatchNorm1d(out_channels), 210 | get_activation(activation, inplace=True) 211 | ) 212 | 213 | def forward(self, x): 214 | x = self.conv1(x) 215 | x = torch.cat([x] + [m(x) for m in self.m], dim=1) 216 | x = self.conv2(x) 217 | return x -------------------------------------------------------------------------------- /model/utils/maskformer_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | 11 | from typing import List, Optional 12 | 13 | import torch 14 | import torch.nn as nn 15 | from torch import Tensor 16 | 17 | import math 18 | 19 | 
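# Transformer pieces for the MaskFormer head, mirroring the DETR-style design:
# TransformerEncoder/TransformerDecoder stacks (post-norm or pre-norm via `normalize_before`)
# plus a fixed sine/cosine positional embedding (PositionEmbeddingSine).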
import os 20 | import sys 21 | __dir__ = os.path.dirname(os.path.abspath(__file__)) 22 | sys.path.append(__dir__) 23 | sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) 24 | 25 | from model.utils.ops import clones, get_activation 26 | 27 | class MaskTransformer(nn.Module): 28 | def __init__( 29 | self, 30 | d_model=512, 31 | nhead=8, 32 | num_encoder_layers=6, 33 | num_decoder_layers=6, 34 | dim_feedforward=2048, 35 | dropout=0.1, 36 | activation="relu", 37 | normalize_before=False, 38 | return_intermediate_dec=False, 39 | ): 40 | super().__init__() 41 | 42 | encoder_layer = TransformerEncoderLayer( 43 | d_model, nhead, dim_feedforward, dropout, activation, normalize_before 44 | ) 45 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 46 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 47 | 48 | decoder_layer = TransformerDecoderLayer( 49 | d_model, nhead, dim_feedforward, dropout, activation, normalize_before 50 | ) 51 | decoder_norm = nn.LayerNorm(d_model) 52 | self.decoder = TransformerDecoder( 53 | decoder_layer, 54 | num_decoder_layers, 55 | decoder_norm, 56 | return_intermediate=return_intermediate_dec, 57 | ) 58 | 59 | self._reset_parameters() 60 | 61 | self.d_model = d_model 62 | self.nhead = nhead 63 | 64 | def _reset_parameters(self): 65 | for p in self.parameters(): 66 | if p.dim() > 1: 67 | nn.init.xavier_uniform_(p) 68 | 69 | def forward(self, src, mask, query_embed, pos_embed): 70 | bs, c, h, w = src.shape 71 | src = src.flatten(2).permute(2, 0, 1) 72 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1) 73 | query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) 74 | if mask is not None: 75 | mask = mask.flatten(1) 76 | 77 | tgt = torch.zeros_like(query_embed) 78 | memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) 79 | hs = self.decoder( 80 | tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed 81 | ) 82 | return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) 83 | 84 | 85 | class TransformerEncoder(nn.Module): 86 | def __init__(self, encoder_layer, num_layers, norm=None): 87 | super().__init__() 88 | self.layers = clones(encoder_layer, num_layers) 89 | self.num_layers = num_layers 90 | self.norm = norm 91 | 92 | def forward( 93 | self, 94 | src, 95 | mask: Optional[Tensor] = None, 96 | src_key_padding_mask: Optional[Tensor] = None, 97 | pos: Optional[Tensor] = None, 98 | ): 99 | output = src 100 | 101 | for layer in self.layers: 102 | output = layer( 103 | output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos 104 | ) 105 | 106 | if self.norm is not None: 107 | output = self.norm(output) 108 | 109 | return output 110 | 111 | 112 | class TransformerDecoder(nn.Module): 113 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): 114 | super().__init__() 115 | self.layers = clones(decoder_layer, num_layers) 116 | self.num_layers = num_layers 117 | self.norm = norm 118 | self.return_intermediate = return_intermediate 119 | 120 | def forward( 121 | self, 122 | tgt, 123 | memory, 124 | tgt_mask: Optional[Tensor] = None, 125 | memory_mask: Optional[Tensor] = None, 126 | tgt_key_padding_mask: Optional[Tensor] = None, 127 | memory_key_padding_mask: Optional[Tensor] = None, 128 | pos: Optional[Tensor] = None, 129 | query_pos: Optional[Tensor] = None, 130 | ): 131 | output = tgt 132 | 133 | intermediate = [] 134 | 135 | for layer in self.layers: 136 | output = layer( 137 | output, 138 | memory, 139 | 
tgt_mask=tgt_mask, 140 | memory_mask=memory_mask, 141 | tgt_key_padding_mask=tgt_key_padding_mask, 142 | memory_key_padding_mask=memory_key_padding_mask, 143 | pos=pos, 144 | query_pos=query_pos, 145 | ) 146 | if self.return_intermediate: 147 | intermediate.append(self.norm(output)) 148 | 149 | if self.norm is not None: 150 | output = self.norm(output) 151 | if self.return_intermediate: 152 | intermediate.pop() 153 | intermediate.append(output) 154 | 155 | if self.return_intermediate: 156 | return torch.stack(intermediate) 157 | 158 | return output.unsqueeze(0) 159 | 160 | 161 | class TransformerEncoderLayer(nn.Module): 162 | def __init__( 163 | self, 164 | d_model, 165 | nhead, 166 | dim_feedforward=2048, 167 | dropout=0.1, 168 | activation="relu", 169 | normalize_before=False, 170 | ): 171 | super().__init__() 172 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 173 | self.linear1 = nn.Linear(d_model, dim_feedforward) 174 | self.dropout = nn.Dropout(dropout) 175 | self.linear2 = nn.Linear(dim_feedforward, d_model) 176 | 177 | self.norm1 = nn.LayerNorm(d_model) 178 | self.norm2 = nn.LayerNorm(d_model) 179 | self.dropout1 = nn.Dropout(dropout) 180 | self.dropout2 = nn.Dropout(dropout) 181 | 182 | self.activation = get_activation(activation) 183 | self.normalize_before = normalize_before 184 | 185 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 186 | return tensor if pos is None else tensor + pos 187 | 188 | def forward_post( 189 | self, 190 | src, 191 | src_mask: Optional[Tensor] = None, 192 | src_key_padding_mask: Optional[Tensor] = None, 193 | pos: Optional[Tensor] = None, 194 | ): 195 | q = k = self.with_pos_embed(src, pos) 196 | src2 = self.self_attn( 197 | q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask 198 | )[0] 199 | src = src + self.dropout1(src2) 200 | src = self.norm1(src) 201 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 202 | src = src + self.dropout2(src2) 203 | src = self.norm2(src) 204 | return src 205 | 206 | def forward_pre( 207 | self, 208 | src, 209 | src_mask: Optional[Tensor] = None, 210 | src_key_padding_mask: Optional[Tensor] = None, 211 | pos: Optional[Tensor] = None, 212 | ): 213 | src2 = self.norm1(src) 214 | q = k = self.with_pos_embed(src2, pos) 215 | src2 = self.self_attn( 216 | q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask 217 | )[0] 218 | src = src + self.dropout1(src2) 219 | src2 = self.norm2(src) 220 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) 221 | src = src + self.dropout2(src2) 222 | return src 223 | 224 | def forward( 225 | self, 226 | src, 227 | src_mask: Optional[Tensor] = None, 228 | src_key_padding_mask: Optional[Tensor] = None, 229 | pos: Optional[Tensor] = None, 230 | ): 231 | if self.normalize_before: 232 | return self.forward_pre(src, src_mask, src_key_padding_mask, pos) 233 | return self.forward_post(src, src_mask, src_key_padding_mask, pos) 234 | 235 | 236 | class TransformerDecoderLayer(nn.Module): 237 | def __init__( 238 | self, 239 | d_model, 240 | nhead, 241 | dim_feedforward=2048, 242 | dropout=0.1, 243 | activation="relu", 244 | normalize_before=False, 245 | ): 246 | super().__init__() 247 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 248 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 249 | self.linear1 = nn.Linear(d_model, dim_feedforward) 250 | self.dropout = nn.Dropout(dropout) 251 | self.linear2 = nn.Linear(dim_feedforward, 
d_model) 252 | 253 | self.norm1 = nn.LayerNorm(d_model) 254 | self.norm2 = nn.LayerNorm(d_model) 255 | self.norm3 = nn.LayerNorm(d_model) 256 | self.dropout1 = nn.Dropout(dropout) 257 | self.dropout2 = nn.Dropout(dropout) 258 | self.dropout3 = nn.Dropout(dropout) 259 | 260 | self.activation = get_activation(activation) 261 | self.normalize_before = normalize_before 262 | 263 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 264 | return tensor if pos is None else tensor + pos 265 | 266 | def forward_post( 267 | self, 268 | tgt, 269 | memory, 270 | tgt_mask: Optional[Tensor] = None, 271 | memory_mask: Optional[Tensor] = None, 272 | tgt_key_padding_mask: Optional[Tensor] = None, 273 | memory_key_padding_mask: Optional[Tensor] = None, 274 | pos: Optional[Tensor] = None, 275 | query_pos: Optional[Tensor] = None, 276 | ): 277 | q = k = self.with_pos_embed(tgt, query_pos) 278 | tgt2 = self.self_attn( 279 | q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask 280 | )[0] 281 | tgt = tgt + self.dropout1(tgt2) 282 | tgt = self.norm1(tgt) 283 | tgt2 = self.multihead_attn( 284 | query=self.with_pos_embed(tgt, query_pos), 285 | key=self.with_pos_embed(memory, pos), 286 | value=memory, 287 | attn_mask=memory_mask, 288 | key_padding_mask=memory_key_padding_mask, 289 | )[0] 290 | tgt = tgt + self.dropout2(tgt2) 291 | tgt = self.norm2(tgt) 292 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 293 | tgt = tgt + self.dropout3(tgt2) 294 | tgt = self.norm3(tgt) 295 | return tgt 296 | 297 | def forward_pre( 298 | self, 299 | tgt, 300 | memory, 301 | tgt_mask: Optional[Tensor] = None, 302 | memory_mask: Optional[Tensor] = None, 303 | tgt_key_padding_mask: Optional[Tensor] = None, 304 | memory_key_padding_mask: Optional[Tensor] = None, 305 | pos: Optional[Tensor] = None, 306 | query_pos: Optional[Tensor] = None, 307 | ): 308 | tgt2 = self.norm1(tgt) 309 | q = k = self.with_pos_embed(tgt2, query_pos) 310 | tgt2 = self.self_attn( 311 | q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask 312 | )[0] 313 | tgt = tgt + self.dropout1(tgt2) 314 | tgt2 = self.norm2(tgt) 315 | tgt2 = self.multihead_attn( 316 | query=self.with_pos_embed(tgt2, query_pos), 317 | key=self.with_pos_embed(memory, pos), 318 | value=memory, 319 | attn_mask=memory_mask, 320 | key_padding_mask=memory_key_padding_mask, 321 | )[0] 322 | tgt = tgt + self.dropout2(tgt2) 323 | tgt2 = self.norm3(tgt) 324 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) 325 | tgt = tgt + self.dropout3(tgt2) 326 | return tgt 327 | 328 | def forward( 329 | self, 330 | tgt, 331 | memory, 332 | tgt_mask: Optional[Tensor] = None, 333 | memory_mask: Optional[Tensor] = None, 334 | tgt_key_padding_mask: Optional[Tensor] = None, 335 | memory_key_padding_mask: Optional[Tensor] = None, 336 | pos: Optional[Tensor] = None, 337 | query_pos: Optional[Tensor] = None, 338 | ): 339 | if self.normalize_before: 340 | return self.forward_pre( 341 | tgt, 342 | memory, 343 | tgt_mask, 344 | memory_mask, 345 | tgt_key_padding_mask, 346 | memory_key_padding_mask, 347 | pos, 348 | query_pos, 349 | ) 350 | return self.forward_post( 351 | tgt, 352 | memory, 353 | tgt_mask, 354 | memory_mask, 355 | tgt_key_padding_mask, 356 | memory_key_padding_mask, 357 | pos, 358 | query_pos, 359 | ) 360 | 361 | 362 | class PositionEmbeddingSine(nn.Module): 363 | """ 364 | This is a more standard version of the position embedding, very similar to the one 365 | used by the Attention is all you need paper, generalized 
to work on images. 366 | """ 367 | 368 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 369 | super().__init__() 370 | self.num_pos_feats = num_pos_feats 371 | self.temperature = temperature 372 | self.normalize = normalize 373 | if scale is not None and normalize is False: 374 | raise ValueError("normalize should be True if scale is passed") 375 | if scale is None: 376 | scale = 2 * math.pi 377 | self.scale = scale 378 | 379 | def forward(self, x, mask=None): 380 | if mask is None: 381 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 382 | not_mask = ~mask 383 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 384 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 385 | if self.normalize: 386 | eps = 1e-6 387 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 388 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 389 | 390 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 391 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 392 | 393 | pos_x = x_embed[:, :, :, None] / dim_t 394 | pos_y = y_embed[:, :, :, None] / dim_t 395 | pos_x = torch.stack( 396 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 397 | ).flatten(3) 398 | pos_y = torch.stack( 399 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 400 | ).flatten(3) 401 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 402 | return pos -------------------------------------------------------------------------------- /model/utils/mobilenetv3_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn import init 5 | from torchsummary import summary 6 | import os 7 | import sys 8 | 9 | __dir__ = os.path.dirname(os.path.abspath(__file__)) 10 | sys.path.append(__dir__) 11 | sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) 12 | 13 | from model.utils.ops import hsigmoid, hswish 14 | 15 | class SeModule(nn.Module): 16 | def __init__(self, in_size, reduction=4): 17 | super(SeModule, self).__init__() 18 | self.se = nn.Sequential( 19 | nn.AdaptiveAvgPool2d(1), 20 | nn.Conv2d(in_size, in_size // reduction, kernel_size=1, stride=1, padding=0, bias=False), 21 | nn.BatchNorm2d(in_size // reduction), 22 | nn.ReLU(inplace=True), 23 | nn.Conv2d(in_size // reduction, in_size, kernel_size=1, stride=1, padding=0, bias=False), 24 | nn.BatchNorm2d(in_size), 25 | hsigmoid() 26 | ) 27 | 28 | def forward(self, x): 29 | return x * self.se(x) 30 | 31 | 32 | class MobilenetBlock(nn.Module): 33 | '''expand + depthwise + pointwise''' 34 | def __init__(self, kernel_size, in_size, expand_size, out_size, nolinear, semodule, stride): 35 | super(MobilenetBlock, self).__init__() 36 | self.stride = stride 37 | self.se = semodule 38 | 39 | self.conv1 = nn.Conv2d(in_size, expand_size, kernel_size=1, stride=1, padding=0, bias=False) 40 | self.bn1 = nn.BatchNorm2d(expand_size) 41 | self.nolinear1 = nolinear 42 | self.conv2 = nn.Conv2d(expand_size, expand_size, kernel_size=kernel_size, stride=stride, padding=kernel_size//2, groups=expand_size, bias=False) 43 | self.bn2 = nn.BatchNorm2d(expand_size) 44 | self.nolinear2 = nolinear 45 | self.conv3 = nn.Conv2d(expand_size, out_size, kernel_size=1, stride=1, padding=0, bias=False) 46 | self.bn3 = nn.BatchNorm2d(out_size) 47 | 48 | self.shortcut = nn.Sequential() 49 | if stride == 1 and in_size != 
out_size: 50 | self.shortcut = nn.Sequential( 51 | nn.Conv2d(in_size, out_size, kernel_size=1, stride=1, padding=0, bias=False), 52 | nn.BatchNorm2d(out_size), 53 | ) 54 | 55 | def forward(self, x): 56 | out = self.nolinear1(self.bn1(self.conv1(x))) 57 | out = self.nolinear2(self.bn2(self.conv2(out))) 58 | out = self.bn3(self.conv3(out)) 59 | if self.se != None: 60 | out = self.se(out) 61 | out = out + self.shortcut(x) if self.stride==1 else out 62 | return out 63 | 64 | class MobileNetV3_Large_(nn.Module): 65 | def __init__(self, num_classes=1000): 66 | super(MobileNetV3_Large_, self).__init__() 67 | self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1, bias=False) 68 | self.bn1 = nn.BatchNorm2d(16) 69 | self.hs1 = hswish() 70 | 71 | self.bneck = nn.ModuleList([]) 72 | self.bneck.append(MobilenetBlock(3, 16, 16, 16, nn.ReLU(inplace=True), None, 1)) 73 | self.bneck.append(MobilenetBlock(3, 16, 64, 24, nn.ReLU(inplace=True), None, 2)) 74 | self.bneck.append(MobilenetBlock(3, 24, 72, 24, nn.ReLU(inplace=True), None, 1)) 75 | self.bneck.append(MobilenetBlock(5, 24, 72, 40, nn.ReLU(inplace=True), SeModule(40), 2)) 76 | self.bneck.append(MobilenetBlock(5, 40, 120, 40, nn.ReLU(inplace=True), SeModule(40), 1)) 77 | self.bneck.append(MobilenetBlock(5, 40, 120, 40, nn.ReLU(inplace=True), SeModule(40), 1)) 78 | self.bneck.append(MobilenetBlock(3, 40, 240, 80, hswish(), None, 2)) 79 | self.bneck.append(MobilenetBlock(3, 80, 200, 80, hswish(), None, 1)) 80 | self.bneck.append(MobilenetBlock(3, 80, 184, 80, hswish(), None, 1)) 81 | self.bneck.append(MobilenetBlock(3, 80, 184, 80, hswish(), None, 1)) 82 | self.bneck.append(MobilenetBlock(3, 80, 480, 112, hswish(), SeModule(112), 1)) 83 | self.bneck.append(MobilenetBlock(3, 112, 672, 112, hswish(), SeModule(112), 1)) 84 | self.bneck.append(MobilenetBlock(5, 112, 672, 160, hswish(), SeModule(160), 1)) 85 | self.bneck.append(MobilenetBlock(5, 160, 672, 160, hswish(), SeModule(160), 2)) 86 | self.bneck.append(MobilenetBlock(5, 160, 960, 160, hswish(), SeModule(160), 1)) 87 | 88 | self.conv2 = nn.Conv2d(160, 960, kernel_size=1, stride=1, padding=0, bias=False) 89 | self.bn2 = nn.BatchNorm2d(960) 90 | self.hs2 = hswish() 91 | self.linear3 = nn.Linear(960, 1280) 92 | self.bn3 = nn.BatchNorm1d(1280) 93 | self.hs3 = hswish() 94 | self.linear4 = nn.Linear(1280, num_classes) 95 | self.init_params() 96 | 97 | def init_params(self): 98 | for m in self.modules(): 99 | if isinstance(m, nn.Conv2d): 100 | init.kaiming_normal_(m.weight, mode='fan_out') 101 | if m.bias is not None: 102 | init.constant_(m.bias, 0) 103 | elif isinstance(m, nn.BatchNorm2d): 104 | init.constant_(m.weight, 1) 105 | init.constant_(m.bias, 0) 106 | elif isinstance(m, nn.Linear): 107 | init.normal_(m.weight, std=0.001) 108 | if m.bias is not None: 109 | init.constant_(m.bias, 0) 110 | 111 | def forward(self, x): 112 | out = self.hs1(self.bn1(self.conv1(x))) 113 | for layer in self.bneck: 114 | out = layer(out) 115 | print(out.size()) 116 | out = self.hs2(self.bn2(self.conv2(out))) 117 | out = F.avg_pool2d(out, 7) 118 | out = out.view(out.size(0), -1) 119 | out = self.hs3(self.bn3(self.linear3(out))) 120 | out = self.linear4(out) 121 | return out 122 | 123 | class MobileNetV3_Small_(nn.Module): 124 | def __init__(self, num_classes=1000): 125 | super(MobileNetV3_Small_, self).__init__() 126 | self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1, bias=False) 127 | self.bn1 = nn.BatchNorm2d(16) 128 | self.hs1 = hswish() 129 | 130 | self.bneck = nn.ModuleList([]) 131 | 
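        # Each entry below is MobilenetBlock(kernel_size, in_size, expand_size, out_size,
        # nonlinearity, SE module or None, stride), i.e. the bneck rows of the
        # MobileNetV3-Small configuration.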
self.bneck.append(MobilenetBlock(3, 16, 16, 16, nn.ReLU(inplace=True), SeModule(16), 2)) 132 | self.bneck.append(MobilenetBlock(3, 16, 72, 24, nn.ReLU(inplace=True), None, 2)) 133 | self.bneck.append(MobilenetBlock(3, 24, 88, 24, nn.ReLU(inplace=True), None, 1)) 134 | self.bneck.append(MobilenetBlock(5, 24, 96, 40, hswish(), SeModule(40), 2)) 135 | self.bneck.append(MobilenetBlock(5, 40, 240, 40, hswish(), SeModule(40), 1)) 136 | self.bneck.append(MobilenetBlock(5, 40, 240, 40, hswish(), SeModule(40), 1)) 137 | self.bneck.append(MobilenetBlock(5, 40, 120, 48, hswish(), SeModule(48), 1)) 138 | self.bneck.append(MobilenetBlock(5, 48, 144, 48, hswish(), SeModule(48), 1)) 139 | self.bneck.append(MobilenetBlock(5, 48, 288, 96, hswish(), SeModule(96), 2)) 140 | self.bneck.append(MobilenetBlock(5, 96, 576, 96, hswish(), SeModule(96), 1)) 141 | self.bneck.append(MobilenetBlock(5, 96, 576, 96, hswish(), SeModule(96), 1)) 142 | 143 | self.conv2 = nn.Conv2d(96, 576, kernel_size=1, stride=1, padding=0, bias=False) 144 | self.bn2 = nn.BatchNorm2d(576) 145 | self.hs2 = hswish() 146 | self.linear3 = nn.Linear(576, 1280) 147 | self.bn3 = nn.BatchNorm1d(1280) 148 | self.hs3 = hswish() 149 | self.linear4 = nn.Linear(1280, num_classes) 150 | self.init_params() 151 | 152 | def init_params(self): 153 | for m in self.modules(): 154 | if isinstance(m, nn.Conv2d): 155 | init.kaiming_normal_(m.weight, mode='fan_out') 156 | if m.bias is not None: 157 | init.constant_(m.bias, 0) 158 | elif isinstance(m, nn.BatchNorm2d): 159 | init.constant_(m.weight, 1) 160 | init.constant_(m.bias, 0) 161 | elif isinstance(m, nn.Linear): 162 | init.normal_(m.weight, std=0.001) 163 | if m.bias is not None: 164 | init.constant_(m.bias, 0) 165 | 166 | def forward(self, x): 167 | out = self.hs1(self.bn1(self.conv1(x))) 168 | for layer in self.bneck: 169 | out = layer(out) 170 | print(out.size()) 171 | out = self.hs2(self.bn2(self.conv2(out))) 172 | out = F.avg_pool2d(out, 7) 173 | out = out.view(out.size(0), -1) 174 | out = self.hs3(self.bn3(self.linear3(out))) 175 | out = self.linear4(out) 176 | return out 177 | 178 | 179 | if __name__ == '__main__': 180 | 181 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 182 | 183 | mobilenetv3_l = MobileNetV3_Large_().to(device) 184 | summary(mobilenetv3_l, (3, 256, 256)) 185 | 186 | mobilenetv3_s = MobileNetV3_Small_().to(device) 187 | summary(mobilenetv3_s, (3, 256, 256)) 188 | 189 | -------------------------------------------------------------------------------- /model/utils/mobilevit_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | from einops import rearrange 6 | import os 7 | import sys 8 | 9 | __dir__ = os.path.dirname(os.path.abspath(__file__)) 10 | sys.path.append(__dir__) 11 | sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) 12 | 13 | from model.utils.ops import CBA 14 | 15 | class PreNorm(nn.Module): 16 | def __init__(self, dim, fn): 17 | super().__init__() 18 | self.norm = nn.LayerNorm(dim) 19 | self.fn = fn 20 | 21 | def forward(self, x, **kwargs): 22 | return self.fn(self.norm(x), **kwargs) 23 | 24 | 25 | class FeedForward(nn.Module): 26 | def __init__(self, dim, hidden_dim, dropout=0.): 27 | super().__init__() 28 | self.net = nn.Sequential( 29 | nn.Linear(dim, hidden_dim), 30 | nn.SiLU(), 31 | nn.Dropout(dropout), 32 | nn.Linear(hidden_dim, dim), 33 | nn.Dropout(dropout) 34 | ) 35 | 36 | def forward(self, x): 37 | 
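        # Position-wise MLP: Linear -> SiLU -> Dropout -> Linear -> Dropout, applied over the last dim.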
return self.net(x) 38 | 39 | 40 | class Attention(nn.Module): 41 | def __init__(self, dim, heads=8, dim_head=64, dropout=0.): 42 | super().__init__() 43 | inner_dim = dim_head * heads 44 | project_out = not (heads == 1 and dim_head == dim) 45 | 46 | self.heads = heads 47 | self.scale = dim_head ** -0.5 48 | 49 | self.attend = nn.Softmax(dim = -1) 50 | self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False) 51 | 52 | self.to_out = nn.Sequential( 53 | nn.Linear(inner_dim, dim), 54 | nn.Dropout(dropout) 55 | ) if project_out else nn.Identity() 56 | 57 | def forward(self, x): 58 | qkv = self.to_qkv(x).chunk(3, dim=-1) 59 | q, k, v = map(lambda t: rearrange(t, 'b p n (h d) -> b p h n d', h = self.heads), qkv) 60 | 61 | dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale 62 | attn = self.attend(dots) 63 | out = torch.matmul(attn, v) 64 | out = rearrange(out, 'b p h n d -> b p n (h d)') 65 | return self.to_out(out) 66 | 67 | 68 | class Transformer(nn.Module): 69 | def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.): 70 | super().__init__() 71 | self.layers = nn.ModuleList([]) 72 | for _ in range(depth): 73 | self.layers.append(nn.ModuleList([ 74 | PreNorm(dim, Attention(dim, heads, dim_head, dropout)), 75 | PreNorm(dim, FeedForward(dim, mlp_dim, dropout)) 76 | ])) 77 | 78 | def forward(self, x): 79 | for attn, ff in self.layers: 80 | x = attn(x) + x 81 | x = ff(x) + x 82 | return x 83 | 84 | 85 | class MV2Block(nn.Module): 86 | def __init__(self, inp, oup, stride=1, expansion=4): 87 | super().__init__() 88 | self.stride = stride 89 | assert stride in [1, 2] 90 | 91 | hidden_dim = int(inp * expansion) 92 | self.use_res_connect = self.stride == 1 and inp == oup 93 | 94 | if expansion == 1: 95 | self.conv = nn.Sequential( 96 | nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 97 | nn.BatchNorm2d(hidden_dim), 98 | nn.SiLU(), 99 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 100 | nn.BatchNorm2d(oup), 101 | ) 102 | else: 103 | self.conv = nn.Sequential( 104 | nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), 105 | nn.BatchNorm2d(hidden_dim), 106 | nn.SiLU(), 107 | nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 108 | nn.BatchNorm2d(hidden_dim), 109 | nn.SiLU(), 110 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 111 | nn.BatchNorm2d(oup), 112 | ) 113 | 114 | def forward(self, x): 115 | if self.use_res_connect: 116 | return x + self.conv(x) 117 | else: 118 | return self.conv(x) 119 | 120 | 121 | class MobileViTBlock(nn.Module): 122 | def __init__(self, dim, depth, channel, kernel_size, patch_size, mlp_dim, dropout=0.): 123 | super().__init__() 124 | self.ph, self.pw = patch_size 125 | 126 | self.conv1 = CBA(channel, channel, ksize=kernel_size, stride=1, pad=1, bias=False, act="silu") 127 | self.conv2 = CBA(channel, dim, ksize=1, stride=1, pad=0, bias=False, act="silu") 128 | 129 | self.transformer = Transformer(dim, depth, 4, 8, mlp_dim, dropout) 130 | 131 | self.conv3 = CBA(dim, channel, ksize=1, stride=1, pad=0, bias=False, act="silu") 132 | self.conv4 = CBA(2 * channel, channel, ksize=kernel_size, stride=1, pad=1, bias=False, act="silu") 133 | 134 | def forward(self, x): 135 | y = x.clone() 136 | 137 | x = self.conv1(x) 138 | x = self.conv2(x) 139 | 140 | _, _, h, w = x.shape 141 | x = rearrange(x, 'b d (h ph) (w pw) -> b (ph pw) (h w) d', ph=self.ph, pw=self.pw) 142 | x = self.transformer(x) 143 | x = rearrange(x, 'b (ph pw) (h w) d -> b d (h ph) (w pw)', h=h//self.ph, w=w//self.pw, ph=self.ph, pw=self.pw) 
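        # Patch tokens are folded back to a (b, dim, h, w) feature map; conv3 projects dim -> channel,
        # then conv4 fuses the global (transformer) branch with the local input y after concatenation.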
144 | 145 | x = self.conv3(x) 146 | x = torch.cat((x, y), 1) 147 | x = self.conv4(x) 148 | return x 149 | 150 | 151 | class MobileViT_(nn.Module): 152 | def __init__(self, image_size, dims, channels, num_classes, expansion=4, kernel_size=3, patch_size=(2, 2)): 153 | super().__init__() 154 | ih, iw = image_size 155 | ph, pw = patch_size 156 | assert ih % ph == 0 and iw % pw == 0 157 | 158 | L = [2, 4, 3] 159 | 160 | self.conv1 = CBA(3, channels[0], ksize=3, stride=2, pad=1, bias=False, act="silu") 161 | 162 | self.mv2 = nn.ModuleList([]) 163 | self.mv2.append(MV2Block(channels[0], channels[1], 1, expansion)) 164 | self.mv2.append(MV2Block(channels[1], channels[2], 2, expansion)) 165 | self.mv2.append(MV2Block(channels[2], channels[3], 1, expansion)) 166 | self.mv2.append(MV2Block(channels[2], channels[3], 1, expansion)) 167 | self.mv2.append(MV2Block(channels[3], channels[4], 2, expansion)) 168 | self.mv2.append(MV2Block(channels[5], channels[6], 2, expansion)) 169 | self.mv2.append(MV2Block(channels[7], channels[8], 2, expansion)) 170 | 171 | self.mvit = nn.ModuleList([]) 172 | self.mvit.append(MobileViTBlock(dims[0], L[0], channels[5], kernel_size, patch_size, int(dims[0]*2))) 173 | self.mvit.append(MobileViTBlock(dims[1], L[1], channels[7], kernel_size, patch_size, int(dims[1]*4))) 174 | self.mvit.append(MobileViTBlock(dims[2], L[2], channels[9], kernel_size, patch_size, int(dims[2]*4))) 175 | 176 | self.conv2 = CBA(channels[-2], channels[-1], ksize=1, stride=1, pad=0, bias=False, act="silu") 177 | 178 | self.pool = nn.AvgPool2d(ih//32, 1) 179 | self.fc = nn.Linear(channels[-1], num_classes, bias=False) 180 | 181 | def forward(self, x): 182 | x = self.conv1(x) 183 | x = self.mv2[0](x) 184 | print(x.size()) 185 | 186 | x = self.mv2[1](x) 187 | x = self.mv2[2](x) 188 | x = self.mv2[3](x) 189 | print(x.size()) 190 | 191 | x = self.mv2[4](x) 192 | x = self.mvit[0](x) 193 | print(x.size()) 194 | 195 | x = self.mv2[5](x) 196 | x = self.mvit[1](x) 197 | print(x.size()) 198 | 199 | x = self.mv2[6](x) 200 | x = self.mvit[2](x) 201 | x = self.conv2(x) 202 | print(x.size()) 203 | 204 | x = self.pool(x).view(-1, x.shape[1]) 205 | x = self.fc(x) 206 | return x 207 | 208 | 209 | def mobilevit_xxs(): 210 | dims = [64, 80, 96] 211 | channels = [16, 16, 24, 24, 48, 48, 64, 64, 80, 80, 320] 212 | return MobileViT_((256, 256), dims, channels, num_classes=1000, expansion=2) 213 | 214 | 215 | def mobilevit_xs(): 216 | dims = [96, 120, 144] 217 | channels = [16, 32, 48, 48, 64, 64, 80, 80, 96, 96, 384] 218 | return MobileViT_((256, 256), dims, channels, num_classes=1000) 219 | 220 | 221 | def mobilevit_s(): 222 | dims = [144, 192, 240] 223 | channels = [16, 32, 64, 64, 96, 96, 128, 128, 160, 160, 640] 224 | return MobileViT_((256, 256), dims, channels, num_classes=1000) 225 | 226 | 227 | def count_parameters(model): 228 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 229 | 230 | 231 | if __name__ == '__main__': 232 | img = torch.randn(5, 3, 256, 256) 233 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 234 | img = img.to(device) 235 | 236 | vit = mobilevit_xxs() 237 | vit = vit.to(device) 238 | summary(vit, (3, 256, 256)) 239 | print(count_parameters(vit)) 240 | 241 | vit = mobilevit_xs() 242 | vit = vit.to(device) 243 | summary(vit, (3, 256, 256)) 244 | print(count_parameters(vit)) 245 | 246 | vit = mobilevit_s() 247 | vit = vit.to(device) 248 | summary(vit, (3, 256, 256)) 249 | print(count_parameters(vit)) 
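    # Minimal shape check for a single MobileViTBlock (illustrative only; dim=64,
    # channel=32 and 2x2 patches are assumed values, not taken from any config above).
    block = MobileViTBlock(dim=64, depth=2, channel=32, kernel_size=3, patch_size=(2, 2), mlp_dim=128).to(device)
    feat = torch.randn(2, 32, 64, 64, device=device)
    print(block(feat).shape)  # torch.Size([2, 32, 64, 64]): channels and spatial size are preserved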
-------------------------------------------------------------------------------- /model/utils/ops.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from torch.nn.utils import clip_grad 15 | 16 | import math 17 | from fractions import gcd 18 | 19 | import copy 20 | 21 | CONV_SELECT = {'conv1d': nn.Conv1d, 'conv2d': nn.Conv2d, 'conv3d': nn.Conv3d} 22 | BN_SELECT = {'conv1d': nn.BatchNorm1d, 'conv2d': nn.BatchNorm2d, 'LN': nn.LayerNorm} 23 | 24 | def clones(_to_clone_module, _clone_times): 25 | """Produce N identical layers.""" 26 | return nn.ModuleList([copy.deepcopy(_to_clone_module) for _ in range(_clone_times)]) 27 | 28 | def clip_grads(params, clip_norm_val=35): 29 | params = list( 30 | filter(lambda p: p.requires_grad and p.grad is not None, params)) 31 | if len(params) > 0: 32 | return clip_grad.clip_grad_norm_(params, max_norm=clip_norm_val, norm_type=2) 33 | 34 | def drop_path(x, drop_prob: float = 0., training: bool = False): 35 | if drop_prob == 0. or not training: 36 | return x 37 | keep_prob = 1 - drop_prob 38 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) 39 | random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) 40 | random_tensor.floor_() 41 | output = x.div(keep_prob) * random_tensor 42 | return output 43 | 44 | def get_activation(name="silu", inplace=True): 45 | if name == "silu": 46 | module = nn.SiLU(inplace=inplace) 47 | elif name == "relu": 48 | module = nn.ReLU(inplace=inplace) 49 | elif name == "lrelu": 50 | module = nn.LeakyReLU(0.1, inplace=inplace) 51 | elif name is None: 52 | module = None 53 | else: 54 | raise AttributeError("Unsupported act type: {}".format(name)) 55 | return module 56 | 57 | class hswish(nn.Module): 58 | def forward(self, x): 59 | out = x * F.relu6(x + 3, inplace=True) / 6 60 | return out 61 | 62 | class hsigmoid(nn.Module): 63 | def forward(self, x): 64 | out = F.relu6(x + 3, inplace=True) / 6 65 | return out 66 | 67 | class DropPath(nn.Module): 68 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
69 | """ 70 | def __init__(self, drop_prob=None): 71 | super(DropPath, self).__init__() 72 | self.drop_prob = drop_prob 73 | 74 | def forward(self, x): 75 | return drop_path(x, self.drop_prob, self.training) 76 | 77 | class SiLU(nn.Module): 78 | """export-friendly version of nn.SiLU()""" 79 | @staticmethod 80 | def forward(x): 81 | return x * torch.sigmoid(x) 82 | 83 | """ 84 | class CBA(nn.Module): 85 | # A Conv2d -> Batchnorm -> silu/leaky relu block 86 | 87 | def __init__( 88 | self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu", use_bn=True, pad=None, norm='BN'): 89 | super().__init__() 90 | if pad is None: 91 | # same padding 92 | pad = (ksize - 1) // 2 93 | self.conv = nn.Conv2d( 94 | in_channels, 95 | out_channels, 96 | kernel_size=ksize, 97 | stride=stride, 98 | padding=pad, 99 | groups=groups, 100 | bias=bias, 101 | ) 102 | 103 | if norm == 'GN': 104 | self.bn = nn.GroupNorm(32, out_channels) 105 | else: 106 | self.bn = nn.BatchNorm2d(out_channels) 107 | 108 | self.act = get_activation(act, inplace=True) 109 | self.use_bn = use_bn 110 | 111 | def forward(self, x): 112 | if self.use_bn: 113 | return self.act(self.bn(self.conv(x))) 114 | else: 115 | return self.act((self.conv(x))) 116 | """ 117 | 118 | class CBA(nn.Module): 119 | def __init__( 120 | self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu", use_bn=True, pad=None, norm='BN', group_num=None, conv='conv2d', res_type=False): 121 | super().__init__() 122 | self.res_type = res_type 123 | 124 | if pad is None: 125 | pad = (ksize - 1) // 2 126 | self.conv = CONV_SELECT[conv]( 127 | in_channels, 128 | out_channels, 129 | kernel_size=ksize, 130 | stride=stride, 131 | padding=pad, 132 | groups=groups, 133 | bias=bias, 134 | ) 135 | 136 | if norm == 'GN': 137 | if group_num is None: 138 | self.bn = nn.GroupNorm(gcd(32, out_channels), out_channels) 139 | else: 140 | self.bn = nn.GroupNorm(gcd(group_num, out_channels), out_channels) 141 | else: 142 | self.bn = BN_SELECT[conv](out_channels) 143 | 144 | self.act = get_activation(act, inplace=True) 145 | if not use_bn: 146 | self.bn = None 147 | 148 | if self.res_type: 149 | assert norm is not None 150 | self.conv2 = CONV_SELECT[conv]( 151 | out_channels, 152 | out_channels, 153 | kernel_size=3, 154 | stride=1, 155 | padding=1, 156 | groups=1, 157 | bias=False, 158 | ) 159 | if norm == 'GN': 160 | self.bn2 = nn.GroupNorm(gcd(32, out_channels), out_channels) if group_num is None else nn.GroupNorm(gcd(group_num, out_channels), out_channels) 161 | else: 162 | self.bn2 = BN_SELECT[conv](out_channels) 163 | 164 | if in_channels != out_channels or stride != 1: 165 | if norm == 'GN': 166 | self.transform = nn.Sequential( 167 | CONV_SELECT[conv](in_channels, out_channels, kernel_size=3 if stride!=1 else 1, stride=stride, padding=1 if stride!=1 else 0, groups=1, bias=False), 168 | nn.GroupNorm(gcd(32, out_channels), out_channels) if group_num is None else nn.GroupNorm(gcd(group_num, out_channels), out_channels)) 169 | elif norm == 'BN': 170 | self.transform = nn.Sequential( 171 | CONV_SELECT[conv](in_channels, out_channels, kernel_size=3 if stride!=1 else 1, stride=stride, padding=1 if stride!=1 else 0, groups=1, bias=False), 172 | BN_SELECT[conv](out_channels)) 173 | else: 174 | raise NotImplementedError('Type {} not supported.'.format(norm)) 175 | else: 176 | self.transform = None 177 | 178 | def forward(self, x): 179 | 180 | out = self.conv(x) 181 | if self.bn is not None: 182 | out = self.bn(out) 183 | if self.act is not None: 184 | out 
= self.act(out) 185 | 186 | if self.res_type: 187 | out = self.conv2(out) 188 | out = self.bn2(out) 189 | if self.transform is not None: 190 | out += self.transform(x) 191 | else: 192 | out += x 193 | if self.act is not None: 194 | out = self.act(out) 195 | 196 | return out 197 | 198 | class MLP(nn.Module): 199 | """A Linear -> norm -> activation block""" 200 | 201 | def __init__( 202 | self, num_in, num_out=None, bias=True, act="relu", norm='GN', group_num=None, res_type=False): 203 | super().__init__() 204 | if num_out is None: 205 | num_out = num_in 206 | 207 | self.linear = nn.Linear(num_in, num_out, bias=bias) 208 | self.res_type = res_type 209 | 210 | if norm is not None: 211 | if norm == 'GN': 212 | self.norm = nn.GroupNorm(gcd(32, num_out), num_out) if group_num is None else nn.GroupNorm(gcd(group_num, num_out), num_out) 213 | elif norm == 'LN': 214 | self.norm = nn.LayerNorm(num_out) 215 | elif norm == 'BN': 216 | self.norm = nn.BatchNorm1d(num_out) 217 | else: 218 | raise NotImplementedError('Type {} not supported.'.format(norm)) 219 | else: 220 | self.norm = None 221 | 222 | if act is not None: 223 | self.act = get_activation(act, inplace=True) 224 | else: 225 | self.act = None 226 | 227 | if self.res_type: 228 | assert norm is not None 229 | self.linear2 = nn.Linear(num_out, num_out, bias=bias) 230 | if norm == 'GN': 231 | self.norm2 = nn.GroupNorm(gcd(32, num_out), num_out) if group_num is None else nn.GroupNorm(gcd(group_num, num_out), num_out) 232 | elif norm == 'LN': 233 | self.norm2 = nn.LayerNorm(num_out) 234 | elif norm == 'BN': 235 | self.norm2 = nn.BatchNorm1d(num_out) 236 | else: 237 | raise NotImplementedError('Type {} not supported.'.format(norm)) 238 | 239 | if num_in != num_out: 240 | if norm == 'GN': 241 | self.transform = nn.Sequential( 242 | nn.Linear(num_in, num_out, bias=bias), 243 | nn.GroupNorm(gcd(32, num_out), num_out) if group_num is None else nn.GroupNorm(gcd(group_num, num_out), num_out)) 244 | elif norm == 'LN': 245 | self.transform = nn.Sequential( 246 | nn.Linear(num_in, num_out, bias=bias), 247 | nn.LayerNorm(num_out)) 248 | elif norm == 'BN': 249 | self.transform = nn.Sequential( 250 | nn.Linear(num_in, num_out, bias=bias), 251 | nn.BatchNorm1d(num_out)) 252 | else: 253 | raise NotImplementedError('Type {} not supported.'.format(norm)) 254 | else: 255 | self.transform = None 256 | 257 | def forward(self, x): 258 | out = self.linear(x) 259 | if self.norm is not None: 260 | out = self.norm(out) 261 | if self.act is not None: 262 | out = self.act(out) 263 | 264 | if self.res_type: 265 | out = self.linear2(out) 266 | out = self.norm2(out) 267 | if self.transform is not None: 268 | out += self.transform(x) 269 | else: 270 | out += x 271 | if self.act is not None: 272 | out = self.act(out) 273 | 274 | return out 275 | 276 | class SeparableConv(nn.Module): 277 | def __init__(self, in_channels, out_channels=None, act="silu", use_bn=True, norm='BN'): 278 | super(SeparableConv, self).__init__() 279 | self.use_bn = use_bn 280 | if out_channels is None: 281 | out_channels = in_channels 282 | 283 | 284 | self.depthwise_conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding='same', groups=in_channels, bias=False) 285 | self.pointwise_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding='same', bias=True) 286 | 287 | if norm == 'GN': 288 | self.bn = nn.GroupNorm(32, out_channels) 289 | else: 290 | self.bn = nn.BatchNorm2d(num_features=out_channels, momentum=0.01, eps=1e-3) 291 | 292 | if act is not None: 293 | 
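            # Optional activation; when `act` is None the block reduces to
            # depthwise 3x3 -> pointwise 1x1 -> norm only.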
self.act = get_activation(act, inplace=True) 294 | else: 295 | self.act = None 296 | 297 | def forward(self, x): 298 | x = self.depthwise_conv(x) 299 | x = self.pointwise_conv(x) 300 | 301 | if self.use_bn: 302 | x = self.bn(x) 303 | 304 | if self.act is not None: 305 | x = self.act(x) 306 | 307 | return x 308 | 309 | class MaxPool2dStaticSamePadding(nn.Module): 310 | """ 311 | created by Zylo117 312 | The real keras/tensorflow MaxPool2d with same padding 313 | """ 314 | 315 | def __init__(self, *args, **kwargs): 316 | super().__init__() 317 | self.pool = nn.MaxPool2d(*args, **kwargs) 318 | self.stride = self.pool.stride 319 | self.kernel_size = self.pool.kernel_size 320 | 321 | if isinstance(self.stride, int): 322 | self.stride = [self.stride] * 2 323 | elif len(self.stride) == 1: 324 | self.stride = [self.stride[0]] * 2 325 | 326 | if isinstance(self.kernel_size, int): 327 | self.kernel_size = [self.kernel_size] * 2 328 | elif len(self.kernel_size) == 1: 329 | self.kernel_size = [self.kernel_size[0]] * 2 330 | 331 | def forward(self, x): 332 | h, w = x.shape[-2:] 333 | 334 | extra_h = (math.ceil(w / self.stride[1]) - 1) * self.stride[1] - w + self.kernel_size[1] 335 | extra_v = (math.ceil(h / self.stride[0]) - 1) * self.stride[0] - h + self.kernel_size[0] 336 | 337 | left = extra_h // 2 338 | right = extra_h - left 339 | top = extra_v // 2 340 | bottom = extra_v - top 341 | 342 | x = F.pad(x, [left, right, top, bottom]) 343 | 344 | x = self.pool(x) 345 | return x 346 | 347 | class FFN(nn.Module): 348 | """Very simple multi-layer perceptron (also called FFN)""" 349 | 350 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 351 | super().__init__() 352 | self.num_layers = num_layers 353 | h = [hidden_dim] * (num_layers - 1) 354 | self.layers = nn.ModuleList( 355 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 356 | ) 357 | 358 | def forward(self, x): 359 | for i, layer in enumerate(self.layers): 360 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 361 | return x 362 | 363 | class BidirectionalLSTM(nn.Module): 364 | def __init__(self, nIn, nHidden, nOut): 365 | super(BidirectionalLSTM, self).__init__() 366 | 367 | self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True) 368 | self.FC = nn.Linear(nHidden * 2, nOut) 369 | 370 | def forward(self, input): 371 | recurrent, _ = self.rnn(input) 372 | T, b, h = recurrent.size() 373 | t_rec = recurrent.view(T * b, h) 374 | 375 | output = self.FC(t_rec) 376 | output = output.view(T, b, -1) 377 | 378 | return output -------------------------------------------------------------------------------- /model/utils/rebise_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import math 15 | 16 | import os 17 | import sys 18 | 19 | __dir__ = os.path.dirname(os.path.abspath(__file__)) 20 | sys.path.append(__dir__) 21 | sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) 22 | 23 | from model.utils.ops import CBA 24 | from tools.nninit import common_init 25 | 26 | class CatBottleneck(nn.Module): 27 | def __init__(self, in_planes, out_planes, block_num=3, stride=1): 28 | super(CatBottleneck, self).__init__() 29 | self.conv_list = nn.ModuleList() 30 | self.stride = stride 31 | if stride == 2: 32 | self.avd_layer = nn.Sequential( 33 | 
nn.Conv2d(out_planes//2, out_planes//2, kernel_size=3, stride=2, padding=1, groups=out_planes//2, bias=False), 34 | nn.BatchNorm2d(out_planes//2), 35 | ) 36 | self.skip = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) 37 | stride = 1 38 | 39 | for idx in range(block_num): 40 | if idx == 0: 41 | self.conv_list.append(CBA(in_planes, out_planes//2, ksize=1, stride=1, act="relu")) 42 | elif idx == 1 and block_num == 2: 43 | self.conv_list.append(CBA(out_planes//2, out_planes//2, ksize=3, stride=stride, act="relu")) 44 | elif idx == 1 and block_num > 2: 45 | self.conv_list.append(CBA(out_planes//2, out_planes//4, ksize=3, stride=stride, act="relu")) 46 | elif idx < block_num - 1: 47 | self.conv_list.append(CBA(out_planes//int(math.pow(2, idx)), out_planes//int(math.pow(2, idx+1)), ksize=3, stride=1)) 48 | else: 49 | self.conv_list.append(CBA(out_planes//int(math.pow(2, idx)), out_planes//int(math.pow(2, idx)), ksize=3, stride=1)) 50 | 51 | def forward(self, x): 52 | out_list = [] 53 | out1 = self.conv_list[0](x) 54 | 55 | for idx, conv in enumerate(self.conv_list[1:]): 56 | if idx == 0: 57 | if self.stride == 2: 58 | out = conv(self.avd_layer(out1)) 59 | else: 60 | out = conv(out1) 61 | else: 62 | out = conv(out) 63 | out_list.append(out) 64 | 65 | if self.stride == 2: 66 | out1 = self.skip(out1) 67 | out_list.insert(0, out1) 68 | 69 | out = torch.cat(out_list, dim=1) 70 | return out 71 | 72 | class DetailHead(nn.Module): 73 | def __init__(self, in_chan, mid_chan, n_classes): 74 | super(DetailHead, self).__init__() 75 | self.conv = CBA(in_chan, mid_chan, ksize=3, stride=1, act="relu", pad=1) 76 | self.conv_out = nn.Conv2d(mid_chan, n_classes, kernel_size=1, bias=False) 77 | self.apply(self._init_weights) 78 | 79 | def forward(self, x): 80 | x = self.conv(x) 81 | x = self.conv_out(x) 82 | return x 83 | 84 | def _init_weights(self, m): 85 | common_init(m) 86 | 87 | class AttentionRefinementModule(nn.Module): 88 | def __init__(self, in_chan, out_chan, ksize=3): 89 | super(AttentionRefinementModule, self).__init__() 90 | self.conv = CBA(in_chan, out_chan, ksize=ksize, stride=1, act="relu", pad=1) 91 | self.conv_atten = nn.Conv2d(out_chan, out_chan, kernel_size= 1, bias=False) 92 | self.bn_atten = nn.BatchNorm2d(out_chan) 93 | self.sigmoid_atten = nn.Sigmoid() 94 | self.apply(self._init_weights) 95 | 96 | def forward(self, x): 97 | feat = self.conv(x) 98 | atten = F.avg_pool2d(feat, feat.size()[2:]) 99 | atten = self.conv_atten(atten) 100 | atten = self.bn_atten(atten) 101 | atten = self.sigmoid_atten(atten) 102 | out = torch.mul(feat, atten) 103 | return out 104 | 105 | def _init_weights(self, m): 106 | common_init(m) -------------------------------------------------------------------------------- /model/utils/res_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | 11 | import torch.nn as nn 12 | 13 | model_urls = { 14 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 15 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 16 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 17 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 18 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 19 | } 20 | 21 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 22 | """3x3 convolution with 
padding""" 23 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 24 | padding=dilation, groups=groups, bias=False, dilation=dilation) 25 | 26 | 27 | def conv1x1(in_planes, out_planes, stride=1): 28 | """1x1 convolution""" 29 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 30 | 31 | 32 | class BasicBlock(nn.Module): 33 | expansion = 1 34 | 35 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 36 | base_width=64, dilation=1, norm_layer=None): 37 | super(BasicBlock, self).__init__() 38 | if norm_layer is None: 39 | norm_layer = nn.BatchNorm2d 40 | if groups != 1 or base_width != 64: 41 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 42 | if dilation > 1: 43 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 44 | self.conv1 = conv3x3(inplanes, planes, stride) 45 | self.bn1 = norm_layer(planes) 46 | self.conv2 = conv3x3(planes, planes) 47 | self.bn2 = norm_layer(planes) 48 | self.relu = nn.ReLU(inplace=True) 49 | self.downsample = downsample 50 | self.stride = stride 51 | 52 | def forward(self, x): 53 | identity = x 54 | 55 | out = self.conv1(x) 56 | out = self.bn1(out) 57 | out = self.relu(out) 58 | 59 | out = self.conv2(out) 60 | out = self.bn2(out) 61 | 62 | if self.downsample is not None: 63 | identity = self.downsample(x) 64 | 65 | out += identity 66 | out = self.relu(out) 67 | 68 | return out 69 | 70 | class Bottleneck(nn.Module): 71 | 72 | expansion = 4 73 | 74 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 75 | base_width=64, dilation=1, norm_layer=None): 76 | super(Bottleneck, self).__init__() 77 | if norm_layer is None: 78 | norm_layer = nn.BatchNorm2d 79 | width = int(planes * (base_width / 64.)) * groups 80 | self.conv1 = conv1x1(inplanes, width) 81 | self.bn1 = norm_layer(width) 82 | self.conv2 = conv3x3(width, width, stride, groups, dilation) 83 | self.bn2 = norm_layer(width) 84 | self.conv3 = conv1x1(width, planes * self.expansion) 85 | self.bn3 = norm_layer(planes * self.expansion) 86 | self.relu = nn.ReLU(inplace=True) 87 | self.downsample = downsample 88 | self.stride = stride 89 | 90 | def forward(self, x): 91 | identity = x 92 | 93 | out = self.conv1(x) 94 | out = self.bn1(out) 95 | out = self.relu(out) 96 | 97 | out = self.conv2(out) 98 | out = self.bn2(out) 99 | out = self.relu(out) 100 | 101 | out = self.conv3(out) 102 | out = self.bn3(out) 103 | 104 | if self.downsample is not None: 105 | identity = self.downsample(x) 106 | 107 | out += identity 108 | out = self.relu(out) 109 | 110 | return out 111 | -------------------------------------------------------------------------------- /model/utils/transformer_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch import Tensor 14 | import torch.nn.functional as F 15 | from torch.nn.parameter import Parameter 16 | 17 | import numpy as np 18 | import math 19 | 20 | import os 21 | import sys 22 | __dir__ = os.path.dirname(os.path.abspath(__file__)) 23 | sys.path.append(__dir__) 24 | sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) 25 | 26 | from model.utils.ops import clones 27 | 28 | class MultiHeadAttention(nn.Module): 29 | def __init__(self, multi_attention_heads, dimensions, dropout=0.1): 30 | """ 31 | 32 | :param 
_multi_attention_heads: number of self attention head 33 | :param _dimensions: dimension of model 34 | :param _dropout: 35 | """ 36 | super(MultiHeadAttention, self).__init__() 37 | 38 | assert dimensions % multi_attention_heads == 0 39 | self.d_k = int(dimensions / multi_attention_heads) 40 | self.h = multi_attention_heads 41 | self.linears = clones(nn.Linear(dimensions, dimensions), 4) 42 | self.attention = None 43 | self.dropout = nn.Dropout(p=dropout) 44 | 45 | def dot_product_attention(self, query, key, value, mask): 46 | """ 47 | Compute 'Scaled Dot Product Attention 48 | 49 | :param _query: (N, h, seq_len, d_q), h is multi-head 50 | :param _key: (N, h, seq_len, d_k) 51 | :param _value: (N, h, seq_len, d_v) 52 | :param _mask: None or (N, 1, seq_len, seq_len), 0 will be replaced with -1e9 53 | :return: 54 | """ 55 | 56 | d_k = value.size(-1) 57 | score = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) 58 | if mask is not None: 59 | score = score.masked_fill(mask == 0, -1e9) 60 | p_attn = F.softmax(score, dim=-1) 61 | return torch.matmul(p_attn, value), p_attn 62 | 63 | def forward(self, query, key, value, mask): 64 | batch_size = query.size(0) 65 | 66 | query, key, value = \ 67 | [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2) 68 | for l, x in zip(self.linears, (query, key, value))] 69 | 70 | product_and_attention = self.dot_product_attention(query, key, value, mask=mask) 71 | x = product_and_attention[0] 72 | 73 | x = x.transpose(1, 2).contiguous() \ 74 | .view(batch_size, -1, self.h * self.d_k) 75 | 76 | return self.linears[-1](x) 77 | 78 | class FeedForwarding(nn.Module): 79 | def __init__(self, _dimensions, _feed_forward_dimensions, _dropout=0.1): 80 | super(FeedForwarding, self).__init__() 81 | self.w_1 = nn.Linear(_dimensions, _feed_forward_dimensions) 82 | self.w_2 = nn.Linear(_feed_forward_dimensions, _dimensions) 83 | self.dropout = nn.Dropout(p=_dropout) 84 | 85 | def forward(self, _input_tensor): 86 | return self.w_2(self.dropout(F.relu(self.w_1(_input_tensor)))) 87 | 88 | class PositionalEncoding(nn.Module): 89 | def __init__(self, emb_size, dropout=0.1, maxlen=5000): 90 | super(PositionalEncoding, self).__init__() 91 | 92 | 93 | den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size) 94 | pos = torch.arange(0, maxlen).reshape(maxlen, 1) 95 | pos_embedding = torch.zeros((maxlen, emb_size)) 96 | pos_embedding[:, 0::2] = torch.sin(pos * den) 97 | pos_embedding[:, 1::2] = torch.cos(pos * den) 98 | self.pos_embedding = pos_embedding.unsqueeze(0) 99 | self.dropout = nn.Dropout(dropout) 100 | 101 | def forward(self, x): 102 | """Forward pass. 
103 | Args: 104 | x: (B, len, d_model) 105 | Returns: 106 | (B, len, d_model) 107 | """ 108 | return self.dropout(x + self.pos_embedding[:, :x.size(1), :].to(x.device)) 109 | 110 | class PositionalEncoding2D(nn.Module): 111 | def __init__(self, emb_size, dropout=0.1, max_h=1000, max_w=1000): 112 | super(PositionalEncoding2D, self).__init__() 113 | 114 | 115 | self.emb_size = emb_size 116 | assert emb_size % 2 == 0, f"Embedding depth {emb_size} is not even" 117 | pe_h = self.make_pe(emb_size // 2, maxlen=max_h) 118 | pe_w = self.make_pe(emb_size // 2, maxlen=max_w) 119 | 120 | pe_h = pe_h.permute(2, 1, 0).expand(-1, -1, max_w) 121 | pe_w = pe_w.permute(2, 0, 1).expand(-1, max_h, -1) 122 | 123 | pe = torch.cat([pe_h, pe_w], dim=0) 124 | self.pe = pe.unsqueeze(0) 125 | self.dropout = nn.Dropout(dropout) 126 | 127 | def make_pe(self, emb_size, maxlen=2000): 128 | den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size) 129 | pos = torch.arange(0, maxlen).reshape(maxlen, 1) 130 | pos_embedding = torch.zeros((maxlen, emb_size)) 131 | pos_embedding[:, 0::2] = torch.sin(pos * den) 132 | pos_embedding[:, 1::2] = torch.cos(pos * den) 133 | pos_embedding = pos_embedding.unsqueeze(0) 134 | return pos_embedding 135 | 136 | def forward(self, x): 137 | """Forward pass. 138 | Args: 139 | x: (B, d_model, H, W) 140 | Returns: 141 | (B, d_model, H, W) 142 | """ 143 | assert x.shape[1] == self.pe.shape[1] 144 | return self.dropout(x + self.pe[:, :, : x.size(2), : x.size(3)].to(x.device)) 145 | 146 | class TransformerDecoderLayer(nn.Module): 147 | def __init__(self, nhead, d_model, n_layers, dropout, dim_feedforward, n_classes, PAD_IDX=1): 148 | 149 | super(TransformerDecoderLayer, self).__init__() 150 | self.attention = MultiHeadAttention(nhead, d_model, dropout) 151 | self.source_attention = MultiHeadAttention(nhead, d_model, dropout) 152 | self.position_feed_forward = FeedForwarding(d_model, dim_feedforward, dropout) 153 | self.position = PositionalEncoding(d_model, dropout) 154 | self.stacks = n_layers 155 | self.dropout = torch.nn.Dropout(dropout) 156 | self.layer_norm = torch.nn.LayerNorm(d_model, eps=1e-6) 157 | self.embedding = nn.Embedding(n_classes, d_model) 158 | self.sqrt_model_size = math.sqrt(d_model) 159 | self.padding_symbol = PAD_IDX 160 | 161 | def generate_target_mask(self, source, target): 162 | target_pad_mask = (target != self.padding_symbol).unsqueeze(1).unsqueeze(3) 163 | target_length = target.size(1) 164 | target_sub_mask = torch.tril( 165 | torch.ones((target_length, target_length), dtype=torch.uint8, device=source.device) 166 | ) 167 | source_mask = torch.ones((target_length, source.size(1)), dtype=torch.uint8, device=source.device) 168 | target_mask = target_pad_mask & target_sub_mask.bool() 169 | return source_mask, target_mask 170 | 171 | def eval(self): 172 | self.attention.eval() 173 | self.source_attention.eval() 174 | self.position_feed_forward.eval() 175 | self.position.eval() 176 | self.dropout.eval() 177 | self.layer_norm.eval() 178 | self.embedding.eval() 179 | 180 | def forward(self, target_result, memory): 181 | target = self.embedding(target_result) * self.sqrt_model_size 182 | target = self.position(target) 183 | 184 | if self.padding_symbol is None: 185 | source_mask, target_mask = None, None 186 | else: 187 | source_mask, target_mask = self.generate_target_mask(memory, target_result) 188 | output = target 189 | for i in range(self.stacks): 190 | normed_output = self.layer_norm(output) 191 | output = output + self.dropout( 192 | 
self.attention(normed_output, normed_output, normed_output, target_mask) 193 | ) 194 | normed_output = self.layer_norm(output) 195 | output = output + self.dropout(self.source_attention(normed_output, memory, memory, source_mask)) 196 | normed_output = self.layer_norm(output) 197 | output = output + self.dropout(self.position_feed_forward(normed_output)) 198 | return self.layer_norm(output) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | Cython 3 | loguru 4 | h5py 5 | einops 6 | pyclipper 7 | pycocotools 8 | Shapely 9 | timm 10 | segmentation-models-pytorch 11 | torchsummary 12 | imgaug 13 | opencv-python 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | import torch 4 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 5 | 6 | def make_cuda_ext(name, module, sources): 7 | define_macros = [] 8 | 9 | if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1': 10 | define_macros += [("WITH_CUDA", None)] 11 | else: 12 | raise EnvironmentError('CUDA is required to compile!') 13 | 14 | return CUDAExtension( 15 | name='{}.{}'.format(module, name), 16 | sources=[os.path.join(*module.split('.'), p) for p in sources], 17 | define_macros=define_macros, 18 | extra_compile_args={ 19 | 'cxx': ['-std=c++14'], 20 | 'nvcc': [ 21 | '-D__CUDA_NO_HALF_OPERATORS__', 22 | '-D__CUDA_NO_HALF_CONVERSIONS__', 23 | '-D__CUDA_NO_HALF2_OPERATORS__', 24 | ] 25 | }) 26 | 27 | # python setup.py develop 28 | # python setup.py build_ext --inplace 29 | if __name__ == '__main__': 30 | 31 | setup( 32 | name='focalloss', 33 | version='1.0.0', 34 | package_data={'tools/loss': ['*/*.so']}, 35 | classifiers=[ 36 | 'Development Status :: 4 - Beta', 37 | 'License :: OSI Approved :: Apache Software License', 38 | 'Operating System :: OS Independent', 39 | 'Programming Language :: Python :: 3', 40 | 'Programming Language :: Python :: 3.8' 41 | ], 42 | 43 | ext_modules=[ 44 | make_cuda_ext(name='sigmoid_focal_loss_cuda', module='tools.loss', 45 | sources=[ 46 | 'src/sigmoid_focal_loss.cpp', 47 | 'src/sigmoid_focal_loss_cuda.cu' 48 | ]), 49 | make_cuda_ext(name='SigmoidFocalLoss_cuda', module='tools.loss', 50 | sources=[ 51 | 'src/SigmoidFocalLoss.cpp', 52 | 'src/SigmoidFocalLoss_cuda.cu' 53 | ]) 54 | 55 | ], 56 | 57 | cmdclass={'build_ext': BuildExtension}, 58 | zip_safe=False) 59 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/tools/__init__.py -------------------------------------------------------------------------------- /tools/augmentation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 数据增强 6 | 7 | example: 8 | 9 | ''' 10 | 11 | import numpy as np 12 | from loguru import logger 13 | 14 | import imgaug as ia 15 | from imgaug import augmenters as iaa 16 | from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage 17 | from imgaug.augmentables.polys import PolygonsOnImage 18 | from 
imgaug.augmentables.segmaps import SegmentationMapsOnImage 19 | 20 | aug_func = { 21 | 'affine': iaa.Affine, 22 | 'fliplr': iaa.Fliplr, 23 | 'flipud': iaa.Flipud, 24 | 'addgaussiannoise': iaa.AdditiveGaussianNoise, 25 | 'multiply': iaa.Multiply, 26 | 'cutout': iaa.Cutout, 27 | 'add': iaa.Add, 28 | 'grayscale': iaa.Grayscale, 29 | 'clouds': iaa.Clouds, 30 | 'fog': iaa.Fog, 31 | 'snowflakes': iaa.Snowflakes, 32 | 'rain': iaa.Rain, 33 | 'gaussianblur': iaa.GaussianBlur 34 | } 35 | 36 | class BaseAugmentation(): 37 | def __init__(self, aug_dicts, mode='some'): 38 | assert isinstance(aug_dicts, dict) 39 | self.aug_dicts = aug_dicts 40 | self.mode = mode 41 | 42 | def __call__(self): 43 | augment_func =[aug_func[f](**self.aug_dicts[f]) for f in self.aug_dicts] 44 | if self.mode == 'some': 45 | return iaa.SomeOf((0, len(augment_func)), augment_func) 46 | else: 47 | return iaa.Sequential(augment_func) 48 | 49 | class Augmentation(): 50 | def __init__(self, use_aug=True, task_type='cls', aug=None): 51 | 52 | assert task_type in ['cls', 'det', 'seg', 'polygon', 'custom'] 53 | self.use_aug = use_aug 54 | self.aug = iaa.SomeOf((0, 13),[ 55 | iaa.Affine(translate_percent=[-0.05, 0.05], scale=[0.8, 1.2], rotate=(-5, 5), mode='constant', cval=[240, 255]), 56 | iaa.Fliplr(0.5), 57 | iaa.Flipud(0.5), 58 | iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 12.0), per_channel=0.5), 59 | iaa.Multiply((0.5, 1.5)), 60 | iaa.Cutout(nb_iterations=(1, 4), size=0.1, squared=False, fill_mode="constant", cval=(0, 255), fill_per_channel=0.5), 61 | iaa.Add((-40, 40), per_channel=0.5), 62 | iaa.Grayscale(alpha=(0.0, 1.0)), 63 | iaa.GaussianBlur(sigma=(0.0,1.4)) 64 | ]) if aug is None else aug 65 | 66 | self.task_type = task_type 67 | logger.info("Augmentation_type: {}".format(self.task_type)) 68 | def make_aug(self, img, label, box_label=None): 69 | if self.task_type == 'cls': 70 | img = self.aug(image=img) 71 | return img, label 72 | elif self.task_type == 'det': 73 | boxes = BoundingBoxesOnImage([BoundingBox(x1=float(ii[0]), y1=float(ii[1]), x2=float(ii[2]), y2=float(ii[3]), 74 | label=ii[4]) for ii in label], shape=img.shape) 75 | new_img, new_boxes = self.aug(image=img, bounding_boxes=boxes) 76 | new_boxes = new_boxes.remove_out_of_image().clip_out_of_image() 77 | boxes_ = [[float(new_boxes.bounding_boxes[j].x1), float(new_boxes.bounding_boxes[j].y1), 78 | float(new_boxes.bounding_boxes[j].x2), float(new_boxes.bounding_boxes[j].y2), new_boxes.bounding_boxes[j].label] for j in range(len(new_boxes.bounding_boxes))] 79 | 80 | return new_img, boxes_ 81 | 82 | elif self.task_type == 'polygon': 83 | polygons = PolygonsOnImage([ia.Polygon(p[:-1], label=p[-1]) for p in label], shape=img.shape) 84 | new_img, new_polygons = self.aug(image=img, polygons=polygons) 85 | new_polygons = new_polygons.remove_out_of_image().clip_out_of_image() 86 | polygons_ = [new_polygons.polygons[j].coords.tolist()+[new_polygons.polygons[j].label] for j in range(len(new_polygons.polygons))] 87 | 88 | return new_img, polygons_ 89 | 90 | elif self.task_type == 'seg': 91 | label = np.array(label) 92 | if box_label is not None: 93 | box_label = np.array(box_label) 94 | box_label = BoundingBoxesOnImage([BoundingBox(x1=float(ii[0]), y1=float(ii[1]), x2=float(ii[2]), y2=float(ii[3]), 95 | label=ii[4]) for ii in box_label], shape=img.shape) 96 | seg_map = SegmentationMapsOnImage(label, shape=img.shape) 97 | new_img, seg_map, new_boxes = self.aug(image=img, segmentation_maps=seg_map, bounding_boxes=box_label) 98 | new_boxes = 
[[float(new_boxes.bounding_boxes[j].x1), float(new_boxes.bounding_boxes[j].y1), 99 | float(new_boxes.bounding_boxes[j].x2), float(new_boxes.bounding_boxes[j].y2), new_boxes.bounding_boxes[j].label] for j in range(len(new_boxes.bounding_boxes))] 100 | seg_map = seg_map.get_arr() 101 | return new_img, seg_map, new_boxes 102 | 103 | else: 104 | seg_map = SegmentationMapsOnImage(label, shape=img.shape) 105 | new_img, seg_map = self.aug(image=img, segmentation_maps=seg_map) 106 | seg_map = seg_map.get_arr() 107 | 108 | return new_img, seg_map 109 | 110 | else: 111 | return self.custom_label_type(img, label) 112 | 113 | def custom_label_type(self, img, label): 114 | raise NotImplementedError('Custom label type not supported.') 115 | 116 | def reorder_vertexes(self, pts): 117 | pts = np.array(pts) 118 | rect = np.zeros((4, 2), dtype = "float32") 119 | 120 | s = pts.sum(axis = 1) 121 | rect[0] = pts[np.argmin(s)] 122 | rect[2] = pts[np.argmax(s)] 123 | 124 | diff = np.diff(pts, axis = 1) 125 | rect[1] = pts[np.argmin(diff)] 126 | rect[3] = pts[np.argmax(diff)] 127 | 128 | return rect.tolist() -------------------------------------------------------------------------------- /tools/boxes.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | 11 | import numpy as np 12 | 13 | import torch 14 | import torchvision 15 | 16 | 17 | def filter_box(output, scale_range): 18 | """ 19 | output: (N, 5+class) shape 20 | """ 21 | min_scale, max_scale = scale_range 22 | w = output[:, 2] - output[:, 0] 23 | h = output[:, 3] - output[:, 1] 24 | keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale) 25 | return output[keep] 26 | 27 | """ 28 | def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45): 29 | box_corner = prediction.new(prediction.shape) 30 | box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 31 | box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 32 | box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 33 | box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 34 | prediction[:, :, :4] = box_corner[:, :, :4] 35 | 36 | output = [None for _ in range(len(prediction))] 37 | for i, image_pred in enumerate(prediction): 38 | 39 | # If none are remaining => process next image 40 | if not image_pred.size(0): 41 | continue 42 | # Get score and class with highest confidence 43 | class_conf, class_pred = torch.max( 44 | image_pred[:, 5 : 5 + num_classes], 1, keepdim=True 45 | ) 46 | 47 | conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() 48 | # _, conf_mask = torch.topk((image_pred[:, 4] * class_conf.squeeze()), 1000) 49 | # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) 50 | detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1) 51 | detections = detections[conf_mask] 52 | if not detections.size(0): 53 | continue 54 | 55 | nms_out_index = torchvision.ops.batched_nms( 56 | detections[:, :4], 57 | detections[:, 4] * detections[:, 5], 58 | detections[:, 6], 59 | nms_thre, 60 | ) 61 | detections = detections[nms_out_index] 62 | if output[i] is None: 63 | output[i] = detections 64 | else: 65 | output[i] = torch.cat((output[i], detections)) 66 | 67 | return output 68 | """ 69 | 70 | def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): 71 | if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 
4: 72 | raise IndexError 73 | 74 | if xyxy: 75 | tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2]) 76 | br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) 77 | area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) 78 | area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) 79 | else: 80 | tl = torch.max( 81 | (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), 82 | (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2), 83 | ) 84 | br = torch.min( 85 | (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), 86 | (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2), 87 | ) 88 | 89 | area_a = torch.prod(bboxes_a[:, 2:], 1) 90 | area_b = torch.prod(bboxes_b[:, 2:], 1) 91 | en = (tl < br).type(tl.type()).prod(dim=2) 92 | area_i = torch.prod(br - tl, 2) * en 93 | return area_i / (area_a[:, None] + area_b - area_i) 94 | 95 | 96 | def matrix_iou(a, b): 97 | """ 98 | return iou of a and b, numpy version for data augenmentation 99 | """ 100 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 101 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 102 | 103 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 104 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 105 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 106 | return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12) 107 | 108 | 109 | def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max): 110 | bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max) 111 | bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max) 112 | return bbox 113 | 114 | 115 | def xyxy2xywh(bboxes): 116 | bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] 117 | bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] 118 | return bboxes 119 | 120 | 121 | def xyxy2cxcywh(bboxes): 122 | bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] 123 | bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] 124 | bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5 125 | bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5 126 | return bboxes 127 | -------------------------------------------------------------------------------- /tools/evaluation_tools.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | 11 | from tqdm import tqdm 12 | import numpy as np 13 | 14 | import torch 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | 18 | import json 19 | import tempfile 20 | 21 | from pycocotools.coco import COCO 22 | from pycocotools.cocoeval import COCOeval 23 | 24 | from tools.boxes import xyxy2xywh 25 | 26 | class Coco_eval(): 27 | def __init__(self, eval_bbox=False, eval_mask=False, jsonfile=None): 28 | self.eval_bbox = eval_bbox 29 | self.eval_mask = eval_mask 30 | self.jsonfile = jsonfile 31 | 32 | def __call__(self, data_list, ann_file): 33 | if self.jsonfile is not None: 34 | json.dump(data_list, open(self.jsonfile, "w")) 35 | else: 36 | _, self.jsonfile = tempfile.mkstemp() 37 | json.dump(data_list, open(self.jsonfile, "w")) 38 | print('Loading annotations...') 39 | gt_annotations = COCO(ann_file) 40 | test_res = gt_annotations.loadRes(self.jsonfile) 41 | 42 | if self.eval_bbox: 43 | print('\nEvaluating BBoxes:') 44 | bbox_eval = COCOeval(gt_annotations, test_res, 'bbox') 45 | bbox_eval.evaluate() 46 | bbox_eval.accumulate() 47 | bbox_eval.summarize() 48 | 49 | if self.eval_mask: 50 | print('\nEvaluating Masks:') 51 | bbox_eval = COCOeval(gt_annotations, test_res, 'segm') 52 | bbox_eval.evaluate() 53 | 
bbox_eval.accumulate() 54 | bbox_eval.summarize() 55 | 56 | return bbox_eval.stats[0] 57 | 58 | class ConvertCocoFormat(): 59 | 60 | def __init__(self, id2cat, mode='bbox'): 61 | self.id2cat = id2cat 62 | self.mode = mode 63 | 64 | def __call__(self, b_bboxes, b_cls, b_scores, ids): 65 | data_list = [] 66 | for (bboxes, cls, scores, img_id) in zip(b_bboxes, b_cls, b_scores, ids): 67 | bboxes, cls, scores = bboxes.cpu(), cls.cpu(), scores.cpu() 68 | 69 | if bboxes is None: 70 | continue 71 | if self.mode == 'bbox': 72 | for ind in range(bboxes.shape[0]): 73 | label = self.id2cat[int(cls[ind])] 74 | pred_data = { 75 | "image_id": int(img_id.numpy().item()), 76 | "category_id": label, 77 | "bbox": bboxes[ind].numpy().tolist(), 78 | "score": scores[ind].numpy().item(), 79 | "segmentation": [], 80 | } 81 | data_list.append(pred_data) 82 | else: 83 | raise NotImplementedError 84 | return data_list 85 | 86 | class SemanticSegmIOU(): 87 | def __init__(self, scale=0.5, ignore_label=255): 88 | super().__init__() 89 | self.scale = scale 90 | self.ignore_label = ignore_label 91 | 92 | def __call__(self, model, dataset, n_classes): 93 | hist = torch.zeros(n_classes, n_classes).cuda().detach() 94 | 95 | for inps, targets in tqdm(dataset): 96 | 97 | N, H, W = targets.shape 98 | targets = targets.cuda() 99 | size = targets.size()[-2:] 100 | 101 | inps = inps.cuda() 102 | N, C, H, W = inps.size() 103 | 104 | new_hw = [int(H*self.scale), int(W*self.scale)] 105 | inps = F.interpolate(inps, new_hw, mode='bilinear', align_corners=True) 106 | 107 | logits = model(inps)[0] 108 | logits = F.interpolate(logits, size=size, mode='bilinear', align_corners=True) 109 | probs = torch.softmax(logits, dim=1) 110 | preds = torch.argmax(probs, dim=1) 111 | keep = targets != self.ignore_label 112 | 113 | hist += torch.bincount(targets[keep] * n_classes + preds[keep], minlength=n_classes ** 2).view(n_classes, n_classes).float() 114 | 115 | ious = hist.diag() / (hist.sum(dim=0) + hist.sum(dim=1) - hist.diag()) 116 | miou = ious.mean() 117 | return miou.item() -------------------------------------------------------------------------------- /tools/loss/SigmoidFocalLoss_cuda.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/tools/loss/SigmoidFocalLoss_cuda.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /tools/loss/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/loss/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/tools/loss/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /tools/loss/__pycache__/detr_criterion.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/tools/loss/__pycache__/detr_criterion.cpython-38.pyc -------------------------------------------------------------------------------- /tools/loss/__pycache__/detr_matcher.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/tools/loss/__pycache__/detr_matcher.cpython-38.pyc -------------------------------------------------------------------------------- /tools/loss/__pycache__/loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/tools/loss/__pycache__/loss.cpython-38.pyc -------------------------------------------------------------------------------- /tools/loss/__pycache__/loss_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/tools/loss/__pycache__/loss_utils.cpython-38.pyc -------------------------------------------------------------------------------- /tools/loss/detr_criterion.py: -------------------------------------------------------------------------------- 1 | """ 2 | MaskFormer criterion. 3 | """ 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn 7 | 8 | 9 | from ..misc import nested_tensor_from_tensor_list 10 | 11 | 12 | def dice_loss(inputs, targets, num_masks): 13 | """ 14 | Compute the DICE loss, similar to generalized IOU for masks 15 | Args: 16 | inputs: A float tensor of arbitrary shape. 17 | The predictions for each example. 18 | targets: A float tensor with the same shape as inputs. Stores the binary 19 | classification label for each element in inputs 20 | (0 for the negative class and 1 for the positive class). 21 | """ 22 | inputs = inputs.sigmoid() 23 | inputs = inputs.flatten(1) 24 | numerator = 2 * (inputs * targets).sum(-1) 25 | denominator = inputs.sum(-1) + targets.sum(-1) 26 | loss = 1 - (numerator + 1) / (denominator + 1) 27 | return loss.sum() / num_masks 28 | 29 | 30 | def sigmoid_focal_loss(inputs, targets, num_masks, alpha: float = 0.25, gamma: float = 2): 31 | """ 32 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 33 | Args: 34 | inputs: A float tensor of arbitrary shape. 35 | The predictions for each example. 36 | targets: A float tensor with the same shape as inputs. Stores the binary 37 | classification label for each element in inputs 38 | (0 for the negative class and 1 for the positive class). 39 | alpha: (optional) Weighting factor in range (0,1) to balance 40 | positive vs negative examples. Default = -1 (no weighting). 41 | gamma: Exponent of the modulating factor (1 - p_t) to 42 | balance easy vs hard examples. 43 | Returns: 44 | Loss tensor 45 | """ 46 | prob = inputs.sigmoid() 47 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 48 | p_t = prob * targets + (1 - prob) * (1 - targets) 49 | loss = ce_loss * ((1 - p_t) ** gamma) 50 | 51 | if alpha >= 0: 52 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 53 | loss = alpha_t * loss 54 | return loss.mean(1).sum() / num_masks 55 | 56 | 57 | class SetCriterion(nn.Module): 58 | """This class computes the loss for DETR. 
59 | The process happens in two steps: 60 | 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 61 | 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) 62 | """ 63 | 64 | def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): 65 | """Create the criterion. 66 | Parameters: 67 | num_classes: number of object categories, omitting the special no-object category 68 | matcher: module able to compute a matching between targets and proposals 69 | weight_dict: dict containing as key the names of the losses and as values their relative weight. 70 | eos_coef: relative classification weight applied to the no-object category 71 | losses: list of all the losses to be applied. See get_loss for list of available losses. 72 | """ 73 | super().__init__() 74 | self.num_classes = num_classes 75 | self.matcher = matcher 76 | self.weight_dict = weight_dict 77 | self.eos_coef = eos_coef 78 | self.losses = losses 79 | empty_weight = torch.ones(self.num_classes + 1) 80 | empty_weight[-1] = self.eos_coef 81 | self.register_buffer("empty_weight", empty_weight) 82 | 83 | def loss_labels(self, outputs, targets, indices, num_masks): 84 | """Classification loss (NLL) 85 | targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] 86 | """ 87 | assert "pred_logits" in outputs 88 | src_logits = outputs["pred_logits"] 89 | 90 | idx = self._get_src_permutation_idx(indices) 91 | target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) 92 | target_classes = torch.full( 93 | src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device 94 | ) 95 | target_classes[idx] = target_classes_o 96 | 97 | loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) 98 | losses = {"loss_ce": loss_ce} 99 | return losses 100 | 101 | def loss_masks(self, outputs, targets, indices, num_masks): 102 | """Compute the losses related to the masks: the focal loss and the dice loss. 
103 | targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] 104 | """ 105 | assert "pred_masks" in outputs 106 | 107 | src_idx = self._get_src_permutation_idx(indices) 108 | tgt_idx = self._get_tgt_permutation_idx(indices) 109 | src_masks = outputs["pred_masks"] 110 | src_masks = src_masks[src_idx] 111 | masks = [t["masks"] for t in targets] 112 | target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() 113 | target_masks = target_masks.to(src_masks) 114 | target_masks = target_masks[tgt_idx] 115 | 116 | src_masks = F.interpolate( 117 | src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False 118 | ) 119 | src_masks = src_masks[:, 0].flatten(1) 120 | 121 | target_masks = target_masks.flatten(1) 122 | target_masks = target_masks.view(src_masks.shape) 123 | losses = { 124 | "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_masks), 125 | "loss_dice": dice_loss(src_masks, target_masks, num_masks), 126 | } 127 | return losses 128 | 129 | def _get_src_permutation_idx(self, indices): 130 | batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) 131 | src_idx = torch.cat([src for (src, _) in indices]) 132 | return batch_idx, src_idx 133 | 134 | def _get_tgt_permutation_idx(self, indices): 135 | batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) 136 | tgt_idx = torch.cat([tgt for (_, tgt) in indices]) 137 | return batch_idx, tgt_idx 138 | def get_loss(self, loss, outputs, targets, indices, num_masks): 139 | loss_map = {"labels": self.loss_labels, "masks": self.loss_masks} 140 | assert loss in loss_map, f"do you really want to compute {loss} loss?" 141 | return loss_map[loss](outputs, targets, indices, num_masks) 142 | 143 | def forward(self, outputs, targets): 144 | """This performs the loss computation. 145 | Parameters: 146 | outputs: dict of tensors, see the output specification of the model for the format 147 | targets: list of dicts, such that len(targets) == batch_size. 
148 | The expected keys in each dict depends on the losses applied, see each loss' doc 149 | """ 150 | 151 | for i in range(len(targets)): 152 | targets[i]['labels'] = targets[i]['labels'].cuda() 153 | targets[i]['masks'] = targets[i]['masks'].cuda() 154 | 155 | outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} 156 | 157 | indices = self.matcher(outputs_without_aux, targets) 158 | 159 | num_masks = sum(len(t["labels"]) for t in targets) 160 | num_masks = torch.as_tensor( 161 | [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device 162 | ) 163 | 164 | losses = {} 165 | for loss in self.losses: 166 | losses.update(self.get_loss(loss, outputs, targets, indices, num_masks)) 167 | 168 | if "aux_outputs" in outputs: 169 | for i, aux_outputs in enumerate(outputs["aux_outputs"]): 170 | indices = self.matcher(aux_outputs, targets) 171 | for loss in self.losses: 172 | l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks) 173 | l_dict = {k + f"_{i}": v for k, v in l_dict.items()} 174 | losses.update(l_dict) 175 | 176 | for k in list(losses.keys()): 177 | if k in self.weight_dict: 178 | losses[k] *= self.weight_dict[k] 179 | else: 180 | losses.pop(k) 181 | 182 | losses = sum(losses.values()) 183 | return losses 184 | -------------------------------------------------------------------------------- /tools/loss/detr_matcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modules to compute the matching cost and solve the corresponding LSAP. 3 | """ 4 | import torch 5 | import torch.nn.functional as F 6 | from scipy.optimize import linear_sum_assignment 7 | from torch import nn 8 | 9 | 10 | def batch_dice_loss(inputs, targets): 11 | """ 12 | Compute the DICE loss, similar to generalized IOU for masks 13 | Args: 14 | inputs: A float tensor of arbitrary shape. 15 | The predictions for each example. 16 | targets: A float tensor with the same shape as inputs. Stores the binary 17 | classification label for each element in inputs 18 | (0 for the negative class and 1 for the positive class). 19 | """ 20 | inputs = inputs.sigmoid() 21 | inputs = inputs.flatten(1) 22 | numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) 23 | denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] 24 | loss = 1 - (numerator + 1) / (denominator + 1) 25 | return loss 26 | 27 | 28 | def batch_sigmoid_focal_loss(inputs, targets, alpha: float = 0.25, gamma: float = 2): 29 | """ 30 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 31 | Args: 32 | inputs: A float tensor of arbitrary shape. 33 | The predictions for each example. 34 | targets: A float tensor with the same shape as inputs. Stores the binary 35 | classification label for each element in inputs 36 | (0 for the negative class and 1 for the positive class). 37 | alpha: (optional) Weighting factor in range (0,1) to balance 38 | positive vs negative examples. Default = -1 (no weighting). 39 | gamma: Exponent of the modulating factor (1 - p_t) to 40 | balance easy vs hard examples. 
41 | Returns: 42 | Loss tensor 43 | """ 44 | hw = inputs.shape[1] 45 | 46 | prob = inputs.sigmoid() 47 | focal_pos = ((1 - prob) ** gamma) * F.binary_cross_entropy_with_logits( 48 | inputs, torch.ones_like(inputs), reduction="none" 49 | ) 50 | focal_neg = (prob ** gamma) * F.binary_cross_entropy_with_logits( 51 | inputs, torch.zeros_like(inputs), reduction="none" 52 | ) 53 | if alpha >= 0: 54 | focal_pos = focal_pos * alpha 55 | focal_neg = focal_neg * (1 - alpha) 56 | 57 | loss = torch.einsum("nc,mc->nm", focal_pos, targets) + torch.einsum( 58 | "nc,mc->nm", focal_neg, (1 - targets) 59 | ) 60 | 61 | return loss / hw 62 | 63 | 64 | class HungarianMatcher(nn.Module): 65 | """This class computes an assignment between the targets and the predictions of the network 66 | 67 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 68 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 69 | while the others are un-matched (and thus treated as non-objects). 70 | """ 71 | 72 | def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1): 73 | """Creates the matcher 74 | 75 | Params: 76 | cost_class: This is the relative weight of the classification error in the matching cost 77 | cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost 78 | cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost 79 | """ 80 | super().__init__() 81 | self.cost_class = cost_class 82 | self.cost_mask = cost_mask 83 | self.cost_dice = cost_dice 84 | assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0" 85 | 86 | @torch.no_grad() 87 | def memory_efficient_forward(self, outputs, targets): 88 | """More memory-friendly matching""" 89 | bs, num_queries = outputs["pred_logits"].shape[:2] 90 | 91 | masks = [v["masks"] for v in targets] 92 | h_max = max([m.shape[1] for m in masks]) 93 | w_max = max([m.shape[2] for m in masks]) 94 | 95 | indices = [] 96 | 97 | for b in range(bs): 98 | 99 | out_prob = outputs["pred_logits"][b].softmax(-1) 100 | out_mask = outputs["pred_masks"][b] 101 | 102 | tgt_ids = targets[b]["labels"] 103 | tgt_mask = targets[b]["masks"].to(out_mask) 104 | 105 | cost_class = -out_prob[:, tgt_ids] 106 | 107 | tgt_mask = F.interpolate(tgt_mask[:, None], size=out_mask.shape[-2:], mode="nearest") 108 | 109 | out_mask = out_mask.flatten(1) 110 | tgt_mask = tgt_mask[:, 0].flatten(1) 111 | 112 | cost_mask = batch_sigmoid_focal_loss(out_mask, tgt_mask) 113 | 114 | cost_dice = batch_dice_loss(out_mask, tgt_mask) 115 | 116 | C = ( 117 | self.cost_mask * cost_mask 118 | + self.cost_class * cost_class 119 | + self.cost_dice * cost_dice 120 | ) 121 | C = C.reshape(num_queries, -1).cpu() 122 | indices.append(linear_sum_assignment(C)) 123 | return [ 124 | (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) 125 | for i, j in indices 126 | ] 127 | 128 | @torch.no_grad() 129 | def forward(self, outputs, targets): 130 | """Performs the matching 131 | 132 | Params: 133 | outputs: This is a dict that contains at least these entries: 134 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 135 | "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks 136 | 137 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 138 | "labels": Tensor of dim 
[num_target_boxes] (where num_target_boxes is the number of ground-truth 139 | objects in the target) containing the class labels 140 | "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks 141 | 142 | Returns: 143 | A list of size batch_size, containing tuples of (index_i, index_j) where: 144 | - index_i is the indices of the selected predictions (in order) 145 | - index_j is the indices of the corresponding selected targets (in order) 146 | For each batch element, it holds: 147 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 148 | """ 149 | return self.memory_efficient_forward(outputs, targets) 150 | 151 | def __repr__(self): 152 | head = "Matcher " + self.__class__.__name__ 153 | body = [ 154 | "cost_class: {}".format(self.cost_class), 155 | "cost_mask: {}".format(self.cost_mask), 156 | "cost_dice: {}".format(self.cost_dice), 157 | ] 158 | _repr_indent = 4 159 | lines = [head] + [" " * _repr_indent + line for line in body] 160 | return "\n".join(lines) 161 | -------------------------------------------------------------------------------- /tools/loss/loss_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | from torch.autograd import Function 11 | from torch.autograd.function import once_differentiable 12 | import torch.nn.functional as F 13 | 14 | def norm_add_multi_loss(loss_dict): 15 | for i, key in enumerate(loss_dict.keys()): 16 | if i == 0: 17 | loss = loss_dict[key] 18 | n = loss_dict[key].detach() 19 | else: 20 | loss = loss + loss_dict[key]/n 21 | 22 | return loss 23 | 24 | def reduce_loss(loss, reduction): 25 | """Reduce loss as specified. 26 | 27 | Args: 28 | loss (Tensor): Elementwise loss tensor. 29 | reduction (str): Options are "none", "mean" and "sum". 30 | 31 | Return: 32 | Tensor: Reduced loss tensor. 33 | """ 34 | reduction_enum = F._Reduction.get_enum(reduction) 35 | if reduction_enum == 0: 36 | return loss 37 | elif reduction_enum == 1: 38 | return loss.mean() 39 | elif reduction_enum == 2: 40 | return loss.sum() 41 | 42 | def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): 43 | """Apply element-wise weight and reduce loss. 44 | 45 | Args: 46 | loss (Tensor): Element-wise loss. 47 | weight (Tensor): Element-wise weights. 48 | reduction (str): Same as built-in losses of PyTorch. 49 | avg_factor (float): Avarage factor when computing the mean of losses. 50 | 51 | Returns: 52 | Tensor: Processed loss values. 53 | """ 54 | if weight is not None: 55 | loss = loss * weight 56 | 57 | if avg_factor is None: 58 | loss = reduce_loss(loss, reduction) 59 | else: 60 | if reduction == 'mean': 61 | loss = loss.sum() / avg_factor 62 | elif reduction != 'none': 63 | raise ValueError('avg_factor can not be used with reduction="sum"') 64 | return loss 65 | 66 | from . 
import sigmoid_focal_loss_cuda, SigmoidFocalLoss_cuda 67 | 68 | class SigmoidFocalLossFunction(Function): 69 | 70 | @staticmethod 71 | def forward(ctx, input, target, gamma=2.0, alpha=0.25): 72 | ctx.save_for_backward(input, target) 73 | num_classes = input.shape[1] 74 | ctx.num_classes = num_classes 75 | ctx.gamma = gamma 76 | ctx.alpha = alpha 77 | 78 | loss = sigmoid_focal_loss_cuda.forward(input, target, num_classes, gamma, alpha) 79 | return loss 80 | 81 | @staticmethod 82 | @once_differentiable 83 | def backward(ctx, d_loss): 84 | input, target = ctx.saved_tensors 85 | num_classes = ctx.num_classes 86 | gamma = ctx.gamma 87 | alpha = ctx.alpha 88 | d_loss = d_loss.contiguous() 89 | d_input = sigmoid_focal_loss_cuda.backward(input, target, d_loss, num_classes, gamma, alpha) 90 | return d_input, None, None, None, None 91 | 92 | sigmoid_focal_loss_ = SigmoidFocalLossFunction.apply 93 | 94 | def sigmoid_focal_loss(pred, 95 | target, 96 | weight=None, 97 | gamma=2.0, 98 | alpha=0.25, 99 | reduction='mean', 100 | avg_factor=None): 101 | loss = sigmoid_focal_loss_(pred, target, gamma, alpha) 102 | 103 | if weight is not None: 104 | weight = weight.view(-1, 1) 105 | loss = weight_reduce_loss(loss, weight, reduction, avg_factor) 106 | return loss 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /tools/loss/sigmoid_focal_loss_cuda.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/tools/loss/sigmoid_focal_loss_cuda.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /tools/loss/src/SigmoidFocalLoss.cpp: -------------------------------------------------------------------------------- 1 | // modify from 2 | // https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h 3 | #include 4 | 5 | at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits, 6 | const at::Tensor &targets, 7 | const int num_classes, 8 | const float gamma, const float alpha); 9 | 10 | at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits, 11 | const at::Tensor &targets, 12 | const at::Tensor &d_losses, 13 | const int num_classes, 14 | const float gamma, const float alpha); 15 | 16 | // Interface for Python 17 | at::Tensor SigmoidFocalLoss_forward(const at::Tensor &logits, 18 | const at::Tensor &targets, 19 | const int num_classes, const float gamma, 20 | const float alpha) { 21 | if (logits.type().is_cuda()) { 22 | return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, 23 | alpha); 24 | } 25 | AT_ERROR("SigmoidFocalLoss is not implemented on the CPU"); 26 | } 27 | 28 | at::Tensor SigmoidFocalLoss_backward(const at::Tensor &logits, 29 | const at::Tensor &targets, 30 | const at::Tensor &d_losses, 31 | const int num_classes, const float gamma, 32 | const float alpha) { 33 | if (logits.type().is_cuda()) { 34 | return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, 35 | num_classes, gamma, alpha); 36 | } 37 | AT_ERROR("SigmoidFocalLoss is not implemented on the CPU"); 38 | } 39 | 40 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 41 | m.def("forward", &SigmoidFocalLoss_forward, 42 | "SigmoidFocalLoss forward (CUDA)"); 43 | m.def("backward", &SigmoidFocalLoss_backward, 44 | "SigmoidFocalLoss backward (CUDA)"); 45 | } 46 | 47 | 
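// ---------------------------------------------------------------------------
// Illustrative usage sketch (added note, not part of the original source).
// Once this extension is built via setup.py (module tools.loss.SigmoidFocalLoss_cuda),
// the two exported functions are expected to be driven from Python much like the
// autograd wrapper in tools/loss/loss_utils.py does for the lower-case variant:
//
//   from tools.loss import SigmoidFocalLoss_cuda
//   # logits: float CUDA tensor of shape (N, num_classes)
//   # targets: int64 CUDA tensor of shape (N,) with per-sample class indices
//   losses   = SigmoidFocalLoss_cuda.forward(logits, targets, num_classes, gamma, alpha)
//   d_logits = SigmoidFocalLoss_cuda.backward(logits, targets, d_losses, num_classes, gamma, alpha)
//
// Shapes and the CUDA-tensor requirement follow the AT_ASSERTM checks in the
// companion SigmoidFocalLoss_cuda.cu kernel file; treat this as a hedged sketch
// of the binding's intended call pattern, not a guaranteed public API.
// ---------------------------------------------------------------------------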
-------------------------------------------------------------------------------- /tools/loss/src/SigmoidFocalLoss_cuda.cu: -------------------------------------------------------------------------------- 1 | // modified from 2 | // https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu 3 | 4 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 5 | // This file is modified from 6 | // https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu 7 | // Cheng-Yang Fu 8 | // cyfu@cs.unc.edu 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | // TODO make it in a common file 19 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 20 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 21 | i += blockDim.x * gridDim.x) 22 | 23 | template 24 | __global__ void SigmoidFocalLossForward(const int nthreads, 25 | const scalar_t *logits, 26 | const int64_t *targets, 27 | const int num_classes, 28 | const float gamma, const float alpha, 29 | const int num, scalar_t *losses) { 30 | CUDA_1D_KERNEL_LOOP(i, nthreads) { 31 | int n = i / num_classes; 32 | int d = i % num_classes; // current class[0~79]; 33 | int t = targets[n]; // target class [0~79]; 34 | 35 | // Decide it is positive or negative case. 36 | scalar_t c1 = (t == d); 37 | scalar_t c2 = (t >= 0 & t != d); 38 | 39 | scalar_t zn = (1.0 - alpha); 40 | scalar_t zp = (alpha); 41 | 42 | // p = 1. / 1. + expf(-x); p = sigmoid(x) 43 | scalar_t p = 1. / (1. + expf(-logits[i])); 44 | 45 | // (1-p)**gamma * log(p) where 46 | scalar_t term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); 47 | 48 | // p**gamma * log(1-p) 49 | scalar_t term2 = 50 | powf(p, gamma) * 51 | (-1. * logits[i] * (logits[i] >= 0) - 52 | logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); 53 | 54 | losses[i] = 0.0; 55 | losses[i] += -c1 * term1 * zp; 56 | losses[i] += -c2 * term2 * zn; 57 | 58 | } // CUDA_1D_KERNEL_LOOP 59 | } // SigmoidFocalLossForward 60 | 61 | template 62 | __global__ void SigmoidFocalLossBackward( 63 | const int nthreads, const scalar_t *logits, const int64_t *targets, 64 | const scalar_t *d_losses, const int num_classes, const float gamma, 65 | const float alpha, const int num, scalar_t *d_logits) { 66 | CUDA_1D_KERNEL_LOOP(i, nthreads) { 67 | int n = i / num_classes; 68 | int d = i % num_classes; // current class[0~79]; 69 | int t = targets[n]; // target class [1~80], 0 is background; 70 | 71 | // Decide it is positive or negative case. 72 | scalar_t c1 = (t == d); 73 | scalar_t c2 = (t >= 0 & t != d); 74 | 75 | scalar_t zn = (1.0 - alpha); 76 | scalar_t zp = (alpha); 77 | // p = 1. / 1. + expf(-x); p = sigmoid(x) 78 | scalar_t p = 1. / (1. + expf(-logits[i])); 79 | 80 | // (1-p)**g * (1 - p - g*p*log(p) 81 | scalar_t term1 = 82 | powf((1. - p), gamma) * (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); 83 | 84 | // (p**g) * (g*(1-p)*log(1-p) - p) 85 | scalar_t term2 = 86 | powf(p, gamma) * 87 | ((-1. * logits[i] * (logits[i] >= 0) - 88 | logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * 89 | (1. 
- p) * gamma - 90 | p); 91 | d_logits[i] = 0.0; 92 | d_logits[i] += -c1 * term1 * zp; 93 | d_logits[i] += -c2 * term2 * zn; 94 | d_logits[i] = d_logits[i] * d_losses[i]; 95 | 96 | } // CUDA_1D_KERNEL_LOOP 97 | } // SigmoidFocalLossBackward 98 | 99 | at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits, 100 | const at::Tensor &targets, 101 | const int num_classes, 102 | const float gamma, const float alpha) { 103 | AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); 104 | AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); 105 | AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); 106 | 107 | const int num_samples = logits.size(0); 108 | 109 | auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); 110 | auto losses_size = num_samples * logits.size(1); 111 | 112 | dim3 grid( 113 | std::min(THCCeilDiv((int64_t)losses_size, (int64_t)512), (int64_t)4096)); 114 | dim3 block(512); 115 | 116 | if (losses.numel() == 0) { 117 | THCudaCheck(cudaGetLastError()); 118 | return losses; 119 | } 120 | 121 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( 122 | logits.scalar_type(), "SigmoidFocalLoss_forward", [&] { 123 | SigmoidFocalLossForward<<>>( 124 | losses_size, logits.contiguous().data(), 125 | targets.contiguous().data(), num_classes, gamma, alpha, 126 | num_samples, losses.data()); 127 | }); 128 | THCudaCheck(cudaGetLastError()); 129 | return losses; 130 | } 131 | 132 | at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits, 133 | const at::Tensor &targets, 134 | const at::Tensor &d_losses, 135 | const int num_classes, 136 | const float gamma, 137 | const float alpha) { 138 | AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); 139 | AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); 140 | AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); 141 | 142 | AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); 143 | 144 | const int num_samples = logits.size(0); 145 | AT_ASSERTM(logits.size(1) == num_classes, 146 | "logits.size(1) should be num_classes"); 147 | 148 | auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); 149 | auto d_logits_size = num_samples * logits.size(1); 150 | 151 | dim3 grid(std::min(THCCeilDiv((int64_t)d_logits_size, (int64_t)512), 152 | (int64_t)4096)); 153 | dim3 block(512); 154 | 155 | if (d_logits.numel() == 0) { 156 | THCudaCheck(cudaGetLastError()); 157 | return d_logits; 158 | } 159 | 160 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( 161 | logits.scalar_type(), "SigmoidFocalLoss_backward", [&] { 162 | SigmoidFocalLossBackward<<>>( 163 | d_logits_size, logits.contiguous().data(), 164 | targets.contiguous().data(), 165 | d_losses.contiguous().data(), num_classes, gamma, alpha, 166 | num_samples, d_logits.data()); 167 | }); 168 | 169 | THCudaCheck(cudaGetLastError()); 170 | return d_logits; 171 | } 172 | -------------------------------------------------------------------------------- /tools/loss/src/sigmoid_focal_loss.cpp: -------------------------------------------------------------------------------- 1 | // modify from 2 | // https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h 3 | #include 4 | 5 | at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits, 6 | const at::Tensor &targets, 7 | const int num_classes, 8 | const float gamma, const float alpha); 9 | 10 | at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits, 11 | const at::Tensor &targets, 
12 | const at::Tensor &d_losses, 13 | const int num_classes, 14 | const float gamma, const float alpha); 15 | 16 | // Interface for Python 17 | at::Tensor SigmoidFocalLoss_forward(const at::Tensor &logits, 18 | const at::Tensor &targets, 19 | const int num_classes, const float gamma, 20 | const float alpha) { 21 | if (logits.type().is_cuda()) { 22 | return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, 23 | alpha); 24 | } 25 | AT_ERROR("SigmoidFocalLoss is not implemented on the CPU"); 26 | } 27 | 28 | at::Tensor SigmoidFocalLoss_backward(const at::Tensor &logits, 29 | const at::Tensor &targets, 30 | const at::Tensor &d_losses, 31 | const int num_classes, const float gamma, 32 | const float alpha) { 33 | if (logits.type().is_cuda()) { 34 | return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, 35 | num_classes, gamma, alpha); 36 | } 37 | AT_ERROR("SigmoidFocalLoss is not implemented on the CPU"); 38 | } 39 | 40 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 41 | m.def("forward", &SigmoidFocalLoss_forward, 42 | "SigmoidFocalLoss forward (CUDA)"); 43 | m.def("backward", &SigmoidFocalLoss_backward, 44 | "SigmoidFocalLoss backward (CUDA)"); 45 | } 46 | -------------------------------------------------------------------------------- /tools/loss/src/sigmoid_focal_loss_cuda.cu: -------------------------------------------------------------------------------- 1 | // modified from 2 | // https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu 3 | 4 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 5 | // This file is modified from 6 | // https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu 7 | // Cheng-Yang Fu 8 | // cyfu@cs.unc.edu 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | // TODO make it in a common file 19 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 20 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 21 | i += blockDim.x * gridDim.x) 22 | 23 | template 24 | __global__ void SigmoidFocalLossForward(const int nthreads, 25 | const scalar_t *logits, 26 | const int64_t *targets, 27 | const int num_classes, 28 | const float gamma, const float alpha, 29 | const int num, scalar_t *losses) { 30 | CUDA_1D_KERNEL_LOOP(i, nthreads) { 31 | int n = i / num_classes; 32 | int d = i % num_classes; // current class[0~79]; 33 | int t = targets[n]; // target class [1~80]; 34 | 35 | // Decide it is positive or negative case. 36 | scalar_t c1 = (t == (d + 1)); 37 | scalar_t c2 = (t >= 0 & t != (d + 1)); 38 | 39 | scalar_t zn = (1.0 - alpha); 40 | scalar_t zp = (alpha); 41 | 42 | // p = 1. / 1. + expf(-x); p = sigmoid(x) 43 | scalar_t p = 1. / (1. + expf(-logits[i])); 44 | 45 | // (1-p)**gamma * log(p) where 46 | scalar_t term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); 47 | 48 | // p**gamma * log(1-p) 49 | scalar_t term2 = 50 | powf(p, gamma) * 51 | (-1. * logits[i] * (logits[i] >= 0) - 52 | logf(1. + expf(logits[i] - 2. 
* logits[i] * (logits[i] >= 0)))); 53 | 54 | losses[i] = 0.0; 55 | losses[i] += -c1 * term1 * zp; 56 | losses[i] += -c2 * term2 * zn; 57 | 58 | } // CUDA_1D_KERNEL_LOOP 59 | } // SigmoidFocalLossForward 60 | 61 | template 62 | __global__ void SigmoidFocalLossBackward( 63 | const int nthreads, const scalar_t *logits, const int64_t *targets, 64 | const scalar_t *d_losses, const int num_classes, const float gamma, 65 | const float alpha, const int num, scalar_t *d_logits) { 66 | CUDA_1D_KERNEL_LOOP(i, nthreads) { 67 | int n = i / num_classes; 68 | int d = i % num_classes; // current class[0~79]; 69 | int t = targets[n]; // target class [1~80], 0 is background; 70 | 71 | // Decide it is positive or negative case. 72 | scalar_t c1 = (t == (d + 1)); 73 | scalar_t c2 = (t >= 0 & t != (d + 1)); 74 | 75 | scalar_t zn = (1.0 - alpha); 76 | scalar_t zp = (alpha); 77 | // p = 1. / 1. + expf(-x); p = sigmoid(x) 78 | scalar_t p = 1. / (1. + expf(-logits[i])); 79 | 80 | // (1-p)**g * (1 - p - g*p*log(p) 81 | scalar_t term1 = 82 | powf((1. - p), gamma) * (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); 83 | 84 | // (p**g) * (g*(1-p)*log(1-p) - p) 85 | scalar_t term2 = 86 | powf(p, gamma) * 87 | ((-1. * logits[i] * (logits[i] >= 0) - 88 | logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * 89 | (1. - p) * gamma - 90 | p); 91 | d_logits[i] = 0.0; 92 | d_logits[i] += -c1 * term1 * zp; 93 | d_logits[i] += -c2 * term2 * zn; 94 | d_logits[i] = d_logits[i] * d_losses[i]; 95 | 96 | } // CUDA_1D_KERNEL_LOOP 97 | } // SigmoidFocalLossBackward 98 | 99 | at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits, 100 | const at::Tensor &targets, 101 | const int num_classes, 102 | const float gamma, const float alpha) { 103 | AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); 104 | AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); 105 | AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); 106 | 107 | const int num_samples = logits.size(0); 108 | 109 | auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); 110 | auto losses_size = num_samples * logits.size(1); 111 | 112 | dim3 grid( 113 | std::min(THCCeilDiv((int64_t)losses_size, (int64_t)512), (int64_t)4096)); 114 | dim3 block(512); 115 | 116 | if (losses.numel() == 0) { 117 | THCudaCheck(cudaGetLastError()); 118 | return losses; 119 | } 120 | 121 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( 122 | logits.scalar_type(), "SigmoidFocalLoss_forward", [&] { 123 | SigmoidFocalLossForward<<>>( 124 | losses_size, logits.contiguous().data(), 125 | targets.contiguous().data(), num_classes, gamma, alpha, 126 | num_samples, losses.data()); 127 | }); 128 | THCudaCheck(cudaGetLastError()); 129 | return losses; 130 | } 131 | 132 | at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits, 133 | const at::Tensor &targets, 134 | const at::Tensor &d_losses, 135 | const int num_classes, 136 | const float gamma, 137 | const float alpha) { 138 | AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); 139 | AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); 140 | AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); 141 | 142 | AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); 143 | 144 | const int num_samples = logits.size(0); 145 | AT_ASSERTM(logits.size(1) == num_classes, 146 | "logits.size(1) should be num_classes"); 147 | 148 | auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); 149 | auto d_logits_size = num_samples * 
logits.size(1); 150 | 151 | dim3 grid(std::min(THCCeilDiv((int64_t)d_logits_size, (int64_t)512), 152 | (int64_t)4096)); 153 | dim3 block(512); 154 | 155 | if (d_logits.numel() == 0) { 156 | THCudaCheck(cudaGetLastError()); 157 | return d_logits; 158 | } 159 | 160 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( 161 | logits.scalar_type(), "SigmoidFocalLoss_backward", [&] { 162 | SigmoidFocalLossBackward<<>>( 163 | d_logits_size, logits.contiguous().data(), 164 | targets.contiguous().data(), 165 | d_losses.contiguous().data(), num_classes, gamma, alpha, 166 | num_samples, d_logits.data()); 167 | }); 168 | 169 | THCudaCheck(cudaGetLastError()); 170 | return d_logits; 171 | } 172 | -------------------------------------------------------------------------------- /tools/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Misc functions, including distributed helpers. 3 | 4 | Mostly copy-paste from torchvision references. 5 | """ 6 | from typing import List, Optional 7 | 8 | import torch 9 | import torch.distributed as dist 10 | import torchvision 11 | from torch import Tensor 12 | 13 | 14 | def _max_by_axis(the_list): 15 | maxes = the_list[0] 16 | for sublist in the_list[1:]: 17 | for index, item in enumerate(sublist): 18 | maxes[index] = max(maxes[index], item) 19 | return maxes 20 | 21 | 22 | class NestedTensor(object): 23 | def __init__(self, tensors, mask: Optional[Tensor]): 24 | self.tensors = tensors 25 | self.mask = mask 26 | 27 | def to(self, device): 28 | cast_tensor = self.tensors.to(device) 29 | mask = self.mask 30 | if mask is not None: 31 | assert mask is not None 32 | cast_mask = mask.to(device) 33 | else: 34 | cast_mask = None 35 | return NestedTensor(cast_tensor, cast_mask) 36 | 37 | def decompose(self): 38 | return self.tensors, self.mask 39 | 40 | def __repr__(self): 41 | return str(self.tensors) 42 | 43 | 44 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 45 | if tensor_list[0].ndim == 3: 46 | if torchvision._is_tracing(): 47 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 48 | 49 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 50 | batch_shape = [len(tensor_list)] + max_size 51 | b, c, h, w = batch_shape 52 | dtype = tensor_list[0].dtype 53 | device = tensor_list[0].device 54 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 55 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 56 | for img, pad_img, m in zip(tensor_list, tensor, mask): 57 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 58 | m[: img.shape[1], : img.shape[2]] = False 59 | else: 60 | raise ValueError("not supported") 61 | return NestedTensor(tensor, mask) 62 | 63 | 64 | @torch.jit.unused 65 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 66 | max_size = [] 67 | for i in range(tensor_list[0].dim()): 68 | max_size_i = torch.max( 69 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 70 | ).to(torch.int64) 71 | max_size.append(max_size_i) 72 | max_size = tuple(max_size) 73 | 74 | padded_imgs = [] 75 | padded_masks = [] 76 | for img in tensor_list: 77 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 78 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 79 | padded_imgs.append(padded_img) 80 | 81 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 82 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 83 | 
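        # The padded region of the mask is filled with 1 (True after the bool cast
        # below), matching the NestedTensor convention that True marks padding pixels.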
padded_masks.append(padded_mask.to(torch.bool)) 84 | 85 | tensor = torch.stack(padded_imgs) 86 | mask = torch.stack(padded_masks) 87 | 88 | return NestedTensor(tensor, mask=mask) 89 | 90 | 91 | def is_dist_avail_and_initialized(): 92 | if not dist.is_available(): 93 | return False 94 | if not dist.is_initialized(): 95 | return False 96 | return True 97 | -------------------------------------------------------------------------------- /tools/nms.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: NMS tools 6 | 7 | example: 8 | 9 | ''' 10 | 11 | import numpy as np 12 | import torch.nn as nn 13 | import torch 14 | import torch.nn.functional as F 15 | 16 | def nms(boxes, scores, nms_thr): 17 | x1 = boxes[:, 0] 18 | y1 = boxes[:, 1] 19 | x2 = boxes[:, 2] 20 | y2 = boxes[:, 3] 21 | 22 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 23 | order = scores.argsort()[::-1] 24 | 25 | keep = [] 26 | while order.size > 0: 27 | i = order[0] 28 | keep.append(i) 29 | xx1 = np.maximum(x1[i], x1[order[1:]]) 30 | yy1 = np.maximum(y1[i], y1[order[1:]]) 31 | xx2 = np.minimum(x2[i], x2[order[1:]]) 32 | yy2 = np.minimum(y2[i], y2[order[1:]]) 33 | 34 | w = np.maximum(0.0, xx2 - xx1 + 1) 35 | h = np.maximum(0.0, yy2 - yy1 + 1) 36 | inter = w * h 37 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 38 | 39 | inds = np.where(ovr <= nms_thr)[0] 40 | order = order[inds + 1] 41 | 42 | return keep 43 | 44 | def multiclass_nms(boxes, scores, nms_thr, score_thr): 45 | final_dets = [] 46 | num_classes = scores.shape[1] 47 | for cls_ind in range(num_classes): 48 | cls_scores = scores[:, cls_ind] 49 | valid_score_mask = cls_scores > score_thr 50 | if valid_score_mask.sum() == 0: 51 | continue 52 | else: 53 | valid_scores = cls_scores[valid_score_mask] 54 | valid_boxes = boxes[valid_score_mask] 55 | keep = nms(valid_boxes, valid_scores, nms_thr) 56 | if len(keep) > 0: 57 | cls_inds = np.ones((len(keep), 1)) * cls_ind 58 | dets = np.concatenate( 59 | [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1 60 | ) 61 | final_dets.append(dets) 62 | if len(final_dets) == 0: 63 | return None 64 | return np.concatenate(final_dets, 0) 65 | 66 | def points_nms(heat, kernel=2): 67 | hmax = nn.functional.max_pool2d( 68 | heat, (kernel, kernel), stride=1, padding=1) 69 | keep = (hmax[:, :, :-1, :-1] == heat).float() 70 | return heat * keep 71 | 72 | def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', sigma=2.0, sum_masks=None): 73 | """Matrix NMS for multi-class masks. 
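    Instead of hard suppression, each mask's score is multiplied by a decay
    coefficient computed from its IoU with higher-scoring masks of the same
    class: exp(-sigma * iou^2) for the 'gaussian' kernel and
    (1 - iou) / (1 - compensate_iou) for the 'linear' kernel.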
74 | 75 | Args: 76 | seg_masks (Tensor): shape (n, h, w) 77 | cate_labels (Tensor): shape (n), mask labels in descending order 78 | cate_scores (Tensor): shape (n), mask scores in descending order 79 | kernel (str): 'linear' or 'gauss' 80 | sigma (float): std in gaussian method 81 | sum_masks (Tensor): The sum of seg_masks 82 | 83 | Returns: 84 | Tensor: cate_scores_update, tensors of shape (n) 85 | """ 86 | n_samples = len(cate_labels) 87 | if n_samples == 0: 88 | return [] 89 | if sum_masks is None: 90 | sum_masks = seg_masks.sum((1, 2)).float() 91 | seg_masks = seg_masks.reshape(n_samples, -1).float() 92 | inter_matrix = torch.mm(seg_masks, seg_masks.transpose(1, 0)) 93 | sum_masks_x = sum_masks.expand(n_samples, n_samples) 94 | iou_matrix = (inter_matrix / (sum_masks_x + sum_masks_x.transpose(1, 0) - inter_matrix)).triu(diagonal=1) 95 | cate_labels_x = cate_labels.expand(n_samples, n_samples) 96 | label_matrix = (cate_labels_x == cate_labels_x.transpose(1, 0)).float().triu(diagonal=1) 97 | 98 | compensate_iou, _ = (iou_matrix * label_matrix).max(0) 99 | compensate_iou = compensate_iou.expand(n_samples, n_samples).transpose(1, 0) 100 | 101 | decay_iou = iou_matrix * label_matrix 102 | 103 | if kernel == 'gaussian': 104 | decay_matrix = torch.exp(-1 * sigma * (decay_iou ** 2)) 105 | compensate_matrix = torch.exp(-1 * sigma * (compensate_iou ** 2)) 106 | decay_coefficient, _ = (decay_matrix / compensate_matrix).min(0) 107 | elif kernel == 'linear': 108 | decay_matrix = (1-decay_iou)/(1-compensate_iou) 109 | decay_coefficient, _ = decay_matrix.min(0) 110 | else: 111 | raise NotImplementedError 112 | 113 | cate_scores_update = cate_scores * decay_coefficient 114 | return cate_scores_update -------------------------------------------------------------------------------- /tools/nninit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | 6 | import warnings 7 | 8 | 9 | def constant_init(module, val, bias=0): 10 | if hasattr(module, 'weight') and module.weight is not None: 11 | nn.init.constant_(module.weight, val) 12 | if hasattr(module, 'bias') and module.bias is not None: 13 | nn.init.constant_(module.bias, bias) 14 | 15 | 16 | def xavier_init(module, gain=1, bias=0, distribution='normal'): 17 | assert distribution in ['uniform', 'normal'] 18 | if distribution == 'uniform': 19 | nn.init.xavier_uniform_(module.weight, gain=gain) 20 | else: 21 | nn.init.xavier_normal_(module.weight, gain=gain) 22 | if hasattr(module, 'bias') and module.bias is not None: 23 | nn.init.constant_(module.bias, bias) 24 | 25 | 26 | def normal_init(module, mean=0, std=1, bias=0): 27 | nn.init.normal_(module.weight, mean, std) 28 | if hasattr(module, 'bias') and module.bias is not None: 29 | nn.init.constant_(module.bias, bias) 30 | 31 | 32 | def uniform_init(module, a=0, b=1, bias=0): 33 | nn.init.uniform_(module.weight, a, b) 34 | if hasattr(module, 'bias') and module.bias is not None: 35 | nn.init.constant_(module.bias, bias) 36 | 37 | 38 | def kaiming_init(module, 39 | a=0, 40 | mode='fan_out', 41 | nonlinearity='relu', 42 | bias=0, 43 | distribution='normal'): 44 | assert distribution in ['uniform', 'normal'] 45 | if distribution == 'uniform': 46 | nn.init.kaiming_uniform_( 47 | module.weight, a=a, mode=mode, nonlinearity=nonlinearity) 48 | else: 49 | nn.init.kaiming_normal_( 50 | module.weight, a=a, mode=mode, nonlinearity=nonlinearity) 51 | if hasattr(module, 'bias') and module.bias is not None: 
52 | nn.init.constant_(module.bias, bias) 53 | 54 | 55 | def bias_init_with_prob(prior_prob): 56 | """initialize conv/fc bias value according to giving probablity.""" 57 | bias_init = float(-np.log((1 - prior_prob) / prior_prob)) 58 | return bias_init 59 | 60 | def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): 61 | def norm_cdf(x): 62 | return (1. + math.erf(x / math.sqrt(2.))) / 2. 63 | 64 | if (mean < a - 2 * std) or (mean > b + 2 * std): 65 | warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " 66 | "The distribution of values may be incorrect.", stacklevel=2) 67 | with torch.no_grad(): 68 | l = norm_cdf((a - mean) / std) 69 | u = norm_cdf((b - mean) / std) 70 | tensor.uniform_(2 * l - 1, 2 * u - 1) 71 | tensor.erfinv_() 72 | tensor.mul_(std * math.sqrt(2.)) 73 | tensor.add_(mean) 74 | tensor.clamp_(min=a, max=b) 75 | return tensor 76 | 77 | def common_init(m): 78 | if isinstance(m, (nn.Conv2d, nn.Conv1d)): 79 | kaiming_init(m) 80 | elif isinstance(m, nn.Linear): 81 | trunc_normal_(m.weight, std=.02) 82 | if m.bias is not None: 83 | nn.init.constant_(m.bias, 0) 84 | elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.GroupNorm, nn.LayerNorm)): 85 | nn.init.constant_(m.weight, 1) 86 | nn.init.constant_(m.bias, 0) 87 | -------------------------------------------------------------------------------- /train_ddp.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: PPT训练框架入口 6 | 7 | example: 8 | 9 | ''' 10 | 11 | 12 | import torch 13 | import torch.backends.cudnn as cudnn 14 | import torch.distributed as dist 15 | import torch.multiprocessing as mp 16 | from torch.nn.parallel import DistributedDataParallel as DDP 17 | 18 | 19 | import argparse 20 | import numpy as np 21 | import random 22 | 23 | import warnings 24 | import yaml 25 | 26 | from data.dataloader import Data_loader 27 | from trainer_ddp import Trainer 28 | from model.model_factory import Classify_Model, DB_Model, Segmentation_Model, Yolox_Model, \ 29 | Crnn_Model, Solo_Model, ICTransformer, ReBiSeNet_Model, MaskFormer_Model 30 | from utils.common import find_free_port 31 | 32 | MODEL_SELECT = {'Classify': Classify_Model, 'DB': DB_Model, 'Seg': Segmentation_Model, 'YOLOX':Yolox_Model, 33 | 'CRNN':Crnn_Model, 'SOLO':Solo_Model, 'ICT': ICTransformer, 'ReBiSe': ReBiSeNet_Model, 'MaskFormer': MaskFormer_Model} 34 | 35 | def arg_parser(): 36 | parser = argparse.ArgumentParser("train parser") 37 | parser.add_argument( 38 | "-e", "--eval_interval", type=int, default=1, help="eval interval" 39 | ) 40 | parser.add_argument( 41 | "-s", "--save_interval", type=int, default=1, help="save interval" 42 | ) 43 | parser.add_argument( 44 | "-v", "--visual_batch_interval", type=int, default=10, help="save interval" 45 | ) 46 | parser.add_argument( 47 | "-ste", "--start_eval", type=int, default=0, help="save interval" 48 | ) 49 | parser.add_argument( 50 | "-se", "--seed", type=int, default=None, help="random seed" 51 | ) 52 | parser.add_argument( 53 | "--local_rank", default=0, type=int, help="GPU device for training" 54 | ) 55 | parser.add_argument( 56 | "--nprocs", default=1, type=int, help="GPU device for training" 57 | ) 58 | parser.add_argument( 59 | "--syncBN", default=False, action="store_true", help="syncBN" 60 | ) 61 | parser.add_argument( 62 | "-c", "--ckpt", default=None, type=str, help="checkpoint file" 63 | ) 64 | parser.add_argument( 65 | "--resume", 
default=False, action="store_true", help="resume training" 66 | ) 67 | parser.add_argument( 68 | "-pre", "--pretrained", default=None, type=str, help="pretrained file" 69 | ) 70 | parser.add_argument( 71 | "-f", 72 | "--exp_file", 73 | default='./config/Config.yaml', 74 | type=str, 75 | help="training description file", 76 | ) 77 | parser.add_argument( 78 | "-o", 79 | "--output_dir", 80 | default='./checkpoints', 81 | type=str, 82 | help="save dir", 83 | ) 84 | parser.add_argument( 85 | "--fp16", 86 | dest="fp16", 87 | default=False, 88 | action="store_true", 89 | help="Adopting mix precision training.", 90 | ) 91 | 92 | return parser 93 | 94 | def init_seeds(seed=0, cuda_deterministic=True): 95 | random.seed(seed) 96 | np.random.seed(seed) 97 | torch.manual_seed(seed) 98 | if cuda_deterministic: 99 | cudnn.deterministic = True 100 | cudnn.benchmark = False 101 | else: 102 | cudnn.deterministic = False 103 | cudnn.benchmark = True 104 | 105 | def main(): 106 | args = arg_parser().parse_args() 107 | args.nprocs = torch.cuda.device_count() 108 | 109 | args.distributed = True if args.nprocs > 1 else False 110 | args.dis_backend = 'nccl' 111 | 112 | dist_url = "tcp://127.0.0.1" 113 | port = find_free_port() 114 | args.dist_url = "{}:{}".format(dist_url, str(port) ) 115 | 116 | with open(args.exp_file, mode='r') as fr: 117 | cfg = yaml.load(fr, Loader=yaml.FullLoader) 118 | 119 | if args.distributed: 120 | mp.spawn(main_worker, nprocs=args.nprocs, args=(args.nprocs, args, cfg)) 121 | else: 122 | main_worker(args.local_rank, args.nprocs, args, cfg) 123 | 124 | def main_worker(local_rank,nprocs, args, cfg): 125 | assert ( torch.cuda.is_available()), "cuda is not available. Please check your installation." 126 | args.rank = local_rank 127 | cfg['distributed'] = args.distributed 128 | init_seeds(local_rank+1) 129 | 130 | cudnn.benchmark = True 131 | if args.distributed: 132 | dist.init_process_group(backend=args.dis_backend, 133 | init_method=args.dist_url, 134 | world_size=nprocs, 135 | rank=local_rank) 136 | 137 | Model = MODEL_SELECT[cfg['experiment_name']](config=cfg, amp_training=args.fp16) 138 | DATA_Loader = Data_loader(config=cfg, args=args) 139 | 140 | trainer = Trainer(cfg, args, Model, DATA_Loader, step_update=True) 141 | trainer.train() 142 | 143 | if __name__ == '__main__': 144 | main() -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsr12345/Pytorch-Devkit/97f4ca88b9dbecc43cd97be2537ddc601715cbc9/utils/__init__.py -------------------------------------------------------------------------------- /utils/common.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: PPT common functions 6 | 7 | example: 8 | 9 | ''' 10 | # coding: utf-8 11 | 12 | import os 13 | from functools import partial 14 | 15 | import torch 16 | import torch.distributed as dist 17 | 18 | from loguru import logger 19 | 20 | # # distribute function and config ## 21 | # _LOCAL_PROCESS_GROUP = None 22 | 23 | def synchronize(): 24 | """ 25 | Helper function to synchronize (barrier) among all processes when using distributed training 26 | """ 27 | if not dist.is_available(): 28 | return 29 | if not dist.is_initialized(): 30 | return 31 | world_size = dist.get_world_size() 32 | if world_size == 1: 33 | return 34 | 
dist.barrier() 35 | 36 | def find_free_port(): 37 | """ 38 | Find an available port of current machine / node. 39 | """ 40 | import socket 41 | 42 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 43 | # Binding to port 0 will cause the OS to find an available port for us 44 | sock.bind(("", 0)) 45 | port = sock.getsockname()[1] 46 | sock.close() 47 | # NOTE: there is still a chance the port could be taken by other processes. 48 | return port 49 | 50 | def get_rank(): 51 | if not dist.is_available(): 52 | return 0 53 | if not dist.is_initialized(): 54 | return 0 55 | return dist.get_rank() 56 | 57 | def reduce_mean(tensor, nprocs): 58 | rt = tensor.clone() 59 | dist.all_reduce(rt, op=dist.ReduceOp.SUM) 60 | rt /= nprocs 61 | return rt 62 | 63 | def reduce_sum(tensor): 64 | rt = tensor.clone() 65 | dist.all_reduce(rt, op=dist.ReduceOp.SUM) 66 | return rt 67 | 68 | def remove_file(file_dir, key_words=''): 69 | assert key_words != '' 70 | for fn in os.listdir(file_dir): 71 | if key_words in fn: 72 | os.remove(os.path.join(file_dir, fn)) 73 | return True 74 | else: 75 | return False 76 | 77 | def prepare_device(local_rank, local_world_size, distributed=False): 78 | ''' 79 | setup GPU device if available, move model into configured device 80 | :param local_rank: 81 | :param local_world_size: 82 | :return: 83 | ''' 84 | if distributed: 85 | ngpu_per_process = torch.cuda.device_count() // local_world_size 86 | device_ids = list(range(local_rank * ngpu_per_process, (local_rank + 1) * ngpu_per_process)) 87 | 88 | if torch.cuda.is_available() and local_rank != -1: 89 | torch.cuda.set_device(device_ids[0]) # device_ids[0] =local_rank if local_world_size = n_gpu per node 90 | device = 'cuda' 91 | else: 92 | device = 'cpu' 93 | device = torch.device(device) 94 | return device, device_ids 95 | else: 96 | n_gpu = torch.cuda.device_count() 97 | n_gpu_use = local_world_size 98 | if n_gpu_use > 0 and n_gpu == 0: 99 | n_gpu_use = 0 100 | if n_gpu_use > n_gpu: 101 | n_gpu_use = n_gpu 102 | 103 | list_ids = list(range(n_gpu_use)) 104 | if n_gpu_use > 0: 105 | torch.cuda.set_device(list_ids[0]) # only use first available gpu as devices 106 | device = 'cuda' 107 | else: 108 | device = 'cpu' 109 | device = torch.device(device) 110 | return device, list_ids 111 | 112 | def multi_apply(func, *args, **kwargs): 113 | """Apply function to a list of arguments. 114 | 115 | Note: 116 | This function applies the ``func`` to multiple inputs and 117 | map the multiple outputs of the ``func`` into different 118 | list. Each list contains the same type of outputs corresponding 119 | to different inputs. 
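        For example, if ``func`` returns a 2-tuple, ``multi_apply(func, [a, b], [c, d])``
        returns ``([func(a, c)[0], func(b, d)[0]], [func(a, c)[1], func(b, d)[1]])``.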
120 | 121 | Args: 122 | func (Function): A function that will be applied to a list of 123 | arguments 124 | 125 | Returns: 126 | tuple(list): A tuple containing multiple list, each list contains 127 | a kind of returned results by the function 128 | """ 129 | pfunc = partial(func, **kwargs) if kwargs else func 130 | map_results = map(pfunc, *args) 131 | return tuple(map(list, zip(*map_results))) 132 | 133 | # def togpu(data, requires_grad=True): 134 | # """ 135 | # Transfer tensor in `data` to gpu recursively 136 | # `data` can be dict, list or tuple 137 | # """ 138 | # if isinstance(data, list) or isinstance(data, tuple): 139 | # data = [togpu(x) for x in data] 140 | # elif isinstance(data, dict): 141 | # data = {key:togpu(_data) for key,_data in data.items()} 142 | # elif isinstance(data, torch.Tensor): 143 | # data = data.contiguous().cuda(non_blocking=True).requires_grad = requires_grad 144 | # return data 145 | 146 | # def togpu(data): 147 | # """ 148 | # Transfer tensor in `data` to gpu recursively 149 | # `data` can be dict, list or tuple 150 | # """ 151 | # if isinstance(data, list) or isinstance(data, tuple): 152 | # data = [togpu(x) for x in data] 153 | # elif isinstance(data, dict): 154 | # data = {key:togpu(_data) for key,_data in data.items()} 155 | # # else: 156 | # # data = torch.tensor(data) 157 | # else: 158 | # if not torch.is_tensor(data): 159 | # data = torch.tensor(data) 160 | # data = data.contiguous().cuda(non_blocking=True) 161 | # return data 162 | 163 | # def tolong(data): 164 | # if isinstance(data, dict): 165 | # for key in data.keys(): 166 | # data[key] = tolong(data[key]) 167 | # if isinstance(data, list) or isinstance(data, tuple): 168 | # data = [tolong(x) for x in data] 169 | # if torch.is_tensor(data) and data.dtype == torch.int16: 170 | # data = data.long() 171 | # return data 172 | ############################### -------------------------------------------------------------------------------- /utils/standard_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | """ 5 | def recursiveToTensor(data): 6 | if isinstance(data, dict): 7 | for key in data.keys(): 8 | data[key] = recursiveToTensor(data[key]) 9 | if isinstance(data, list) or isinstance(data, tuple): 10 | data = [recursiveToTensor(x) for x in data] 11 | if isinstance(data, np.ndarray): 12 | data = torch.from_numpy(data) 13 | if isinstance(data, bool): 14 | data = torch.tensor(data) 15 | return data 16 | """ 17 | 18 | def recursiveToTensor(data): 19 | """Recursively transform numpy.ndarray to torch.Tensor. 
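    Note that ndarrays are converted with torch.from_numpy(...).float(), so the
    resulting tensors are always float32 regardless of the source dtype.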
20 | """ 21 | if isinstance(data, dict): 22 | for key in data.keys(): 23 | data[key] = recursiveToTensor(data[key]) 24 | elif isinstance(data, list) or isinstance(data, tuple): 25 | data = [recursiveToTensor(x) for x in data] 26 | # data = torch.tensor(data) 27 | elif isinstance(data, np.ndarray): 28 | """Pytorch now has bool type.""" 29 | data = torch.from_numpy(data).float() 30 | # if isinstance(data, bool): 31 | # data = torch.tensor(data) 32 | elif torch.is_tensor(data): 33 | return data 34 | # else: 35 | # data = torch.tensor(data) 36 | return data 37 | 38 | def togpu(data): 39 | """ 40 | Transfer tensor in `data` to gpu recursively 41 | `data` can be dict, list or tuple 42 | """ 43 | if isinstance(data, list) or isinstance(data, tuple): 44 | data = [togpu(x) for x in data] 45 | elif isinstance(data, dict): 46 | data = {key:togpu(_data) for key,_data in data.items()} 47 | # else: 48 | # data = torch.tensor(data) 49 | else: 50 | if not torch.is_tensor(data): 51 | data = torch.tensor(data) 52 | data = data.contiguous().cuda(non_blocking=True) 53 | return data 54 | 55 | def tolong(data): 56 | if isinstance(data, dict): 57 | for key in data.keys(): 58 | data[key] = tolong(data[key]) 59 | if isinstance(data, list) or isinstance(data, tuple): 60 | data = [tolong(x) for x in data] 61 | if torch.is_tensor(data) and data.dtype == torch.int16: 62 | data = data.long() 63 | return data 64 | 65 | def recursiveToNumpy(data): 66 | """Recursively transform numpy.ndarray to torch.Tensor. 67 | """ 68 | if isinstance(data, dict): 69 | for key in data.keys(): 70 | data[key] = recursiveToTensor(data[key]) 71 | if isinstance(data, list) or isinstance(data, tuple): 72 | data = np.array(data) 73 | return data -------------------------------------------------------------------------------- /utils/visualize.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Author: Shaoran Lu 3 | # Date: 2021/10/04 4 | # Email: lushaoran92@gmail.com 5 | # Description: 6 | 7 | example: 8 | 9 | ''' 10 | # coding: utf-8 11 | 12 | import cv2 13 | import numpy as np 14 | from scipy import ndimage 15 | from data.coco.coco_classes import COCO_CLASSES, COCO_LABEL, COCO_LABEL_MAP 16 | 17 | def box_vis(img, cfg, boxes, scores, cls_ids): 18 | class_names = cfg.get('class_names', None) 19 | conf = cfg.get('test_conf', 0.1) 20 | 21 | if class_names is None: 22 | class_names = COCO_CLASSES 23 | 24 | if boxes is None: 25 | return img 26 | 27 | for i in range(len(boxes)): 28 | box = boxes[i] 29 | cls_id = int(cls_ids[i]) 30 | score = scores[i] 31 | if score < conf: 32 | continue 33 | x0 = int(box[0]) 34 | y0 = int(box[1]) 35 | x1 = int(box[2]) 36 | y1 = int(box[3]) 37 | 38 | color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() 39 | text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100) 40 | txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) 41 | font = cv2.FONT_HERSHEY_SIMPLEX 42 | 43 | txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] 44 | cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) 45 | 46 | txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() 47 | cv2.rectangle( 48 | img, 49 | (x0, y0 + 1), 50 | (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])), 51 | txt_bk_color, 52 | -1 53 | ) 54 | cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) 55 | 56 | return img 57 | 58 | 59 | def instance_vis(img, cfg, seg_label, cate_label, score, sort_by_density=False): 60 | score_thr = cfg.get('test_score_thr', 0.1) 61 | 
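    # Keep only masks scoring above score_thr, optionally sort them by pixel area,
    # then alpha-blend each mask onto the image with a random color and draw its
    # class|score label at the mask's center of mass.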
img_show = img.copy() 62 | h, w, _ = img.shape 63 | seg_label = seg_label[:, :h, :w] 64 | 65 | seg_label = seg_label.cpu().numpy().astype(np.uint8) 66 | # print(seg_label.sum()) 67 | cate_label = cate_label.cpu().numpy() 68 | score = score.cpu().numpy() 69 | 70 | vis_inds = score > score_thr 71 | seg_label = seg_label[vis_inds] 72 | num_mask = seg_label.shape[0] 73 | cate_label = cate_label[vis_inds] 74 | cate_score = score[vis_inds] 75 | 76 | if sort_by_density: 77 | mask_density = [] 78 | for idx in range(num_mask): 79 | cur_mask = seg_label[idx, :, :] 80 | cur_mask = cv2.resize(cur_mask, (w, h), interpolation= cv2.INTER_LINEAR) 81 | cur_mask = (cur_mask > 0.5).astype(np.int32) 82 | mask_density.append(cur_mask.sum()) 83 | orders = np.argsort(mask_density) 84 | seg_label = seg_label[orders] 85 | cate_label = cate_label[orders] 86 | cate_score = cate_score[orders] 87 | 88 | np.random.seed(42) 89 | color_masks = [ 90 | np.random.randint(0, 256, (1, 3), dtype=np.uint8) 91 | for _ in range(num_mask) 92 | ] 93 | #img_show = None 94 | font = cv2.FONT_HERSHEY_SIMPLEX 95 | for idx in range(num_mask): 96 | idx = -(idx+1) 97 | cur_mask = seg_label[idx, :, :] 98 | # cur_mask = cv2.resize(cur_mask, (w, h), interpolation= cv2.INTER_LINEAR) 99 | cur_mask = (cur_mask > 0.5).astype(np.uint8) 100 | if cur_mask.sum() == 0: 101 | # print('*') 102 | continue 103 | color_mask = color_masks[idx] 104 | cur_mask_bool = cur_mask.astype(np.bool) 105 | img_show[cur_mask_bool] = img[cur_mask_bool] * 0.5 + color_mask * 0.5 106 | 107 | #当前实例的类别 108 | cur_cate = cate_label[idx] # 1-80 109 | realclass = COCO_LABEL[cur_cate] 110 | # realclass = COCO_LABEL[cur_cate-1] 111 | cur_score = cate_score[idx] 112 | 113 | name_idx = COCO_LABEL_MAP[realclass] 114 | label_text = COCO_CLASSES[name_idx-1] 115 | # label_text = COCO_CLASSES[name_idx] 116 | label_text += '|{:.02f}'.format(cur_score) 117 | center_y, center_x = ndimage.measurements.center_of_mass(cur_mask) 118 | vis_pos = (max(int(center_x) - 10, 0), int(center_y)) 119 | cv2.putText(img_show, label_text, vis_pos, font, 0.4, (255, 255, 255)) # green 120 | 121 | return img_show 122 | 123 | _COLORS = np.array( 124 | [ 125 | 0.000, 0.447, 0.741, 126 | 0.850, 0.325, 0.098, 127 | 0.929, 0.694, 0.125, 128 | 0.494, 0.184, 0.556, 129 | 0.466, 0.674, 0.188, 130 | 0.301, 0.745, 0.933, 131 | 0.635, 0.078, 0.184, 132 | 0.300, 0.300, 0.300, 133 | 0.600, 0.600, 0.600, 134 | 1.000, 0.000, 0.000, 135 | 1.000, 0.500, 0.000, 136 | 0.749, 0.749, 0.000, 137 | 0.000, 1.000, 0.000, 138 | 0.000, 0.000, 1.000, 139 | 0.667, 0.000, 1.000, 140 | 0.333, 0.333, 0.000, 141 | 0.333, 0.667, 0.000, 142 | 0.333, 1.000, 0.000, 143 | 0.667, 0.333, 0.000, 144 | 0.667, 0.667, 0.000, 145 | 0.667, 1.000, 0.000, 146 | 1.000, 0.333, 0.000, 147 | 1.000, 0.667, 0.000, 148 | 1.000, 1.000, 0.000, 149 | 0.000, 0.333, 0.500, 150 | 0.000, 0.667, 0.500, 151 | 0.000, 1.000, 0.500, 152 | 0.333, 0.000, 0.500, 153 | 0.333, 0.333, 0.500, 154 | 0.333, 0.667, 0.500, 155 | 0.333, 1.000, 0.500, 156 | 0.667, 0.000, 0.500, 157 | 0.667, 0.333, 0.500, 158 | 0.667, 0.667, 0.500, 159 | 0.667, 1.000, 0.500, 160 | 1.000, 0.000, 0.500, 161 | 1.000, 0.333, 0.500, 162 | 1.000, 0.667, 0.500, 163 | 1.000, 1.000, 0.500, 164 | 0.000, 0.333, 1.000, 165 | 0.000, 0.667, 1.000, 166 | 0.000, 1.000, 1.000, 167 | 0.333, 0.000, 1.000, 168 | 0.333, 0.333, 1.000, 169 | 0.333, 0.667, 1.000, 170 | 0.333, 1.000, 1.000, 171 | 0.667, 0.000, 1.000, 172 | 0.667, 0.333, 1.000, 173 | 0.667, 0.667, 1.000, 174 | 0.667, 1.000, 1.000, 175 | 1.000, 0.000, 1.000, 176 | 
1.000, 0.333, 1.000, 177 | 1.000, 0.667, 1.000, 178 | 0.333, 0.000, 0.000, 179 | 0.500, 0.000, 0.000, 180 | 0.667, 0.000, 0.000, 181 | 0.833, 0.000, 0.000, 182 | 1.000, 0.000, 0.000, 183 | 0.000, 0.167, 0.000, 184 | 0.000, 0.333, 0.000, 185 | 0.000, 0.500, 0.000, 186 | 0.000, 0.667, 0.000, 187 | 0.000, 0.833, 0.000, 188 | 0.000, 1.000, 0.000, 189 | 0.000, 0.000, 0.167, 190 | 0.000, 0.000, 0.333, 191 | 0.000, 0.000, 0.500, 192 | 0.000, 0.000, 0.667, 193 | 0.000, 0.000, 0.833, 194 | 0.000, 0.000, 1.000, 195 | 0.000, 0.000, 0.000, 196 | 0.143, 0.143, 0.143, 197 | 0.286, 0.286, 0.286, 198 | 0.429, 0.429, 0.429, 199 | 0.571, 0.571, 0.571, 200 | 0.714, 0.714, 0.714, 201 | 0.857, 0.857, 0.857, 202 | 0.000, 0.447, 0.741, 203 | 0.314, 0.717, 0.741, 204 | 0.50, 0.5, 0 205 | ] 206 | ).astype(np.float32).reshape(-1, 3) 207 | 208 | --------------------------------------------------------------------------------