├── .gitignore ├── LICENSE ├── README.md ├── add_hf.py ├── configs └── OmDet-Turbo_tiny_SWIN_T.yaml ├── docs ├── cvt_grounding_dino-en.md ├── cvt_grounding_dino-zh.md ├── main_results.png ├── speed_compare.jpeg └── turbo_model.jpeg ├── export.py ├── install.md ├── omdet ├── __init__.py ├── inference │ ├── __init__.py │ ├── base_engine.py │ └── det_engine.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── bifpn.py │ │ ├── config.py │ │ ├── convnext.py │ │ ├── dlafpn.py │ │ └── swint.py │ ├── common.py │ ├── language_backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── clip │ │ │ ├── __init__.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ ├── clip.py │ │ │ │ └── model.py │ │ │ └── simple_tokenizer.py │ │ └── word_utils.py │ └── registry.py ├── omdet_v2_turbo │ ├── __init__.py │ ├── block.py │ ├── build_components.py │ ├── config.py │ ├── conv.py │ ├── detector.py │ ├── detr_torch.py │ ├── dn_ops.py │ ├── ela_decoder.py │ ├── ela_encoder.py │ ├── head.py │ ├── infer_model.py │ └── torch_utils.py └── utils │ ├── __init__.py │ ├── analyze_model.py │ ├── box_ops.py │ ├── cache.py │ ├── plots.py │ ├── registry.py │ └── tools.py ├── outputs └── 000000574769.jpg ├── requirements.txt ├── run_demo.py ├── run_wsgi.py └── sample_data ├── 000000574769.jpg └── simsun.ttc /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OmDet-Turbo 2 | 3 |

4 | [Paper 📄] [Model 🗂️] 5 |

6 |

7 | Fast and accurate open-vocabulary end-to-end object detection 8 |

9 | 10 | *** 11 | ## 🗓️ Updates 12 | * 09/26/2024: OmDet-Turbo has been integrated into Transformers version 4.45.0. The code is available [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models/omdet_turbo), and the Hugging Face model is available [here](https://huggingface.co/omlab/omdet-turbo-swin-tiny-hf). 13 | * 07/05/2024: Our new open-source project, [OmAgent: A multimodal agent framework for solving complex tasks](https://github.com/om-ai-lab/OmAgent), is available! OmDet has also been integrated into it as an OVD tool. Feel free to explore our multimodal agent framework. 14 | * 06/24/2024: Added guidance for [converting OmDet-Turbo to ONNX](https://github.com/om-ai-lab/OmDet#:~:text=How%20To%20Export%20ONNX%20Model). 15 | * 03/25/2024: Inference code and a pretrained OmDet-Turbo-Tiny model released. 16 | * 03/12/2024: GitHub open-source project created. 17 | 18 | *** 19 | ## 🔗 Related Works 20 | If you are interested in our research, we welcome you to explore our other projects: 21 | 22 | 🔆 [How to Evaluate the Generalization of Detection? A Benchmark for Comprehensive Open-Vocabulary Detection](https://arxiv.org/abs/2308.13177) (AAAI24)  🏠 [Github Repository](https://github.com/om-ai-lab/OVDEval/tree/main) 23 | 24 | 🔆 [OmDet: Large-scale vision-language multi-dataset pre-training with multimodal detection network](https://ietresearch.onlinelibrary.wiley.com/doi/full/10.1049/cvi2.12268) (IET Computer Vision) 25 | 26 | *** 27 | ## 📖 Introduction 28 | This repository is the official PyTorch implementation of **OmDet-Turbo**, a fast transformer-based open-vocabulary object detection model. 29 | 30 | **⭐️ Highlights** 31 | 1. **OmDet-Turbo** is a transformer-based real-time open-vocabulary 32 | detector that combines strong OVD capabilities with fast inference speed. 33 | It addresses the challenge of efficient detection in open-vocabulary 34 | scenarios while maintaining high detection performance. 35 | 2. We introduce the **Efficient Fusion Head**, a swift multimodal fusion module 36 | designed to alleviate the computational burden on the encoder and reduce 37 | the time consumption of the ROI-based head. 38 | 3. The OmDet-Turbo-Base model achieves state-of-the-art zero-shot performance on the ODinW and OVDEval datasets, with AP scores 39 | of **30.1** and **26.86**, respectively. 40 | 4. The inference speed of OmDet-Turbo-Base on the COCO val2017 dataset reaches **100.2** FPS on an A100 GPU. 41 | 42 | For more details, check out our paper **[Real-time Transformer-based Open-Vocabulary Detection with Efficient Fusion Head](https://arxiv.org/abs/2403.06892)**. 43 | ![model_structure](docs/turbo_model.jpeg) 44 | 45 | 46 | *** 47 | ## ⚡️ Inference Speed 48 | Comparison of inference speeds for each component in the tiny-size model. 49 | ![speed](docs/speed_compare.jpeg) 50 | 51 | *** 52 | ## 🛠️ How To Install 53 | Follow the [Installation Instructions](install.md) to set up the environment for OmDet-Turbo. 54 | 55 | *** 56 | ## 🚀 How To Run 57 | ### Local Inference 58 | 1. Download our pretrained model and the [CLIP](https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt?download=true) checkpoints. 59 | 2. Create a folder named **resources** and put the downloaded models into it. 60 | 3. Run **run_demo.py**; the images with predicted results will be saved in the **./outputs** folder (a minimal usage sketch is shown below).
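For reference, here is a minimal sketch of what local inference looks like when calling the `DetEngine` API from `omdet/inference/det_engine.py` directly; the threshold values below are illustrative, and the exact contents of **run_demo.py** may differ (it additionally draws and saves the annotated images to **./outputs**).

```python
from omdet.inference.det_engine import DetEngine

# Build the engine; configs are read from ./configs and checkpoints from ./resources.
engine = DetEngine(model_dir='resources/', device='cuda', batch_size=1)

labels = ["person", "cat", "orange"]              # open-vocabulary classes to detect
prompt = 'Detect {}.'.format(','.join(labels))    # task prompt, "Detect {}." by convention

results = engine.inf_predict(
    'OmDet-Turbo_tiny_SWIN_T',                    # model id: configs/<id>.yaml and resources/<id>.pth
    data=['./sample_data/000000574769.jpg'],      # local image paths
    task=prompt,
    labels=labels,
    src_type='local',                             # 'url' and 'base64' sources are also supported
    conf_threshold=0.3,
    nms_threshold=0.5,
)

# inf_predict returns one list of detections per input image; each detection holds
# the box corners, a confidence score, and the matched label.
for det in results[0]:
    print(det['label'], det['conf'], det['xmin'], det['ymin'], det['xmax'], det['ymax'])
```

Detections below `conf_threshold` are filtered out inside the engine, and the box coordinates are returned as integers.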
61 | ### Run as an API Server 62 | 1. Download our pretrained model and the [CLIP](https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt?download=true) checkpoints. 63 | 2. Create a folder named **resources** and put the downloaded models into it. 64 | 3. Run **run_wsgi.py**; the API server will start at **http://host_ip:8000/inf_predict**, and you can open **http://host_ip:8000/docs** to try it out. 65 | 66 | Language caching is already enabled when running inference with **run_demo.py**. For more details, please check the **run_demo.py** script. 67 | 68 | 69 | *** 70 | ## ⚙️ How To Export ONNX Model 71 | 1. Replace **OmDetV2Turbo** in **OmDet-Turbo_tiny_SWIN_T.yaml** with **OmDetV2TurboInfer**. 72 | 2. Run **export.py**, and **omdet.onnx** will be exported. 73 | 74 | In the above example, post-processing is not included in the ONNX model, and all input sizes are fixed. You can add post-processing and change the input sizes according to your needs. 75 | 76 | 77 | *** 78 | ## 📦 Model Zoo 79 | The performance on COCO and LVIS is evaluated under the zero-shot setting. 80 | 81 | Model | Backbone | Pre-Train Data | COCO | LVIS | FPS (pytorch/trt) | Weight 82 | -- |--------|-----------------| -- | -- |-------------------| -- 83 | OmDet-Turbo-Tiny| Swin-T | O365,GoldG | 42.5 | 30.3 | 21.5/140.0 | [weight](https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/tree/main) 84 | 85 | *** 86 | ## 📝 Main Results 87 | ![main_result](docs/main_results.png) 88 | 89 | *** 90 | ## Citation 91 | Please consider citing our papers if you use our projects: 92 | 93 | ``` 94 | @article{zhao2024real, 95 | title={Real-time Transformer-based Open-Vocabulary Detection with Efficient Fusion Head}, 96 | author={Zhao, Tiancheng and Liu, Peng and He, Xuan and Zhang, Lu and Lee, Kyusong}, 97 | journal={arXiv preprint arXiv:2403.06892}, 98 | year={2024} 99 | } 100 | ``` 101 | 102 | ``` 103 | @article{zhao2024omdet, 104 | title={OmDet: Large-scale vision-language multi-dataset pre-training with multimodal detection network}, 105 | author={Zhao, Tiancheng and Liu, Peng and Lee, Kyusong}, 106 | journal={IET Computer Vision}, 107 | year={2024}, 108 | publisher={Wiley Online Library} 109 | } 110 | ``` 111 | -------------------------------------------------------------------------------- /add_hf.py: -------------------------------------------------------------------------------- 1 | from omdet.inference.det_engine import DetEngine 2 | from omdet.omdet_v2_turbo.detector import OmDetV2Turbo 3 | 4 | 5 | if __name__ == "__main__": 6 | engine = DetEngine(batch_size=1, device='cuda') 7 | img_paths = ['./sample_data/000000574769.jpg'] # path of images 8 | labels = ["person", "cat", "orange"] # labels to be predicted 9 | prompt = 'Detect {}.'.format(','.join(labels)) # prompt of detection task, use "Detect {}."
as default 10 | 11 | model_id = 'OmDet-Turbo_tiny_SWIN_T' 12 | model, cfg = engine._load_model(model_id) 13 | 14 | # push to hub 15 | model.push_to_hub("nielsr/omde-v2-turbo-tiny-swin-tiny") 16 | 17 | # reload 18 | model = OmDetV2Turbo.from_pretrained("nielsr/omde-v2-turbo-tiny-swin-tiny") -------------------------------------------------------------------------------- /configs/OmDet-Turbo_tiny_SWIN_T.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: OmDetV2Turbo 3 | DEPLOY_MODE: true 4 | SWIN: 5 | OUT_FEATURES: 6 | - 1 7 | - 2 8 | - 3 9 | SIZE: T 10 | USE_CHECKPOINT: false 11 | BACKBONE: 12 | NAME: build_swintransformer_backbone 13 | LANGUAGE_BACKBONE: 14 | MODEL_TYPE: "clip" 15 | LANG_DIM: 512 16 | DEVICE: cuda 17 | FUSE_TYPE: merged_attn 18 | TRANSFORMER_DECODER: ELADecoder 19 | TRANSFORMER_ENCODER: ELAEncoder 20 | HEAD: DINOHead 21 | ELAEncoder: 22 | act: gelu 23 | depth_mult: 1.0 24 | dim_feedforward: 2048 25 | encoder_layer: TransformerLayer 26 | eval_size: null 27 | expansion: 1.0 28 | feat_strides: 29 | - 8 30 | - 16 31 | - 32 32 | hidden_dim: 256 33 | in_channels: 34 | - 192 35 | - 384 36 | - 768 37 | num_encoder_layers: 1 38 | pe_temperature: 10000 39 | use_encoder_idx: 40 | - 2 41 | PIXEL_MEAN: 42 | - 123.675 43 | - 116.28 44 | - 103.53 45 | PIXEL_STD: 46 | - 58.395 47 | - 57.12 48 | - 57.375 49 | ELADecoder: 50 | activation: relu 51 | backbone_feat_channels: 52 | - 256 53 | - 256 54 | - 256 55 | box_noise_scale: 1.0 56 | cls_type: cosine 57 | dim_feedforward: 2048 58 | dropout: 0.0 59 | eps: 0.01 60 | eval_idx: -1 61 | eval_size: null 62 | feat_strides: 63 | - 8 64 | - 16 65 | - 32 66 | hidden_dim: 256 67 | label_noise_ratio: 0.5 68 | learnt_init_query: false 69 | nhead: 8 70 | num_decoder_layers: 6 71 | num_decoder_points: 4 72 | num_denoising: 100 73 | num_levels: 3 74 | num_queries: 900 75 | position_embed_type: sine 76 | WEIGHTS: resources/swin_tiny_patch4_window7_224.pkl 77 | INPUT: 78 | FORMAT: RGB 79 | MAX_SIZE_TEST: 640 80 | MIN_SIZE_TEST: 640 81 | -------------------------------------------------------------------------------- /docs/cvt_grounding_dino-en.md: -------------------------------------------------------------------------------- 1 | # Grounding DINO to TensorRT Conversion 2 | 3 | Given that many people are interested about how to convert Grounding DINO mentioned in our paper to TensorRT, here is a brief introduction to our previous conversion approach. Additionally, while organizing the TRT conversion, we discovered a minor issue with the previous Grounding-DINO-T conversion. The correct FP16 speed after proper conversion should be approximately 27 FPS. 4 | 5 | ## Converting PyTorch Model to ONNX Model 6 | The original Grounding DINO code requires slight modifications to be converted to an ONNX model. However, when converting the ONNX model to a TensorRT model, various errors may occur. To avoid errors during ONNX to TensorRT conversion, some additional changes must be made when converting to the ONNX model. 7 | 8 | - Comment out the statements using checkpoints in the backbone. 9 | - Rewrite the NestedTensor in the code; avoid using the NestedTensor data structure. NestedTensor is mainly concentrated in the visual part. Use Tensor directly instead. 10 | - Rewrite the Joiner class in `backbone.py` as shown in the example below. The rewritten class should inherit from `nn.Module` instead of `nn.Sequential`. 
This might be the key to avoiding issues when converting the ONNX model to a TensorRT model. Some content in the `build_backbone` function can be moved to the rewritten Joiner class. 11 | - Treat the tokenizer as data preprocessing and place it outside the model; the output should be directly passed as input to the model's forward function. 12 | - The special handling in the `nested_tensor_from_tensor_list` function for ONNX conversion needs to be retained. 13 | - Make other necessary changes due to the above modifications. 14 | 15 | ```python 16 | class Joiner(nn.Module): 17 | def __init__(self): 18 | self.backbone = xxxx 19 | self.position_embedding = xxx 20 | 21 | def forward(self): 22 | pass 23 | ``` 24 | 25 | ## Converting ONNX Model to TensorRT Model 26 | The ONNX model converted according to the above suggestions can be smoothly converted to a TensorRT model. 27 | 28 | - It is recommended to use the latest version of TensorRT; it is indeed very fast. 29 | - Fixing the input dimensions can provide certain advantages. The speed tests for Grounding DINO in Omdet are based on fixed input dimensions. 30 | - F32 is almost lossless. When converting to FP16, there is a significant loss of precision, and some layers with substantial losses need extra handling. The speed tests for Grounding DINO in Omdet are based on FP16 models. FP32 is about 25-30% slower than FP16. 31 | -------------------------------------------------------------------------------- /docs/cvt_grounding_dino-zh.md: -------------------------------------------------------------------------------- 1 | # Grounding DINO 转TensorRT 2 | 鉴于不少同学提问想知道我们Paper提到的Grounding DINO的TRT是如何转换,所以在这里简单介绍一下我们之前的转换思路。此外,我们在整理TRT转换时也发现之前的Grounding-DINO-T转换得有点小问题,实际正确转换之后的FP16速度应该为~27FPS。 3 | 4 | ## pytorch模型 转换成 onnx模型 5 | 原始的Grounding DINO代码稍作修改就能转换成onnx模型, 但是转换成onnx模型后再转换成TensorRT模型时,会有各式各样的花式报错。为了避免onnx 转TensorRT时的报错,必须在转onnx模型时做一些额外的改动。 6 | 7 | - 注释掉backbone中使用checkpoint的语句 8 | - 将代码中的 NestedTensor 进行改写,不要使用NestedTensor数据结构。NestedTensor主要集中在视觉部分。直接使用Tensor即可 9 | - 将backbone.py 中的Joiner类改写成下面示例。改写后的类要继承nn.Module, 而不是nn.Sequential类。这可能是避免onnx转TensorRT模型出现问题的关键。build_backbone函数里面的部分内容可以移动到改写后的Joint类中 10 | - 将tokenizer 当成数据预处理放在模型的外面,输出直接作为forward函数的输入传入模型 11 | - nested_tensor_from_tensor_list 函数中针对转onnx做的特殊处理需要保留 12 | - 其他一些因为上述改动导致的必要改动 13 | 14 | ```python 15 | class Joiner(nn.Module): 16 | def __init__(self): 17 | self.backbone = xxxx 18 | self.position_embedding = xxx 19 | 20 | def forward(self): 21 | pass 22 | 23 | ``` 24 | 25 | 26 | ## onnx模型转TensorRT模型 27 | 按照上述建议转出的onnx模型可以流畅的转成TensorRT模型 28 | 29 | - 建议使用最新版本TensorRT, 真的很快 30 | - 固定输入维度,会有一定的优势。Omdet中关于Grounding DINO 的速度测试都是基于固定的输入维度 31 | - F32 几乎无损, 转换FP16的时候精度损失较大,需要对一些损失较大的层进行额外的处理。Omdet中关于Grounding DINO 的速度测试都是基于FP16模型。FP32 比 FP16 慢 25~30%左右 32 | -------------------------------------------------------------------------------- /docs/main_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/docs/main_results.png -------------------------------------------------------------------------------- /docs/speed_compare.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/docs/speed_compare.jpeg -------------------------------------------------------------------------------- /docs/turbo_model.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/docs/turbo_model.jpeg -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | from omdet.inference.det_engine import DetEngine 2 | import torch 3 | 4 | if __name__ == "__main__": 5 | 6 | model_dir = "./resources" 7 | img_tensor = torch.rand(1, 3, 640, 640) # 8 | label_feats = torch.rand(80, 1, 512) # 80 is cls num, 512 is clip dim 9 | task_feats = torch.rand(77, 1, 512) # 77 is task dim 10 | task_mask = torch.rand(1, 77) 11 | 12 | engine = DetEngine(model_dir=model_dir, batch_size=1, device='cpu') 13 | onnx_model_path = "./omdet.onnx" 14 | engine.export_onnx('OmDet-Turbo_tiny_SWIN_T', img_tensor, label_feats, task_feats, task_mask, onnx_model_path) 15 | 16 | -------------------------------------------------------------------------------- /install.md: -------------------------------------------------------------------------------- 1 | # Install 2 | ## Requirements 3 | 4 | * CUDA>=11.8 5 | 6 | * Python>=3.9 7 | 8 | Create Python environments. 9 | ```bash 10 | conda create -n omdet python=3.9 11 | ``` 12 | Activate the environment: 13 | ```bash 14 | conda activate omdet 15 | ``` 16 | 17 | * Pytorch>=2.1.0, Torchvision>=0.16.0 18 | 19 | If your CUDA version is 11.8, you can install Pytorch as following: 20 | ```bash 21 | conda install pytorch==2.1.0 torchvision==0.16.0 pytorch-cuda=11.8 -c pytorch -c nvidia 22 | ``` 23 | 24 | * detectron2>=0.6.0: 25 | 26 | Install detectron2: 27 | ```bash 28 | python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' 29 | ``` 30 | 31 | * Other requirements 32 | ```bash 33 | pip install -r requirements.txt 34 | ``` 35 | -------------------------------------------------------------------------------- /omdet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/__init__.py -------------------------------------------------------------------------------- /omdet/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/inference/__init__.py -------------------------------------------------------------------------------- /omdet/inference/base_engine.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import requests 4 | import io 5 | import base64 6 | from detectron2.data.detection_utils import _apply_exif_orientation, convert_PIL_to_numpy 7 | import numpy as np 8 | 9 | 10 | def get_output_shape(oldh: int, oldw: int, short_edge_length: int, max_size: int): 11 | """ 12 | Compute the output size given input size and target short edge length. 
13 | """ 14 | h, w = oldh, oldw 15 | size = short_edge_length * 1.0 16 | scale = size / min(h, w) 17 | if h < w: 18 | newh, neww = size, scale * w 19 | else: 20 | newh, neww = scale * h, size 21 | if max(newh, neww) > max_size: 22 | scale = max_size * 1.0 / max(newh, neww) 23 | newh = newh * scale 24 | neww = neww * scale 25 | neww = int(neww + 0.5) 26 | newh = int(newh + 0.5) 27 | return (newh, neww) 28 | 29 | 30 | class BaseEngine(object): 31 | def _load_data(self, src_type, cfg, data, return_transform=False): 32 | if src_type == 'local': 33 | image_data = [Image.open(x) for x in data] 34 | 35 | elif src_type == 'url': 36 | image_data = [] 37 | for x in data: 38 | temp = Image.open(io.BytesIO(requests.get(x).content)) 39 | image_data.append(temp) 40 | 41 | elif src_type == "base64": 42 | image_data = [] 43 | for x in data: 44 | temp = Image.open(io.BytesIO(base64.b64decode(x))).convert("RGB") 45 | image_data.append(temp) 46 | 47 | else: 48 | raise Exception("Unknown mode {}.".format(src_type)) 49 | 50 | input_data = [] 51 | transforms = [] 52 | for x in image_data: 53 | width, height = x.size 54 | pil_image = x.resize((cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST), Image.BILINEAR) 55 | image = convert_PIL_to_numpy(pil_image, cfg.INPUT.FORMAT) 56 | 57 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 58 | input_data.append({"image": image, "height": height, "width": width}) 59 | 60 | if return_transform: 61 | return input_data, transforms 62 | else: 63 | return input_data -------------------------------------------------------------------------------- /omdet/inference/det_engine.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from typing import List, Union, Dict 4 | from omdet.utils.tools import chunks 5 | from detectron2.checkpoint import DetectionCheckpointer 6 | from detectron2.config import get_cfg 7 | from detectron2.engine import DefaultTrainer as Trainer 8 | from omdet.utils.cache import LRUCache 9 | from omdet.inference.base_engine import BaseEngine 10 | from detectron2.utils.logger import setup_logger 11 | from omdet.omdet_v2_turbo.config import add_omdet_v2_turbo_config 12 | 13 | 14 | class DetEngine(BaseEngine): 15 | def __init__(self, model_dir='resources/', device='cpu', batch_size=10): 16 | self.model_dir = model_dir 17 | self._models = LRUCache(10) 18 | self.device = device 19 | self.batch_size = batch_size 20 | self.logger = setup_logger(name=__name__) 21 | 22 | def _init_cfg(self, cfg, model_id): 23 | cfg.MODEL.WEIGHTS = os.path.join(self.model_dir, model_id+'.pth') 24 | cfg.MODEL.DEVICE = self.device 25 | cfg.INPUT.MAX_SIZE_TEST = 640 26 | cfg.INPUT.MIN_SIZE_TEST = 640 27 | cfg.MODEL.DEPLOY_MODE = True 28 | cfg.freeze() 29 | return cfg 30 | 31 | def count_parameters(self, model): 32 | return sum(p.numel() for p in model.parameters()) 33 | 34 | def _load_model(self, model_id): 35 | if not self._models.has(model_id): 36 | cfg = get_cfg() 37 | add_omdet_v2_turbo_config(cfg) 38 | cfg.merge_from_file(os.path.join('configs', model_id+'.yaml')) 39 | cfg = self._init_cfg(cfg, model_id) 40 | model = Trainer.build_model(cfg) 41 | self.logger.info("Model:\n{}".format(model)) 42 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 43 | print("Loading a OmDet model {}".format(cfg.MODEL.WEIGHTS)) 44 | model.eval() 45 | model.to(cfg.MODEL.DEVICE) 46 | print("Total parameters: {}".format(self.count_parameters(model))) 47 | self._models.put(model_id, (model, cfg)) 48 | 49 | return 
self._models.get(model_id) 50 | 51 | def inf_predict(self, model_id, 52 | data: List, 53 | task: Union[str, List], 54 | labels: List[str], 55 | src_type: str = 'local', 56 | conf_threshold: float = 0.5, 57 | nms_threshold: float = 0.5 58 | ): 59 | 60 | if len(task) == 0: 61 | raise Exception("Task cannot be empty.") 62 | 63 | model, cfg = self._load_model(model_id) 64 | 65 | resp = [] 66 | flat_labels = labels 67 | 68 | with torch.no_grad(): 69 | for batch in chunks(data, self.batch_size): 70 | batch_image = self._load_data(src_type, cfg, batch) 71 | for img in batch_image: 72 | img['label_set'] = labels 73 | img['tasks'] = task 74 | 75 | batch_y = model(batch_image, score_thresh=conf_threshold, nms_thresh=nms_threshold) 76 | 77 | for z in batch_y: 78 | temp = [] 79 | instances = z['instances'].to('cpu') 80 | instances = instances[instances.scores > conf_threshold] 81 | 82 | for idx, pred in enumerate(zip(instances.pred_boxes, instances.scores, instances.pred_classes)): 83 | (x, y, xx, yy), conf, cls = pred 84 | conf = float(conf) 85 | cls = flat_labels[int(cls)] 86 | 87 | temp.append({'xmin': int(x), 88 | 'ymin': int(y), 89 | 'xmax': int(xx), 90 | 'ymax': int(yy), 91 | 'conf': conf, 92 | 'label': cls}) 93 | resp.append(temp) 94 | 95 | return resp 96 | 97 | def export_onnx(self, model_id, img_tensor, label_feats, task_feats, task_mask, onnx_model_path): 98 | 99 | model, _ = self._load_model(model_id) 100 | model.to("cpu") 101 | model.eval() 102 | inputs = (img_tensor, label_feats, task_feats, task_mask) 103 | 104 | print("start cvt onnx...") 105 | torch.onnx.export(model, # model being run 106 | inputs, # model input (or a tuple for multiple inputs) 107 | onnx_model_path, # where to save the model (can be a file or file-like object) 108 | export_params=True, # store the trained parameter weights inside the model file 109 | opset_version=17, # the ONNX version to export the model to 110 | do_constant_folding=True, # whether to execute constant folding for optimization 111 | input_names=['img_tensor', "label_feats", "task_feats", "task_mask"], 112 | ) -------------------------------------------------------------------------------- /omdet/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/__init__.py -------------------------------------------------------------------------------- /omdet/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from omdet.modeling.backbone import (convnext, swint) -------------------------------------------------------------------------------- /omdet/modeling/backbone/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode as CN 2 | 3 | 4 | def add_backbone_config(cfg): 5 | add_convnext_config(cfg) 6 | add_swint_config(cfg) 7 | 8 | 9 | def add_convnext_config(cfg): 10 | # extra configs for convnext 11 | cfg.MODEL.CONVNEXT = CN() 12 | cfg.MODEL.CONVNEXT.SIZE = "T" 13 | cfg.MODEL.CONVNEXT.DEPTHS= [3, 3, 9, 3] 14 | cfg.MODEL.CONVNEXT.DIMS= [96, 192, 384, 768] 15 | cfg.MODEL.CONVNEXT.DROP_PATH_RATE= 0.2 16 | cfg.MODEL.CONVNEXT.LAYER_SCALE_INIT_VALUE= 1e-6 17 | cfg.MODEL.CONVNEXT.OUT_FEATURES= [0, 1, 2, 3] 18 | cfg.SOLVER.WEIGHT_DECAY_RATE= 0.95 19 | 20 | 21 | def add_swint_config(cfg): 22 | cfg.MODEL.SWIN = CN() 23 | cfg.MODEL.SWIN.SIZE = 'T' # 'T', 'S', 'B' 24 | 
cfg.MODEL.SWIN.USE_CHECKPOINT = False 25 | cfg.MODEL.SWIN.OUT_FEATURES = (0, 1, 2, 3) # FPN stride 8 - 32 26 | 27 | 28 | -------------------------------------------------------------------------------- /omdet/modeling/backbone/convnext.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from timm.models.layers import trunc_normal_, DropPath 6 | from detectron2.modeling.backbone import Backbone 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | from detectron2.modeling.backbone.fpn import FPN 9 | from detectron2.layers import ShapeSpec 10 | 11 | 12 | class Block(nn.Module): 13 | r""" ConvNeXt Block. There are two equivalent implementations: 14 | (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) 15 | (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back 16 | We use (2) as we find it slightly faster in PyTorch 17 | 18 | Args: 19 | dim (int): Number of input channels. 20 | drop_path (float): Stochastic depth rate. Default: 0.0 21 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 22 | """ 23 | 24 | def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): 25 | super().__init__() 26 | self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv 27 | self.norm = LayerNorm(dim, eps=1e-6) 28 | self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers 29 | self.act = nn.GELU() 30 | self.pwconv2 = nn.Linear(4 * dim, dim) 31 | self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), 32 | requires_grad=True) if layer_scale_init_value > 0 else None 33 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 34 | 35 | def forward(self, x): 36 | input = x 37 | x = self.dwconv(x) 38 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 39 | x = self.norm(x) 40 | x = self.pwconv1(x) 41 | x = self.act(x) 42 | x = self.pwconv2(x) 43 | if self.gamma is not None: 44 | x = self.gamma * x 45 | x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 46 | 47 | x = input + self.drop_path(x) 48 | return x 49 | 50 | 51 | class LayerNorm(nn.Module): 52 | r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. 53 | The ordering of the dimensions in the inputs. channels_last corresponds to inputs with 54 | shape (batch_size, height, width, channels) while channels_first corresponds to inputs 55 | with shape (batch_size, channels, height, width). 
56 | """ 57 | 58 | def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): 59 | super().__init__() 60 | self.weight = nn.Parameter(torch.ones(normalized_shape)) 61 | self.bias = nn.Parameter(torch.zeros(normalized_shape)) 62 | self.eps = eps 63 | self.data_format = data_format 64 | if self.data_format not in ["channels_last", "channels_first"]: 65 | raise NotImplementedError 66 | self.normalized_shape = (normalized_shape,) 67 | 68 | def forward(self, x): 69 | if self.data_format == "channels_last": 70 | return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) 71 | elif self.data_format == "channels_first": 72 | u = x.mean(1, keepdim=True) 73 | s = (x - u).pow(2).mean(1, keepdim=True) 74 | x = (x - u) / torch.sqrt(s + self.eps) 75 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 76 | return x 77 | 78 | 79 | class ConvNeXt(Backbone): 80 | r""" ConvNeXt 81 | A PyTorch impl of : `A ConvNet for the 2020s` - 82 | https://arxiv.org/pdf/2201.03545.pdf 83 | Args: 84 | in_chans (int): Number of input image channels. Default: 3 85 | num_classes (int): Number of classes for classification head. Default: 1000 86 | depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] 87 | dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] 88 | drop_path_rate (float): Stochastic depth rate. Default: 0. 89 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 90 | head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. 91 | out_features (tuple(int)): Stage numbers of the outputs given to the Neck. 92 | """ 93 | 94 | def __init__(self, in_chans=3, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], 95 | drop_path_rate=0., layer_scale_init_value=1e-6, out_features=None): 96 | super().__init__() 97 | 98 | self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers 99 | stem = nn.Sequential( 100 | nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), 101 | LayerNorm(dims[0], eps=1e-6, data_format="channels_first") 102 | ) 103 | 104 | self.downsample_layers.append(stem) 105 | for i in range(3): 106 | downsample_layer = nn.Sequential( 107 | LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), 108 | nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2), 109 | ) 110 | self.downsample_layers.append(downsample_layer) 111 | 112 | self.num_layers = len(depths) 113 | num_features = [int(dims[i] * 2 ** i) for i in range(self.num_layers)] 114 | self.num_features = num_features 115 | self._out_features = out_features 116 | 117 | self._out_feature_strides = {} 118 | self._out_feature_channels = {} 119 | 120 | self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks 121 | dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] 122 | cur = 0 123 | strides = [4, 4, 4, 4] 124 | for i in range(4): 125 | stage = nn.Sequential( 126 | *[Block(dim=dims[i], drop_path=dp_rates[cur + j], 127 | layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] 128 | ) 129 | self.stages.append(stage) 130 | cur += depths[i] 131 | 132 | self._out_feature_channels[i] = dims[i] 133 | self._out_feature_strides[i] = strides[i] * 2 ** i 134 | 135 | norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first") 136 | for i_layer in range(4): 137 | layer = norm_layer(dims[i_layer]) 138 | layer_name = f'norm{i_layer}' 139 | self.add_module(layer_name, layer) 140 | 141 | 
self.apply(self._init_weights) 142 | 143 | def _init_weights(self, m): 144 | if isinstance(m, (nn.Conv2d, nn.Linear)): 145 | trunc_normal_(m.weight, std=.02) 146 | nn.init.constant_(m.bias, 0) 147 | 148 | def init_weights(self, pretrained=None): 149 | """Initialize the weights in backbone. 150 | Args: 151 | pretrained (str, optional): Path to pre-trained weights. 152 | Defaults to None. 153 | """ 154 | 155 | def _init_weights(m): 156 | if isinstance(m, nn.Linear): 157 | trunc_normal_(m.weight, std=.02) 158 | if isinstance(m, nn.Linear) and m.bias is not None: 159 | nn.init.constant_(m.bias, 0) 160 | elif isinstance(m, nn.LayerNorm): 161 | nn.init.constant_(m.bias, 0) 162 | nn.init.constant_(m.weight, 1.0) 163 | 164 | self.apply(_init_weights) 165 | 166 | def forward_features(self, x): 167 | outs = {} 168 | for i in range(4): 169 | x = self.downsample_layers[i](x) 170 | x = self.stages[i](x) 171 | if i in self._out_features: 172 | norm_layer = getattr(self, f'norm{i}') 173 | x_out = norm_layer(x) 174 | out = x_out.contiguous() 175 | stage_name = i 176 | outs[stage_name] = out 177 | 178 | return outs # {"stage%d" % (i+2,): out for i, out in enumerate(outs)} #tuple(outs) 179 | 180 | def forward(self, x): 181 | x = self.forward_features(x) 182 | return x 183 | 184 | 185 | model_urls = { 186 | "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", 187 | "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth", 188 | "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth", 189 | "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth", 190 | "convnext_tiny_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth", 191 | "convnext_small_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth", 192 | "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", 193 | "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", 194 | "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", 195 | } 196 | 197 | size2config = { 198 | "N": { 199 | "DEPTHS": [2, 2, 8, 2], 200 | "DIMS": [80, 160, 320, 640] 201 | }, 202 | 'T': { 203 | "DEPTHS": [3, 3, 9, 3], 204 | "DIMS": [96, 192, 384, 768] 205 | }, 206 | 'S': { 207 | "DEPTHS": [3, 3, 27, 3], 208 | "DIMS": [96, 192, 384, 768] 209 | }, 210 | 'B': { 211 | "DEPTHS": [3, 3, 27, 3], 212 | "DIMS": [128, 256, 512, 1024] 213 | }, 214 | 'L': { 215 | "DEPTHS": [3, 3, 27, 3], 216 | "DIMS": [192, 384, 768, 1536] 217 | }, 218 | 'XL': { 219 | "DEPTHS": [3, 3, 27, 3], 220 | "DIMS": [256, 512, 1024, 2048] 221 | } 222 | } 223 | 224 | 225 | @BACKBONE_REGISTRY.register() 226 | def build_convnext_backbone(cfg, input_shape): 227 | """ 228 | Create a ConvNeXt instance from config. 229 | 230 | Returns: 231 | VoVNet: a :class:`VoVNet` instance. 
232 | """ 233 | size = cfg.MODEL.CONVNEXT.SIZE 234 | if size in size2config: 235 | depth = size2config[size]['DEPTHS'] 236 | dims = size2config[size]['DIMS'] 237 | else: 238 | depth = cfg.MODEL.CONVNEXT.DEPTHS 239 | dims = cfg.MODEL.CONVNEXT.DIMS 240 | 241 | return ConvNeXt( 242 | in_chans=input_shape.channels, 243 | depths=depth, 244 | dims=dims, 245 | drop_path_rate=cfg.MODEL.CONVNEXT.DROP_PATH_RATE, 246 | layer_scale_init_value=cfg.MODEL.CONVNEXT.LAYER_SCALE_INIT_VALUE, 247 | out_features=cfg.MODEL.CONVNEXT.OUT_FEATURES 248 | ) 249 | 250 | 251 | @BACKBONE_REGISTRY.register() 252 | def build_convnext_fpn_backbone(cfg, input_shape: ShapeSpec): 253 | """ 254 | Args: 255 | cfg: a detectron2 CfgNode 256 | 257 | Returns: 258 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 259 | """ 260 | bottom_up = build_convnext_backbone(cfg, input_shape) 261 | in_features = cfg.MODEL.FPN.IN_FEATURES 262 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 263 | backbone = FPN( 264 | bottom_up=bottom_up, 265 | in_features=in_features, 266 | out_channels=out_channels, 267 | norm=cfg.MODEL.FPN.NORM, 268 | top_block=None, 269 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 270 | ) 271 | return backbone 272 | -------------------------------------------------------------------------------- /omdet/modeling/common.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn, Tensor 4 | import copy 5 | import torch.nn.functional as F 6 | 7 | 8 | class PositionalEncoding(nn.Module): 9 | 10 | def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): 11 | super().__init__() 12 | self.dropout = nn.Dropout(p=dropout) 13 | 14 | position = torch.arange(max_len).unsqueeze(1) 15 | div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)) 16 | pe = torch.zeros(max_len, 1, d_model) 17 | pe[:, 0, 0::2] = torch.sin(position * div_term) 18 | pe[:, 0, 1::2] = torch.cos(position * div_term) 19 | self.register_buffer('pe', pe) 20 | 21 | def forward(self, x: Tensor) -> Tensor: 22 | """ 23 | Args: 24 | x: Tensor, shape [seq_len, batch_size, embedding_dim] 25 | """ 26 | x = x + self.pe[:x.size(0)] 27 | return self.dropout(x) 28 | 29 | 30 | class AbsPositionalEncoding(nn.Module): 31 | 32 | def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): 33 | super().__init__() 34 | self.dropout = nn.Dropout(p=dropout) 35 | self.pe = nn.Embedding(max_len, d_model) 36 | 37 | def forward(self, x: Tensor) -> Tensor: 38 | """ 39 | Args: 40 | x: Tensor, shape [seq_len, batch_size, embedding_dim] 41 | """ 42 | seq_len = x.size(0) 43 | position = torch.arange(seq_len, device=x.device).unsqueeze(1) 44 | pos_emb = self.pe(position) 45 | x = x + pos_emb 46 | return self.dropout(x) 47 | 48 | 49 | class ResMultiHeadAttention(nn.Module): 50 | def __init__(self, d_q, d_k, d_v, nhead, dropout): 51 | super().__init__() 52 | self.self_attn = nn.MultiheadAttention(d_q, nhead, dropout=dropout, kdim=d_k, vdim=d_v) 53 | self.norm1 = nn.LayerNorm(d_q) 54 | self.dropout = nn.Dropout(dropout) 55 | 56 | def forward(self, q, k=None, v=None, attn_mask=None): 57 | """ 58 | """ 59 | if k is None: 60 | k = q 61 | 62 | if v is None: 63 | v = q 64 | 65 | q1 = self.self_attn(query=q, key=k, value=v, attn_mask=attn_mask)[0] 66 | q = q + self.dropout(q1) 67 | q = self.norm1(q) 68 | return q 69 | 70 | 71 | class DistilMLP(nn.Module): 72 | def __init__(self, input_size, output_size, dropout=0.1): 73 | super(DistilMLP, 
self).__init__() 74 | self.squash = nn.GELU() 75 | self.LayerNorm = nn.LayerNorm(input_size, eps=1e-12) 76 | self.intermediate = nn.Linear(input_size, input_size) 77 | self.dropout = nn.Dropout(dropout) 78 | self.dense = nn.Linear(input_size, output_size) 79 | 80 | def forward(self, word_emb): 81 | word_emb = self.squash(word_emb) 82 | word_emb = self.LayerNorm(word_emb) 83 | word_emb = self.dropout(word_emb) 84 | word_emb = self.dense(word_emb) 85 | return word_emb 86 | 87 | 88 | class ResidualLayer(nn.Module): 89 | """ 90 | A residual connection followed by a layer norm. 91 | """ 92 | def __init__(self, size, dropout): 93 | super(ResidualLayer, self).__init__() 94 | self.norm1 = nn.LayerNorm(size) 95 | self.dropout = nn.Dropout(dropout) 96 | 97 | def forward(self, x, y): 98 | "Apply residual connection to any sublayer with the same size." 99 | return self.norm1(x + self.dropout(y)) 100 | 101 | 102 | class ResidualMLP(nn.Module): 103 | def __init__(self, d_m, dropout, d_hidden=1024, activation='relu'): 104 | super(ResidualMLP, self).__init__() 105 | self.mlp = MLP(d_m, d_m, d_hidden, dropout, activation) 106 | self.res1 = ResidualLayer(d_m, dropout) 107 | 108 | def forward(self, x): 109 | mlp_out = self.mlp(x) 110 | x = self.res1(x, mlp_out) 111 | return x 112 | 113 | 114 | class MLP(nn.Module): 115 | def __init__(self, d_input, d_output, d_hidden=1024, dropout=0.1, activation='relu'): 116 | super(MLP, self).__init__() 117 | self.linear1 = nn.Linear(d_input, d_hidden) 118 | self.activation = _get_activation_fn(activation) 119 | self.dropout = nn.Dropout(dropout) 120 | self.linear2 = nn.Linear(d_hidden, d_output) 121 | 122 | def forward(self, x): 123 | return self.linear2(self.dropout(self.activation(self.linear1(x)))) 124 | 125 | 126 | def apply_deltas(deltas, boxes, bbox_weights, scale_clamp): 127 | """ 128 | Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`. 129 | 130 | Args: 131 | deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1. 132 | deltas[i] represents k potentially different class-specific 133 | box transformations for the single box boxes[i]. 
134 | boxes (Tensor): boxes to transform, of shape (N, 4) 135 | """ 136 | boxes = boxes.to(deltas.dtype) 137 | 138 | widths = boxes[:, 2] - boxes[:, 0] 139 | heights = boxes[:, 3] - boxes[:, 1] 140 | ctr_x = boxes[:, 0] + 0.5 * widths 141 | ctr_y = boxes[:, 1] + 0.5 * heights 142 | 143 | wx, wy, ww, wh = bbox_weights 144 | dx = deltas[:, 0::4] / wx 145 | dy = deltas[:, 1::4] / wy 146 | dw = deltas[:, 2::4] / ww 147 | dh = deltas[:, 3::4] / wh 148 | 149 | # Prevent sending too large values into torch.exp() 150 | dw = torch.clamp(dw, max=scale_clamp) 151 | dh = torch.clamp(dh, max=scale_clamp) 152 | 153 | pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] 154 | pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] 155 | pred_w = torch.exp(dw) * widths[:, None] 156 | pred_h = torch.exp(dh) * heights[:, None] 157 | 158 | pred_boxes = torch.zeros_like(deltas) 159 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1 160 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1 161 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2 162 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2 163 | 164 | return pred_boxes 165 | 166 | 167 | def _get_clones(module, N): 168 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 169 | 170 | 171 | def _get_activation_fn(activation): 172 | """Return an activation function given a string""" 173 | if activation == "relu": 174 | return F.relu 175 | if activation == "gelu": 176 | return F.gelu 177 | if activation == "glu": 178 | return F.glu 179 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 180 | 181 | 182 | def _norm(f, dim=-1): 183 | return f / f.norm(dim=dim, keepdim=True).clamp_min(1e-12) 184 | 185 | 186 | def _b_cosine(a, b, logit_scale): 187 | """ 188 | a: B x K x H 189 | b: B x H x K 190 | """ 191 | a = _norm(a, dim=2) 192 | b = _norm(b, dim=1) 193 | # Calculating the Loss 194 | logit_scale = logit_scale.exp() 195 | logits_per_image = logit_scale * torch.bmm(a, b) 196 | return logits_per_image 197 | 198 | def _cosine(a, b, logit_scale): 199 | """ 200 | a: ?/1 x K x H 201 | b: ?/1 x H x 1 202 | """ 203 | a = _norm(a, dim=2) 204 | b = _norm(b, dim=1) 205 | # Calculating the Loss 206 | logit_scale = logit_scale.exp() 207 | logits_per_image = logit_scale * torch.matmul(a, b) 208 | return logits_per_image -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_language_backbone 2 | #from .build import build_tokenizer 3 | 4 | # from .hfpt_tokenizer import HFPTTokenizer 5 | # from .simple_tokenizer import SimpleTokenizer 6 | -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/backbone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from omdet.modeling import registry 3 | from omdet.modeling.language_backbone.clip.models import clip as clip 4 | 5 | 6 | @registry.LANGUAGE_BACKBONES.register("clip") 7 | def build_clip_backbone(cfg): 8 | model, _ = clip.load("resources/ViT-B-16.pt", device=torch.device(cfg.MODEL.DEVICE), jit=False) 9 | model.visual = None # delete the vision part 10 | model.logit_scale = None 11 | return model 12 | 13 | 14 | def build_language_backbone(cfg): 15 | print ("cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE", cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE) 16 | assert cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE in 
registry.LANGUAGE_BACKBONES, \ 17 | "cfg.MODEL.LANGUAGE_BACKBONE.TYPE: {} is not registered in registry".format( 18 | cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 19 | ) 20 | return registry.LANGUAGE_BACKBONES[cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE](cfg) 21 | 22 | 23 | if __name__ == "__main__": 24 | a = build_clip_backbone('') 25 | print(a) -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/clip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/language_backbone/clip/__init__.py -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/clip/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/language_backbone/clip/models/__init__.py -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/clip/models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/language_backbone/clip/models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/clip/models/clip.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import urllib 4 | import warnings 5 | from typing import Union, List 6 | 7 | import torch 8 | from PIL import Image 9 | from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize 10 | from tqdm import tqdm 11 | 12 | from omdet.modeling.language_backbone.clip.models.model import build_model 13 | from omdet.modeling.language_backbone.clip.simple_tokenizer import SimpleTokenizer as _Tokenizer 14 | 15 | try: 16 | from torchvision.transforms import InterpolationMode 17 | BICUBIC = InterpolationMode.BICUBIC 18 | except ImportError: 19 | BICUBIC = Image.BICUBIC 20 | 21 | 22 | __all__ = ["available_models", "load", "tokenize"] 23 | _tokenizer = _Tokenizer() 24 | 25 | 26 | _MODELS = { 27 | "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", 28 | "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt", 29 | "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt", 30 | "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt", 31 | "ViT-B-32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", 32 | "ViT-B-16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt", 33 | } 34 | 35 | 36 | def _download(url: str, root: str): 37 | os.makedirs(root, exist_ok=True) 38 | filename = os.path.basename(url) 39 | 40 | expected_sha256 = url.split("/")[-2] 41 | download_target = os.path.join(root, filename) 42 | 43 | if os.path.exists(download_target) and 
not os.path.isfile(download_target): 44 | raise RuntimeError(f"{download_target} exists and is not a regular file") 45 | 46 | if os.path.isfile(download_target): 47 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256: 48 | return download_target 49 | else: 50 | warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") 51 | 52 | with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: 53 | with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop: 54 | while True: 55 | buffer = source.read(8192) 56 | if not buffer: 57 | break 58 | 59 | output.write(buffer) 60 | loop.update(len(buffer)) 61 | 62 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256: 63 | raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match") 64 | 65 | return download_target 66 | 67 | 68 | def _transform(n_px): 69 | return Compose([ 70 | Resize(n_px, interpolation=BICUBIC), 71 | CenterCrop(n_px), 72 | lambda image: image.convert("RGB"), 73 | ToTensor(), 74 | Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 75 | ]) 76 | 77 | 78 | def available_models() -> List[str]: 79 | """Returns the names of available CLIP rclip""" 80 | return list(_MODELS.keys()) 81 | 82 | 83 | def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", 84 | jit: bool = False, download_root: str = None): 85 | """Load a CLIP model 86 | 87 | Parameters 88 | ---------- 89 | name : str 90 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 91 | 92 | device : Union[str, torch.device] 93 | The device to put the loaded model 94 | 95 | jit : bool 96 | Whether to load the optimized JIT model or more hackable non-JIT model (default). 97 | 98 | download_root: str 99 | path to download the model files; by default, it uses "~/.cache/clip" 100 | 101 | Returns 102 | ------- 103 | model : torch.nn.Module 104 | The CLIP model 105 | 106 | preprocess : Callable[[PIL.Image], torch.Tensor] 107 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 108 | """ 109 | if name in _MODELS: 110 | model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip")) 111 | elif os.path.isfile(name): 112 | model_path = name 113 | else: 114 | raise RuntimeError(f"Model {name} not found; available rclip = {available_models()}") 115 | 116 | try: 117 | # loading JIT archive 118 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 119 | state_dict = None 120 | except RuntimeError: 121 | # loading saved state dict 122 | if jit: 123 | warnings.warn(f"File {model_path} is not a JIT archive. 
Loading as a state dict instead") 124 | jit = False 125 | state_dict = torch.load(model_path, map_location="cpu") 126 | 127 | if not jit: 128 | model = build_model(state_dict or model.state_dict()).to(device) 129 | if str(device) == "cpu": 130 | model.float() 131 | return model, _transform(model.visual.input_resolution) 132 | 133 | # patch the device names 134 | device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) 135 | device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] 136 | 137 | def patch_device(module): 138 | try: 139 | graphs = [module.graph] if hasattr(module, "graph") else [] 140 | except RuntimeError: 141 | graphs = [] 142 | 143 | if hasattr(module, "forward1"): 144 | graphs.append(module.forward1.graph) 145 | 146 | for graph in graphs: 147 | for node in graph.findAllNodes("prim::Constant"): 148 | if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): 149 | node.copyAttributes(device_node) 150 | 151 | model.apply(patch_device) 152 | patch_device(model.encode_image) 153 | patch_device(model.encode_text) 154 | 155 | # patch dtype to float32 on CPU 156 | if str(device) == "cpu": 157 | float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) 158 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 159 | float_node = float_input.node() 160 | 161 | def patch_float(module): 162 | try: 163 | graphs = [module.graph] if hasattr(module, "graph") else [] 164 | except RuntimeError: 165 | graphs = [] 166 | 167 | if hasattr(module, "forward1"): 168 | graphs.append(module.forward1.graph) 169 | 170 | for graph in graphs: 171 | for node in graph.findAllNodes("aten::to"): 172 | inputs = list(node.inputs()) 173 | for i in [1, 2]: # dtype can be the second or third argument to aten::to() 174 | if inputs[i].node()["value"] == 5: 175 | inputs[i].node().copyAttributes(float_node) 176 | 177 | model.apply(patch_float) 178 | patch_float(model.encode_image) 179 | patch_float(model.encode_text) 180 | 181 | model.float() 182 | 183 | return model, _transform(model.input_resolution.item()) 184 | 185 | 186 | def tokenize(texts: Union[str, List[str]], context_length: int = 77, 187 | truncate: bool = False) -> torch.LongTensor: 188 | """ 189 | Returns the tokenized representation of given input string(s) 190 | 191 | Parameters 192 | ---------- 193 | texts : Union[str, List[str]] 194 | An input string or a list of input strings to tokenize 195 | 196 | context_length : int 197 | The context length to use; all CLIP rclip use 77 as the context length 198 | 199 | truncate: bool 200 | Whether to truncate the text in case its encoding is longer than the context length 201 | 202 | Returns 203 | ------- 204 | A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] 205 | """ 206 | if isinstance(texts, str): 207 | texts = [texts] 208 | 209 | sot_token = _tokenizer.encoder["<|startoftext|>"] 210 | eot_token = _tokenizer.encoder["<|endoftext|>"] 211 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] 212 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 213 | 214 | for i, tokens in enumerate(all_tokens): 215 | if len(tokens) > context_length: 216 | if truncate: 217 | tokens = tokens[:context_length] 218 | tokens[-1] = eot_token 219 | else: 220 | raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") 221 | result[i, 
:len(tokens)] = torch.tensor(tokens) 222 | 223 | return result 224 | -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "models/bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a significant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'</w>' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 79 | 80 | def bpe(self, token): 81 | if token in self.cache: 82 | return self.cache[token] 83 | word = tuple(token[:-1]) + ( token[-1] + '</w>',) 84 | pairs = get_pairs(word) 85 | 86 | if not pairs: 87 | return token+'</w>' 88 | 89 | while True: 90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 91 | if bigram not in self.bpe_ranks: 92 | break 93 | first, second = bigram 94 | new_word = [] 95 | i = 0 96 | while i <
len(word): 97 | try: 98 | j = word.index(first, i) 99 | new_word.extend(word[i:j]) 100 | i = j 101 | except: 102 | new_word.extend(word[i:]) 103 | break 104 | 105 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 106 | new_word.append(first+second) 107 | i += 2 108 | else: 109 | new_word.append(word[i]) 110 | i += 1 111 | new_word = tuple(new_word) 112 | word = new_word 113 | if len(word) == 1: 114 | break 115 | else: 116 | pairs = get_pairs(word) 117 | word = ' '.join(word) 118 | self.cache[token] = word 119 | return word 120 | 121 | def encode(self, text): 122 | bpe_tokens = [] 123 | text = whitespace_clean(basic_clean(text)).lower() 124 | for token in re.findall(self.pat, text): 125 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 127 | return bpe_tokens 128 | 129 | def decode(self, tokens): 130 | text = ''.join([self.decoder[token] for token in tokens]) 131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ') 132 | return text 133 | -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/word_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language-related data loading helper functions and class wrappers. 3 | """ 4 | 5 | import re 6 | import torch 7 | import codecs 8 | 9 | UNK_TOKEN = '' 10 | PAD_TOKEN = '' 11 | END_TOKEN = '' 12 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 13 | 14 | 15 | class Dictionary(object): 16 | def __init__(self): 17 | self.word2idx = {} 18 | self.idx2word = [] 19 | 20 | def add_word(self, word): 21 | if word not in self.word2idx: 22 | self.idx2word.append(word) 23 | self.word2idx[word] = len(self.idx2word) - 1 24 | return self.word2idx[word] 25 | 26 | def __len__(self): 27 | return len(self.idx2word) 28 | 29 | def __getitem__(self, a): 30 | if isinstance(a, int): 31 | return self.idx2word[a] 32 | elif isinstance(a, list): 33 | return [self.idx2word[x] for x in a] 34 | elif isinstance(a, str): 35 | return self.word2idx[a] 36 | else: 37 | raise TypeError("Query word/index argument must be int or str") 38 | 39 | def __contains__(self, word): 40 | return word in self.word2idx 41 | 42 | 43 | class Corpus(object): 44 | def __init__(self): 45 | self.dictionary = Dictionary() 46 | 47 | def set_max_len(self, value): 48 | self.max_len = value 49 | 50 | def load_file(self, filename): 51 | with codecs.open(filename, 'r', 'utf-8') as f: 52 | for line in f: 53 | line = line.strip() 54 | self.add_to_corpus(line) 55 | self.dictionary.add_word(UNK_TOKEN) 56 | self.dictionary.add_word(PAD_TOKEN) 57 | 58 | def add_to_corpus(self, line): 59 | """Tokenizes a text line.""" 60 | # Add words to the dictionary 61 | words = line.split() 62 | # tokens = len(words) 63 | for word in words: 64 | word = word.lower() 65 | self.dictionary.add_word(word) 66 | 67 | def tokenize(self, line, max_len=20): 68 | # Tokenize line contents 69 | words = SENTENCE_SPLIT_REGEX.split(line.strip()) 70 | # words = [w.lower() for w in words if len(w) > 0] 71 | words = [w.lower() for w in words if (len(w) > 0 and w != ' ')] ## do not include space as a token 72 | 73 | if words[-1] == '.': 74 | words = words[:-1] 75 | 76 | if max_len > 0: 77 | if len(words) > max_len: 78 | words = words[:max_len] 79 | elif len(words) < max_len: 80 | # words = [PAD_TOKEN] * (max_len - len(words)) + words 81 | words = words +
[END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1) 82 | 83 | tokens = len(words) ## for end token 84 | ids = torch.LongTensor(tokens) 85 | token = 0 86 | for word in words: 87 | if word not in self.dictionary: 88 | word = UNK_TOKEN 89 | if type(word) != type('a'): 90 | print(word, type(word), word.encode('ascii', 'ignore').decode('ascii'), 91 | type(word.encode('ascii', 'ignore').decode('ascii'))) 92 | word = word.encode('ascii', 'ignore').decode('ascii') 93 | ids[token] = self.dictionary[word] 94 | token += 1 95 | # ids[token] = self.dictionary[END_TOKEN] 96 | return ids 97 | 98 | def __len__(self): 99 | return len(self.dictionary) 100 | -------------------------------------------------------------------------------- /omdet/modeling/registry.py: -------------------------------------------------------------------------------- 1 | from omdet.utils.registry import Registry 2 | 3 | LANGUAGE_BACKBONES = Registry() 4 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import add_omdet_v2_turbo_config 2 | from .detector import OmDetV2Turbo 3 | from .ela_encoder import ELAEncoder 4 | from .ela_decoder import ELADecoder 5 | from .head import DINOHead 6 | from .infer_model import OmDetV2TurboInfer 7 | 8 | 9 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/block.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .conv import Conv, DWConv, GhostConv, LightConv, RepConv 6 | 7 | __all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost', 8 | 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3') 9 | 10 | 11 | class DFL(nn.Module): 12 | """ 13 | Integral module of Distribution Focal Loss (DFL). 14 | Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391 15 | """ 16 | 17 | def __init__(self, c1=16): 18 | """Initialize a convolutional layer with a given number of input channels.""" 19 | super().__init__() 20 | self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False) 21 | x = torch.arange(c1, dtype=torch.float) 22 | self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1)) 23 | self.c1 = c1 24 | 25 | def forward(self, x): 26 | """Applies a transformer layer on input tensor 'x' and returns a tensor.""" 27 | b, c, a = x.shape # batch, channels, anchors 28 | return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a) 29 | # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a) 30 | 31 | 32 | class Proto(nn.Module): 33 | """YOLOv8 mask Proto module for segmentation models.""" 34 | 35 | def __init__(self, c1, c_=256, c2=32): # ch_in, number of protos, number of masks 36 | super().__init__() 37 | self.cv1 = Conv(c1, c_, k=3) 38 | self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True) # nn.Upsample(scale_factor=2, mode='nearest') 39 | self.cv2 = Conv(c_, c_, k=3) 40 | self.cv3 = Conv(c_, c2) 41 | 42 | def forward(self, x): 43 | """Performs a forward pass through layers using an upsampled input image.""" 44 | return self.cv3(self.cv2(self.upsample(self.cv1(x)))) 45 | 46 | 47 | class HGStem(nn.Module): 48 | """StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d. 
49 | https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py 50 | """ 51 | 52 | def __init__(self, c1, cm, c2): 53 | super().__init__() 54 | self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU()) 55 | self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU()) 56 | self.stem2b = Conv(cm // 2, cm, 2, 1, 0, act=nn.ReLU()) 57 | self.stem3 = Conv(cm * 2, cm, 3, 2, act=nn.ReLU()) 58 | self.stem4 = Conv(cm, c2, 1, 1, act=nn.ReLU()) 59 | self.pool = nn.MaxPool2d(kernel_size=2, stride=1, padding=0, ceil_mode=True) 60 | 61 | def forward(self, x): 62 | """Forward pass of a PPHGNetV2 backbone layer.""" 63 | x = self.stem1(x) 64 | x = F.pad(x, [0, 1, 0, 1]) 65 | x2 = self.stem2a(x) 66 | x2 = F.pad(x2, [0, 1, 0, 1]) 67 | x2 = self.stem2b(x2) 68 | x1 = self.pool(x) 69 | x = torch.cat([x1, x2], dim=1) 70 | x = self.stem3(x) 71 | x = self.stem4(x) 72 | return x 73 | 74 | 75 | class HGBlock(nn.Module): 76 | """HG_Block of PPHGNetV2 with 2 convolutions and LightConv. 77 | https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py 78 | """ 79 | 80 | def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()): 81 | super().__init__() 82 | block = LightConv if lightconv else Conv 83 | self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n)) 84 | self.sc = Conv(c1 + n * cm, c2 // 2, 1, 1, act=act) # squeeze conv 85 | self.ec = Conv(c2 // 2, c2, 1, 1, act=act) # excitation conv 86 | self.add = shortcut and c1 == c2 87 | 88 | def forward(self, x): 89 | """Forward pass of a PPHGNetV2 backbone layer.""" 90 | y = [x] 91 | y.extend(m(y[-1]) for m in self.m) 92 | y = self.ec(self.sc(torch.cat(y, 1))) 93 | return y + x if self.add else y 94 | 95 | 96 | class SPP(nn.Module): 97 | """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729.""" 98 | 99 | def __init__(self, c1, c2, k=(5, 9, 13)): 100 | """Initialize the SPP layer with input/output channels and pooling kernel sizes.""" 101 | super().__init__() 102 | c_ = c1 // 2 # hidden channels 103 | self.cv1 = Conv(c1, c_, 1, 1) 104 | self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1) 105 | self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]) 106 | 107 | def forward(self, x): 108 | """Forward pass of the SPP layer, performing spatial pyramid pooling.""" 109 | x = self.cv1(x) 110 | return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1)) 111 | 112 | 113 | class SPPF(nn.Module): 114 | """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.""" 115 | 116 | def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13)) 117 | super().__init__() 118 | c_ = c1 // 2 # hidden channels 119 | self.cv1 = Conv(c1, c_, 1, 1) 120 | self.cv2 = Conv(c_ * 4, c2, 1, 1) 121 | self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) 122 | 123 | def forward(self, x): 124 | """Forward pass through Ghost Convolution block.""" 125 | x = self.cv1(x) 126 | y1 = self.m(x) 127 | y2 = self.m(y1) 128 | return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1)) 129 | 130 | 131 | class C1(nn.Module): 132 | """CSP Bottleneck with 1 convolution.""" 133 | 134 | def __init__(self, c1, c2, n=1): # ch_in, ch_out, number 135 | super().__init__() 136 | self.cv1 = Conv(c1, c2, 1, 1) 137 | self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n))) 138 | 139 | def forward(self, x): 140 | """Applies cross-convolutions to input in the C3 module.""" 141 | y = self.cv1(x) 142 | return self.m(y) + y 143 | 144 | 145 | 
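# Most of the CSP-style blocks in this file (C1, C2, C2f, C3, SPP, SPPF, Bottleneck) keep the
# spatial size of the feature map and only change the channel count; HGStem and Proto are the
# exceptions (they down- and up-sample respectively). A quick shape check, kept as a comment so
# it does not run at import time; the channel sizes and the 80x80 map are illustrative
# assumptions, not values taken from the OmDet-Turbo configs:
#
#     import torch
#     from omdet.omdet_v2_turbo.block import C1, C2f, SPPF
#
#     x = torch.randn(2, 64, 80, 80)                          # (batch, c1, H, W)
#     assert C1(64, 128)(x).shape == (2, 128, 80, 80)         # single-conv CSP bottleneck
#     assert C2f(64, 128, n=2)(x).shape == (2, 128, 80, 80)   # split/concat CSP variant
#     assert SPPF(64, 128, k=5)(x).shape == (2, 128, 80, 80)  # fast spatial pyramid pooling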
class C2(nn.Module): 146 | """CSP Bottleneck with 2 convolutions.""" 147 | 148 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 149 | super().__init__() 150 | self.c = int(c2 * e) # hidden channels 151 | self.cv1 = Conv(c1, 2 * self.c, 1, 1) 152 | self.cv2 = Conv(2 * self.c, c2, 1) # optional act=FReLU(c2) 153 | # self.attention = ChannelAttention(2 * self.c) # or SpatialAttention() 154 | self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))) 155 | 156 | def forward(self, x): 157 | """Forward pass through the CSP bottleneck with 2 convolutions.""" 158 | a, b = self.cv1(x).chunk(2, 1) 159 | return self.cv2(torch.cat((self.m(a), b), 1)) 160 | 161 | 162 | class C2f(nn.Module): 163 | """CSP Bottleneck with 2 convolutions.""" 164 | 165 | def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 166 | super().__init__() 167 | self.c = int(c2 * e) # hidden channels 168 | self.cv1 = Conv(c1, 2 * self.c, 1, 1) 169 | self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) 170 | self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) 171 | 172 | def forward(self, x): 173 | """Forward pass through C2f layer.""" 174 | y = list(self.cv1(x).chunk(2, 1)) 175 | y.extend(m(y[-1]) for m in self.m) 176 | return self.cv2(torch.cat(y, 1)) 177 | 178 | def forward_split(self, x): 179 | """Forward pass using split() instead of chunk().""" 180 | y = list(self.cv1(x).split((self.c, self.c), 1)) 181 | y.extend(m(y[-1]) for m in self.m) 182 | return self.cv2(torch.cat(y, 1)) 183 | 184 | 185 | class C3(nn.Module): 186 | """CSP Bottleneck with 3 convolutions.""" 187 | 188 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 189 | super().__init__() 190 | c_ = int(c2 * e) # hidden channels 191 | self.cv1 = Conv(c1, c_, 1, 1) 192 | self.cv2 = Conv(c1, c_, 1, 1) 193 | self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2) 194 | self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n))) 195 | 196 | def forward(self, x): 197 | """Forward pass through the CSP bottleneck with 2 convolutions.""" 198 | return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) 199 | 200 | 201 | class C3x(C3): 202 | """C3 module with cross-convolutions.""" 203 | 204 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): 205 | """Initialize C3TR instance and set default parameters.""" 206 | super().__init__(c1, c2, n, shortcut, g, e) 207 | self.c_ = int(c2 * e) 208 | self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n))) 209 | 210 | 211 | class RepC3(nn.Module): 212 | """Rep C3.""" 213 | 214 | def __init__(self, c1, c2, n=3, e=1.0): 215 | super().__init__() 216 | c_ = int(c2 * e) # hidden channels 217 | self.cv1 = Conv(c1, c2, 1, 1) 218 | self.cv2 = Conv(c1, c2, 1, 1) 219 | self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)]) 220 | self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity() 221 | 222 | def forward(self, x): 223 | """Forward pass of RT-DETR neck layer.""" 224 | return self.cv3(self.m(self.cv1(x)) + self.cv2(x)) 225 | 226 | # 227 | # class C3TR(C3): 228 | # """C3 module with TransformerBlock().""" 229 | # 230 | # def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): 231 | # """Initialize C3Ghost module 
with GhostBottleneck().""" 232 | # super().__init__(c1, c2, n, shortcut, g, e) 233 | # c_ = int(c2 * e) 234 | # self.m = TransformerBlock(c_, c_, 4, n) 235 | 236 | 237 | class C3Ghost(C3): 238 | """C3 module with GhostBottleneck().""" 239 | 240 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): 241 | """Initialize 'SPP' module with various pooling sizes for spatial pyramid pooling.""" 242 | super().__init__(c1, c2, n, shortcut, g, e) 243 | c_ = int(c2 * e) # hidden channels 244 | self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n))) 245 | 246 | 247 | class GhostBottleneck(nn.Module): 248 | """Ghost Bottleneck https://github.com/huawei-noah/ghostnet.""" 249 | 250 | def __init__(self, c1, c2, k=3, s=1): # ch_in, ch_out, kernel, stride 251 | super().__init__() 252 | c_ = c2 // 2 253 | self.conv = nn.Sequential( 254 | GhostConv(c1, c_, 1, 1), # pw 255 | DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw 256 | GhostConv(c_, c2, 1, 1, act=False)) # pw-linear 257 | self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, 258 | act=False)) if s == 2 else nn.Identity() 259 | 260 | def forward(self, x): 261 | """Applies skip connection and concatenation to input tensor.""" 262 | return self.conv(x) + self.shortcut(x) 263 | 264 | 265 | class Bottleneck(nn.Module): 266 | """Standard bottleneck.""" 267 | 268 | def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): # ch_in, ch_out, shortcut, groups, kernels, expand 269 | super().__init__() 270 | c_ = int(c2 * e) # hidden channels 271 | self.cv1 = Conv(c1, c_, k[0], 1) 272 | self.cv2 = Conv(c_, c2, k[1], 1, g=g) 273 | self.add = shortcut and c1 == c2 274 | 275 | def forward(self, x): 276 | """'forward()' applies the YOLOv5 FPN to input data.""" 277 | return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) 278 | 279 | 280 | class BottleneckCSP(nn.Module): 281 | """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.""" 282 | 283 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 284 | super().__init__() 285 | c_ = int(c2 * e) # hidden channels 286 | self.cv1 = Conv(c1, c_, 1, 1) 287 | self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) 288 | self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) 289 | self.cv4 = Conv(2 * c_, c2, 1, 1) 290 | self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) 291 | self.act = nn.SiLU() 292 | self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) 293 | 294 | def forward(self, x): 295 | """Applies a CSP bottleneck with 3 convolutions.""" 296 | y1 = self.cv3(self.m(self.cv1(x))) 297 | y2 = self.cv2(x) 298 | return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1)))) 299 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/build_components.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from detectron2.utils.logger import _log_api_usage 3 | from detectron2.utils.registry import Registry 4 | 5 | TRANSFORMER_ENCODER_REGISTRY = Registry("TRANSFORMER_ENCODER") # noqa F401 isort:skip 6 | TRANSFORMER_ENCODER_REGISTRY.__doc__ = """ 7 | """ 8 | 9 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_DECODER") # noqa F401 isort:skip 10 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """ """ 11 | 12 | DETR_HEAD_REGISTRY = Registry("DETR_HEAD") # noqa F401 isort:skip 13 | DETR_HEAD_REGISTRY.__doc__ = """ """ 14 | 15 | 16 | def 
build_encoder_model(cfg): 17 | """ 18 | Build the transformer encoder defined by ``cfg.MODEL.TRANSFORMER_ENCODER``. 19 | Note that it does not load any weights from ``cfg``. 20 | """ 21 | encoder = cfg.MODEL.TRANSFORMER_ENCODER 22 | mode_class = TRANSFORMER_ENCODER_REGISTRY.get(encoder) 23 | model = mode_class(**mode_class.from_config(cfg)) 24 | # model = TRANSFORMER_ENCODER_REGISTRY.get(encoder)(cfg) 25 | model.to(torch.device(cfg.MODEL.DEVICE)) 26 | _log_api_usage("modeling.transformer_encoder." + encoder) 27 | return model 28 | 29 | 30 | def build_decoder_model(cfg): 31 | """ 32 | Build the transformer decoder defined by ``cfg.MODEL.TRANSFORMER_DECODER``. 33 | Note that it does not load any weights from ``cfg``. 34 | """ 35 | decoder = cfg.MODEL.TRANSFORMER_DECODER 36 | mode_class = TRANSFORMER_DECODER_REGISTRY.get(decoder) 37 | model = mode_class(**mode_class.from_config(cfg)) 38 | # model = TRANSFORMER_DECODER_REGISTRY.get(decoder)(cfg) 39 | model.to(torch.device(cfg.MODEL.DEVICE)) 40 | _log_api_usage("modeling.transformer_decoder." + decoder) 41 | return model 42 | 43 | 44 | def build_detr_head(cfg): 45 | """ 46 | Build the detection head defined by ``cfg.MODEL.HEAD``. 47 | Note that it does not load any weights from ``cfg``. 48 | """ 49 | head = cfg.MODEL.HEAD 50 | # model = DETR_HEAD_REGISTRY.get(head)(cfg) 51 | mode_class = DETR_HEAD_REGISTRY.get(head) 52 | model = mode_class(**mode_class.from_config(cfg)) 53 | model.to(torch.device(cfg.MODEL.DEVICE)) 54 | _log_api_usage("modeling.detr_head." + head) 55 | return model -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode as CN 2 | from omdet.modeling.backbone.config import add_backbone_config 3 | 4 | 5 | def add_omdet_v2_turbo_config(cfg): 6 | """ 7 | Add config for OmDet V2 Turbo.
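    The extra keys must be registered on ``cfg`` before any YAML file that sets them is merged,
    so call this function right after ``get_cfg()``. A minimal usage sketch following the usual
    detectron2 pattern (the YAML path is the config shipped under ``configs/`` in this repo):

        from detectron2.config import get_cfg
        from omdet.omdet_v2_turbo import add_omdet_v2_turbo_config

        cfg = get_cfg()
        add_omdet_v2_turbo_config(cfg)  # register the OmDet-Turbo keys first
        cfg.merge_from_file("configs/OmDet-Turbo_tiny_SWIN_T.yaml")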
8 | """ 9 | cfg.MODEL.HEAD = "DINOHead" 10 | cfg.MODEL.LOSS = "DINOLoss" 11 | cfg.MODEL.TRANSFORMER_ENCODER = "ELAEncoder" 12 | cfg.MODEL.TRANSFORMER_DECODER = "ELADecoder" 13 | 14 | cfg.MODEL.LANGUAGE_BACKBONE = CN() 15 | cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "clip" 16 | cfg.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 512 17 | 18 | # Task Head 19 | cfg.MODEL.ELAEncoder = CN() 20 | cfg.MODEL.ELAEncoder.in_channels = [192, 384, 768] 21 | cfg.MODEL.ELAEncoder.feat_strides = [8, 16, 32] 22 | cfg.MODEL.ELAEncoder.hidden_dim = 256 23 | cfg.MODEL.ELAEncoder.use_encoder_idx = [2] 24 | cfg.MODEL.ELAEncoder.num_encoder_layers = 1 25 | cfg.MODEL.ELAEncoder.encoder_layer = 'TransformerLayer' 26 | cfg.MODEL.ELAEncoder.pe_temperature = 10000 27 | cfg.MODEL.ELAEncoder.expansion = 1.0 28 | cfg.MODEL.ELAEncoder.depth_mult = 1.0 29 | cfg.MODEL.ELAEncoder.act = 'silu' 30 | cfg.MODEL.ELAEncoder.eval_size = None 31 | cfg.MODEL.ELAEncoder.dim_feedforward=1024 32 | 33 | cfg.MODEL.ELADecoder = CN() 34 | cfg.MODEL.ELADecoder.hidden_dim = 256 35 | cfg.MODEL.ELADecoder.num_queries = 300 36 | cfg.MODEL.ELADecoder.position_embed_type = 'sine' 37 | cfg.MODEL.ELADecoder.backbone_feat_channels = [256, 256, 256] 38 | cfg.MODEL.ELADecoder.feat_strides = [8, 16, 32] 39 | cfg.MODEL.ELADecoder.num_levels = 3 40 | cfg.MODEL.ELADecoder.num_decoder_points = 4 41 | cfg.MODEL.ELADecoder.nhead = 8 42 | cfg.MODEL.ELADecoder.num_decoder_layers = 3 43 | cfg.MODEL.ELADecoder.dim_feedforward = 1024 44 | cfg.MODEL.ELADecoder.dropout = 0.0 45 | cfg.MODEL.ELADecoder.activation = 'relu' 46 | cfg.MODEL.ELADecoder.num_denoising = 100 47 | cfg.MODEL.ELADecoder.label_noise_ratio = 0.5 48 | cfg.MODEL.ELADecoder.box_noise_scale = 1.0 49 | cfg.MODEL.ELADecoder.learnt_init_query = True 50 | cfg.MODEL.ELADecoder.eval_size = None 51 | cfg.MODEL.ELADecoder.eval_idx = -1 52 | cfg.MODEL.ELADecoder.eps = 1e-2 53 | cfg.MODEL.ELADecoder.cls_type = 'cosine' 54 | 55 | cfg.MODEL.FUSE_TYPE = None 56 | 57 | cfg.INPUT.RANDOM_CROP = None 58 | cfg.INPUT.RANDOM_CONTRAST = None 59 | cfg.INPUT.RANDOM_BRIGHTNESS = None 60 | cfg.INPUT.RANDOM_SATURATION = None 61 | 62 | cfg.MODEL.DEPLOY_MODE = False 63 | 64 | add_backbone_config(cfg) -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/conv.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | __all__ = ('Conv', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv', 8 | 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'RepConv') 9 | 10 | 11 | def autopad(k, p=None, d=1): # kernel, padding, dilation 12 | """Pad to 'same' shape outputs.""" 13 | if d > 1: 14 | k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size 15 | if p is None: 16 | p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad 17 | return p 18 | 19 | 20 | class Conv(nn.Module): 21 | """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation).""" 22 | default_act = nn.SiLU() # default activation 23 | 24 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): 25 | """Initialize Conv layer with given arguments including activation.""" 26 | super().__init__() 27 | self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) 28 | self.bn = nn.BatchNorm2d(c2) 29 | self.act = self.default_act if act is True else act if 
isinstance(act, nn.Module) else nn.Identity() 30 | 31 | def forward(self, x): 32 | """Apply convolution, batch normalization and activation to input tensor.""" 33 | return self.act(self.bn(self.conv(x))) 34 | 35 | def forward_fuse(self, x): 36 | """Perform transposed convolution of 2D data.""" 37 | return self.act(self.conv(x)) 38 | 39 | 40 | class Conv2(Conv): 41 | """Simplified RepConv module with Conv fusing.""" 42 | 43 | def __init__(self, c1, c2, k=3, s=1, p=None, g=1, d=1, act=True): 44 | """Initialize Conv layer with given arguments including activation.""" 45 | super().__init__(c1, c2, k, s, p, g=g, d=d, act=act) 46 | self.cv2 = nn.Conv2d(c1, c2, 1, s, autopad(1, p, d), groups=g, dilation=d, bias=False) # add 1x1 conv 47 | 48 | def forward(self, x): 49 | """Apply convolution, batch normalization and activation to input tensor.""" 50 | return self.act(self.bn(self.conv(x) + self.cv2(x))) 51 | 52 | def fuse_convs(self): 53 | """Fuse parallel convolutions.""" 54 | w = torch.zeros_like(self.conv.weight.data) 55 | i = [x // 2 for x in w.shape[2:]] 56 | w[:, :, i[0]:i[0] + 1, i[1]:i[1] + 1] = self.cv2.weight.data.clone() 57 | self.conv.weight.data += w 58 | self.__delattr__('cv2') 59 | 60 | 61 | class LightConv(nn.Module): 62 | """Light convolution with args(ch_in, ch_out, kernel). 63 | https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py 64 | """ 65 | 66 | def __init__(self, c1, c2, k=1, act=nn.ReLU()): 67 | """Initialize Conv layer with given arguments including activation.""" 68 | super().__init__() 69 | self.conv1 = Conv(c1, c2, 1, act=False) 70 | self.conv2 = DWConv(c2, c2, k, act=act) 71 | 72 | def forward(self, x): 73 | """Apply 2 convolutions to input tensor.""" 74 | return self.conv2(self.conv1(x)) 75 | 76 | 77 | class DWConv(Conv): 78 | """Depth-wise convolution.""" 79 | 80 | def __init__(self, c1, c2, k=1, s=1, d=1, act=True): # ch_in, ch_out, kernel, stride, dilation, activation 81 | super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act) 82 | 83 | 84 | class DWConvTranspose2d(nn.ConvTranspose2d): 85 | """Depth-wise transpose convolution.""" 86 | 87 | def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): # ch_in, ch_out, kernel, stride, padding, padding_out 88 | super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2)) 89 | 90 | 91 | class ConvTranspose(nn.Module): 92 | """Convolution transpose 2d layer.""" 93 | default_act = nn.SiLU() # default activation 94 | 95 | def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True): 96 | """Initialize ConvTranspose2d layer with batch normalization and activation function.""" 97 | super().__init__() 98 | self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn) 99 | self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity() 100 | self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() 101 | 102 | def forward(self, x): 103 | """Applies transposed convolutions, batch normalization and activation to input.""" 104 | return self.act(self.bn(self.conv_transpose(x))) 105 | 106 | def forward_fuse(self, x): 107 | """Applies activation and convolution transpose operation to input.""" 108 | return self.act(self.conv_transpose(x)) 109 | 110 | 111 | class Focus(nn.Module): 112 | """Focus wh information into c-space.""" 113 | 114 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups 115 | super().__init__() 116 | self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act) 117 | # 
self.contract = Contract(gain=2) 118 | 119 | def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2) 120 | return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1)) 121 | # return self.conv(self.contract(x)) 122 | 123 | 124 | class GhostConv(nn.Module): 125 | """Ghost Convolution https://github.com/huawei-noah/ghostnet.""" 126 | 127 | def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups 128 | super().__init__() 129 | c_ = c2 // 2 # hidden channels 130 | self.cv1 = Conv(c1, c_, k, s, None, g, act=act) 131 | self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act) 132 | 133 | def forward(self, x): 134 | """Forward propagation through a Ghost Bottleneck layer with skip connection.""" 135 | y = self.cv1(x) 136 | return torch.cat((y, self.cv2(y)), 1) 137 | 138 | 139 | class RepConv(nn.Module): 140 | """RepConv is a basic rep-style block, including training and deploy status 141 | This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py 142 | """ 143 | default_act = nn.SiLU() # default activation 144 | 145 | def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False): 146 | super().__init__() 147 | assert k == 3 and p == 1 148 | self.g = g 149 | self.c1 = c1 150 | self.c2 = c2 151 | self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() 152 | 153 | self.bn = nn.BatchNorm2d(num_features=c1) if bn and c2 == c1 and s == 1 else None 154 | self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False) 155 | self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False) 156 | 157 | def forward_fuse(self, x): 158 | """Forward process""" 159 | return self.act(self.conv(x)) 160 | 161 | def forward(self, x): 162 | """Forward process""" 163 | id_out = 0 if self.bn is None else self.bn(x) 164 | return self.act(self.conv1(x) + self.conv2(x) + id_out) 165 | 166 | def get_equivalent_kernel_bias(self): 167 | kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) 168 | kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) 169 | kernelid, biasid = self._fuse_bn_tensor(self.bn) 170 | return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid 171 | 172 | def _avg_to_3x3_tensor(self, avgp): 173 | channels = self.c1 174 | groups = self.g 175 | kernel_size = avgp.kernel_size 176 | input_dim = channels // groups 177 | k = torch.zeros((channels, input_dim, kernel_size, kernel_size)) 178 | k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2 179 | return k 180 | 181 | def _pad_1x1_to_3x3_tensor(self, kernel1x1): 182 | if kernel1x1 is None: 183 | return 0 184 | else: 185 | return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) 186 | 187 | def _fuse_bn_tensor(self, branch): 188 | if branch is None: 189 | return 0, 0 190 | if isinstance(branch, Conv): 191 | kernel = branch.conv.weight 192 | running_mean = branch.bn.running_mean 193 | running_var = branch.bn.running_var 194 | gamma = branch.bn.weight 195 | beta = branch.bn.bias 196 | eps = branch.bn.eps 197 | elif isinstance(branch, nn.BatchNorm2d): 198 | if not hasattr(self, 'id_tensor'): 199 | input_dim = self.c1 // self.g 200 | kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32) 201 | for i in range(self.c1): 202 | kernel_value[i, i % input_dim, 1, 1] = 1 203 | self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) 204 | kernel = self.id_tensor 205 | running_mean = branch.running_mean 206 | 
running_var = branch.running_var 207 | gamma = branch.weight 208 | beta = branch.bias 209 | eps = branch.eps 210 | std = (running_var + eps).sqrt() 211 | t = (gamma / std).reshape(-1, 1, 1, 1) 212 | return kernel * t, beta - running_mean * gamma / std 213 | 214 | def fuse_convs(self): 215 | if hasattr(self, 'conv'): 216 | return 217 | kernel, bias = self.get_equivalent_kernel_bias() 218 | self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels, 219 | out_channels=self.conv1.conv.out_channels, 220 | kernel_size=self.conv1.conv.kernel_size, 221 | stride=self.conv1.conv.stride, 222 | padding=self.conv1.conv.padding, 223 | dilation=self.conv1.conv.dilation, 224 | groups=self.conv1.conv.groups, 225 | bias=True).requires_grad_(False) 226 | self.conv.weight.data = kernel 227 | self.conv.bias.data = bias 228 | for para in self.parameters(): 229 | para.detach_() 230 | self.__delattr__('conv1') 231 | self.__delattr__('conv2') 232 | if hasattr(self, 'nm'): 233 | self.__delattr__('nm') 234 | if hasattr(self, 'bn'): 235 | self.__delattr__('bn') 236 | if hasattr(self, 'id_tensor'): 237 | self.__delattr__('id_tensor') 238 | 239 | 240 | class ChannelAttention(nn.Module): 241 | """Channel-attention module https://github.com/open-mmlab/mmdetection/tree/v3.0.0rc1/configs/rtmdet.""" 242 | 243 | def __init__(self, channels: int) -> None: 244 | super().__init__() 245 | self.pool = nn.AdaptiveAvgPool2d(1) 246 | self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True) 247 | self.act = nn.Sigmoid() 248 | 249 | def forward(self, x: torch.Tensor) -> torch.Tensor: 250 | return x * self.act(self.fc(self.pool(x))) 251 | 252 | 253 | class SpatialAttention(nn.Module): 254 | """Spatial-attention module.""" 255 | 256 | def __init__(self, kernel_size=7): 257 | """Initialize Spatial-attention module with kernel size argument.""" 258 | super().__init__() 259 | assert kernel_size in (3, 7), 'kernel size must be 3 or 7' 260 | padding = 3 if kernel_size == 7 else 1 261 | self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False) 262 | self.act = nn.Sigmoid() 263 | 264 | def forward(self, x): 265 | """Apply channel and spatial attention on input for feature recalibration.""" 266 | return x * self.act(self.cv1(torch.cat([torch.mean(x, 1, keepdim=True), torch.max(x, 1, keepdim=True)[0]], 1))) 267 | 268 | 269 | class CBAM(nn.Module): 270 | """Convolutional Block Attention Module.""" 271 | 272 | def __init__(self, c1, kernel_size=7): # ch_in, kernels 273 | super().__init__() 274 | self.channel_attention = ChannelAttention(c1) 275 | self.spatial_attention = SpatialAttention(kernel_size) 276 | 277 | def forward(self, x): 278 | """Applies the forward pass through C1 module.""" 279 | return self.spatial_attention(self.channel_attention(x)) 280 | 281 | 282 | class Concat(nn.Module): 283 | """Concatenate a list of tensors along dimension.""" 284 | 285 | def __init__(self, dimension=1): 286 | """Concatenates a list of tensors along a specified dimension.""" 287 | super().__init__() 288 | self.d = dimension 289 | 290 | def forward(self, x): 291 | """Forward pass for the YOLOv8 mask Proto module.""" 292 | return torch.cat(x, self.d) 293 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/detector.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import copy 6 | from typing import Tuple 7 | 8 | import numpy as 
np 9 | # import open_clip 10 | from detectron2.structures import Boxes, ImageList, Instances 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from detectron2.modeling import detector_postprocess 15 | from detectron2.layers import batched_nms 16 | from detectron2.modeling import build_backbone 17 | from omdet.omdet_v2_turbo.build_components import build_encoder_model, build_decoder_model, build_detr_head 18 | from detectron2.config import configurable 19 | from omdet.modeling.language_backbone import build_language_backbone 20 | from detectron2.utils.logger import setup_logger 21 | from ..modeling.language_backbone.clip.models import clip as clip 22 | from .torch_utils import bbox_cxcywh_to_xyxy 23 | __all__ = ['OmDetV2Turbo'] 24 | 25 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 26 | 27 | from ..utils.cache import LRUCache 28 | 29 | from huggingface_hub import PyTorchModelHubMixin 30 | 31 | 32 | @META_ARCH_REGISTRY.register() 33 | class OmDetV2Turbo(nn.Module, PyTorchModelHubMixin): 34 | 35 | @configurable 36 | def __init__(self, cfg): 37 | super(OmDetV2Turbo, self).__init__() 38 | self.cfg = cfg 39 | self.logger = setup_logger(name=__name__) 40 | 41 | self.backbone = build_backbone(cfg) 42 | self.decoder = build_decoder_model(cfg) 43 | self.neck = build_encoder_model(cfg) 44 | self.loss_head = build_detr_head(cfg) 45 | self.device = cfg.MODEL.DEVICE 46 | 47 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) 48 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) 49 | normalizer = lambda x: (x - pixel_mean) / pixel_std 50 | self.normalizer = normalizer 51 | 52 | self.size_divisibility = self.backbone.size_divisibility 53 | self.nms_test_th = 0.0 54 | self.conf_test_th = 0.0 55 | self.loss_type = 'FOCAL' 56 | self.use_language_cache = True 57 | self.language_encoder_type = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 58 | self.num_proposals = cfg.MODEL.ELADecoder.num_queries 59 | 60 | # Build language Encoder 61 | self.language_backbone = build_language_backbone(cfg) 62 | self.language_cache_label = LRUCache(100) 63 | self.language_cache_prompt = LRUCache(100) 64 | 65 | 66 | @classmethod 67 | def from_config(cls, cfg, *args, **kwargs): 68 | return { 69 | 'cfg': cfg 70 | } 71 | 72 | def preprocess_image(self, batched_inputs): 73 | """ 74 | Normalize, pad and batch the input images. 
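        Each element of ``batched_inputs`` is a dict. The keys consumed by this class
        (see ``forward``, ``gen_output`` and ``get_language_embedding``) look roughly like
        the sketch below; the label and task strings are illustrative values only:

            {
                "image": image_tensor,                  # (3, H, W), scale matching cfg.MODEL.PIXEL_MEAN/STD
                "height": 480, "width": 640,            # optional original size, used for postprocessing
                "label_set": ["person", "dog", "cat"],  # candidate class names to ground
                "tasks": "detect person, dog and cat",  # free-form text prompt
                "ann_type": "box",                      # optional, defaults to "box"
            }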
75 | """ 76 | images = [self.normalizer(x["image"].to(self.device)) for x in batched_inputs] 77 | images = ImageList.from_tensors(images, self.size_divisibility) 78 | 79 | images_whwh = list() 80 | for bi in batched_inputs: 81 | h, w = bi["image"].shape[-2:] 82 | images_whwh.append(torch.tensor([w, h, w, h], dtype=torch.float32, device=self.device)) 83 | images_whwh = torch.stack(images_whwh) 84 | ann_types = [x["ann_type"] if "ann_type" in x else "box" for x in batched_inputs] 85 | return images, images_whwh, ann_types 86 | 87 | def gen_output(self, box_cls, box_pred, batched_inputs, images, score_thresh, nms_thresh, do_postprocess, 88 | max_num_det=None): 89 | results = self.inference(box_cls, box_pred, images.image_sizes, score_thresh, nms_thresh, max_num_det) 90 | 91 | if do_postprocess: 92 | processed_results = [] 93 | for results_per_image, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): 94 | height = input_per_image.get("height", image_size[0]) 95 | width = input_per_image.get("width", image_size[1]) 96 | r = detector_postprocess(results_per_image, height, width) 97 | processed_results.append({"instances": r}) 98 | results = processed_results 99 | return results 100 | 101 | def inference(self, box_cls, box_pred, image_sizes, score_thresh=None, nms_thresh=None, max_num_det=None): 102 | assert len(box_cls) == len(image_sizes) 103 | if score_thresh is None: 104 | score_thresh = self.conf_test_th 105 | 106 | if nms_thresh is None: 107 | nms_thresh = self.nms_test_th 108 | 109 | num_classes = box_cls.shape[2] 110 | scores, labels = self.compute_score(box_cls) 111 | results = [] 112 | if self.loss_type in {"FOCAL", "BCE"}: 113 | for i, (scores_img, box_per_img, image_size) in enumerate(zip(scores, box_pred, image_sizes 114 | )): 115 | results.append(self.inference_single_image(box_per_img, scores_img, labels, image_size, num_classes, 116 | score_thresh=score_thresh, 117 | nms_thresh=nms_thresh, 118 | max_num_det=max_num_det)) 119 | else: 120 | for i, (scores_img, label_img, box_per_img, image_size) in enumerate(zip( 121 | scores, labels, box_pred, image_sizes 122 | )): 123 | results.append( 124 | self.inference_single_image(box_per_img, scores_img, label_img, image_size, num_classes, 125 | score_thresh=score_thresh, 126 | nms_thresh=nms_thresh, 127 | max_num_det=max_num_det)) 128 | 129 | return results 130 | 131 | def inference_single_image(self, boxes, scores, labels, 132 | image_size: Tuple[int, int], 133 | num_classes: int, 134 | score_thresh: float, 135 | nms_thresh: float, 136 | max_num_det: int = None): 137 | """ 138 | Call `fast_rcnn_inference_single_image` for all images. 139 | Args: 140 | boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic 141 | boxes for each image. Element i has shape (Ri, K * 4) if doing 142 | class-specific regression, or (Ri, 4) if doing class-agnostic 143 | regression, where Ri is the number of predicted objects for image i. 144 | This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`. 145 | scores (list[Tensor]): A list of Tensors of predicted class scores for each image. 146 | Element i has shape (Ri, K + 1), where Ri is the number of predicted objects 147 | for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`. 148 | image_size (list[tuple]): A list of (width, height) tuples for each image in the batch. 149 | score_thresh (float): Only return detections with a confidence score exceeding this 150 | threshold. 
151 | nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. 152 | Returns: 153 | instances: (list[Instances]): A list of N instances, one for each image in the batch, 154 | that stores the topk most confidence detections. 155 | kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates 156 | the corresponding boxes/scores index in [0, Ri) from the input, for image i. 157 | """ 158 | # scores_per_image: num_proposal 159 | # labels_per_image: num_proposal 160 | # box_per_images: num_proposal x 4' 161 | if self.loss_type in {"FOCAL", "BCE"}: 162 | proposal_num = len(boxes) if max_num_det is None else max_num_det 163 | scores_per_image, topk_indices = scores.flatten(0, 1).topk(proposal_num, sorted=False) 164 | labels_per_image = labels[topk_indices] 165 | box_pred_per_image = boxes.view(-1, 1, 4).repeat(1, num_classes, 1).view(-1, 4) 166 | box_pred_per_image = box_pred_per_image[topk_indices] 167 | else: 168 | box_pred_per_image = boxes 169 | scores_per_image = scores 170 | labels_per_image = labels 171 | 172 | # Score filtering 173 | box_pred_per_image = bbox_cxcywh_to_xyxy(box_pred_per_image) * torch.tensor(image_size).repeat(2).to(self.device) 174 | filter_mask = scores_per_image > score_thresh # R x K 175 | score_keep = filter_mask.nonzero(as_tuple=False).view(-1) 176 | box_pred_per_image = box_pred_per_image[score_keep] 177 | scores_per_image = scores_per_image[score_keep] 178 | labels_per_image = labels_per_image[score_keep] 179 | 180 | # NMS 181 | scores_per_image.to(self.device) 182 | keep = batched_nms(box_pred_per_image, scores_per_image, labels_per_image, nms_thresh) 183 | box_pred_per_image = box_pred_per_image[keep] 184 | scores_per_image = scores_per_image[keep] 185 | labels_per_image = labels_per_image[keep] 186 | 187 | # create an instance 188 | result = Instances(image_size) 189 | result.pred_boxes = Boxes(box_pred_per_image) 190 | result.pred_boxes.clip(image_size) 191 | result.scores = scores_per_image 192 | result.pred_classes = labels_per_image 193 | 194 | return result 195 | 196 | def compute_score(self, box_cls): 197 | """ 198 | Args: 199 | box_cls: tensor of shape (batch_size, num_proposals, K). 200 | The tensor predicts the classification probability for each proposal. 201 | 202 | Returns: 203 | """ 204 | if self.loss_type in {"FOCAL", "BCE"}: 205 | num_classes = box_cls.shape[2] 206 | proposal_num = box_cls.shape[1] 207 | scores = torch.sigmoid(box_cls) 208 | labels = torch.arange(num_classes, device=self.device). 
\ 209 | unsqueeze(0).repeat(proposal_num, 1).flatten(0, 1) 210 | else: 211 | scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1) 212 | # scores: batch_size x num_proposal 213 | 214 | return scores, labels 215 | 216 | def language_encode(self, batched_inputs, encode_type="task"): 217 | texts = batched_inputs 218 | 219 | if self.language_encoder_type == "clip": 220 | text_input = clip.tokenize(texts, truncate=True).to(self.device) 221 | 222 | return self.language_backbone(text_input, encode_type == "task") 223 | 224 | def get_cached_label_emb(self, labels): 225 | self.logger.info('processing labels embeddings for {}'.format(labels)) 226 | not_cached_index = [] 227 | not_cached_labels = [] 228 | total_embs = [] 229 | for idx, l in enumerate(labels): 230 | if self.language_cache_label.has(l): 231 | total_embs.append(self.language_cache_label.get(l)) 232 | else: 233 | total_embs.append(None) 234 | not_cached_index.append(idx) 235 | not_cached_labels.append(l) 236 | 237 | self.logger.info('cached label emb num: {}, not cached num: {}'.format(len(total_embs) - len(not_cached_labels), 238 | len(not_cached_labels))) 239 | 240 | if not_cached_labels: 241 | embeddings = self.language_encode(not_cached_labels, encode_type="label") 242 | for idx, emb in enumerate(embeddings): 243 | idx_to_put = not_cached_index[idx] 244 | total_embs[idx_to_put] = emb 245 | self.language_cache_label.put(not_cached_labels[idx], emb) 246 | 247 | total_label_embs = torch.stack(total_embs).to(self.device) 248 | return total_label_embs 249 | 250 | def get_cached_prompt_emb(self, batched_tasks): 251 | self.logger.info('processing prompt embeddings for {}'.format(batched_tasks)) 252 | not_cached_index = [] 253 | not_cached_tasks = [] 254 | total_task_features = [] 255 | total_task_masks = [] 256 | for idx, t in enumerate(batched_tasks): 257 | if self.language_cache_prompt.has(t): 258 | task_feature, task_mask = self.language_cache_prompt.get(t) 259 | total_task_features.append(task_feature) 260 | total_task_masks.append(task_mask) 261 | else: 262 | total_task_features.append(None) 263 | total_task_masks.append(None) 264 | not_cached_index.append(idx) 265 | not_cached_tasks.append(t) 266 | 267 | self.logger.info( 268 | 'cached prompt emb num: {}, not cached num: {}'.format(len(total_task_features) - len(not_cached_tasks), 269 | len(not_cached_tasks))) 270 | 271 | if not_cached_tasks: 272 | embeddings, task_masks = self.language_encode(not_cached_tasks, encode_type="task") 273 | 274 | for idx in range(embeddings.shape[1]): 275 | emb = embeddings[:, [idx], :] 276 | idx_to_put = not_cached_index[idx] 277 | cur_mask = torch.unsqueeze(task_masks[idx], dim=0).to(self.device) 278 | total_task_features[idx_to_put] = emb 279 | total_task_masks[idx_to_put] = cur_mask 280 | self.language_cache_prompt.put(not_cached_tasks[idx], (emb, cur_mask)) 281 | 282 | total_prompt_features = torch.cat(total_task_features, dim=1) 283 | total_prompt_masks = torch.cat(total_task_masks, dim=0).to(self.device) 284 | 285 | return total_prompt_features, total_prompt_masks 286 | 287 | def get_language_embedding(self, batched_inputs): 288 | batched_labels = [a["label_set"] for a in batched_inputs] 289 | batched_tasks = [a['tasks'] for a in batched_inputs] 290 | 291 | max_label_size = max([len(a) for a in batched_labels]) 292 | label_features = [] 293 | for i, s_labels in enumerate(batched_labels): 294 | pad_size = max_label_size - len(s_labels) 295 | 296 | label_emb = self.get_cached_label_emb(s_labels) 297 | label_features.append(F.pad(label_emb, 
(0, 0, 0, pad_size)).unsqueeze(1).to(self.device)) 298 | 299 | label_features = torch.cat(label_features, dim=1) # num_label x batch_size x dim_size 300 | 301 | # Task Features 302 | # prompt_features: max_task_len x batch_size x dim_size 303 | # prompt_mask: batch_size x max_task_len 304 | # batched_tasks = ['detect a person', 'detect dog and cat'] 305 | prompt_features, prompt_mask = self.get_cached_prompt_emb(batched_tasks) 306 | 307 | return label_features, prompt_features, prompt_mask 308 | 309 | def forward(self, batched_inputs, do_postprocess=True, score_thresh=0.0, nms_thresh=1.0, debug=False): 310 | images, images_whwh, ann_types = self.preprocess_image(batched_inputs) 311 | 312 | # Backbone 313 | body_feats = self.backbone(images.tensor) 314 | 315 | if type(body_feats) is dict: 316 | body_feats = [body_feats[i] for i in body_feats.keys()] 317 | 318 | encoder_feats = self.neck(body_feats) 319 | 320 | if not self.training: 321 | # create label and prompt embeddings 322 | label_feats, prompt_feats, prompt_mask = self.get_language_embedding(batched_inputs) 323 | decoder_feats = self.decoder(encoder_feats, label_feats, prompt_feats, prompt_mask) 324 | box_pred, box_cls, _ = self.loss_head(decoder_feats) 325 | 326 | results = self.gen_output(box_cls, box_pred, batched_inputs, images, 327 | score_thresh, nms_thresh, do_postprocess, 328 | max_num_det=self.num_proposals) 329 | 330 | return results 331 | 332 | def print_trainable_parameters(self): 333 | """ 334 | Prints the number of trainable parameters in the model. 335 | """ 336 | trainable_params = 0 337 | all_param = 0 338 | for _, param in self.named_parameters(): 339 | num_params = param.numel() 340 | # if using DS Zero 3 and the weights are initialized empty 341 | if num_params == 0 and hasattr(param, "ds_numel"): 342 | num_params = param.ds_numel 343 | 344 | all_param += num_params 345 | if param.requires_grad: 346 | trainable_params += num_params 347 | print( 348 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 349 | ) -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/detr_torch.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, List 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn, Tensor 7 | 8 | 9 | class Transformer(nn.Module): 10 | 11 | def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, 12 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 13 | activation="relu", normalize_before=False, 14 | return_intermediate_dec=False): 15 | super().__init__() 16 | 17 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 18 | dropout, activation, normalize_before) 19 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 20 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 21 | 22 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 23 | dropout, activation, normalize_before) 24 | decoder_norm = nn.LayerNorm(d_model) 25 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 26 | return_intermediate=return_intermediate_dec) 27 | 28 | self._reset_parameters() 29 | 30 | self.d_model = d_model 31 | self.nhead = nhead 32 | 33 | def _reset_parameters(self): 34 | for p in self.parameters(): 35 | if p.dim() > 1: 36 | nn.init.xavier_uniform_(p) 37 | 38 | def forward(self, 
src, mask, query_embed, pos_embed): 39 | # flatten NxCxHxW to HWxNxC 40 | bs, c, h, w = src.shape 41 | src = src.flatten(2).permute(2, 0, 1) 42 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1) 43 | query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) 44 | mask = mask.flatten(1) 45 | 46 | tgt = torch.zeros_like(query_embed) 47 | memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) 48 | hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, 49 | pos=pos_embed, query_pos=query_embed) 50 | return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) 51 | 52 | 53 | class TransformerEncoder(nn.Module): 54 | def __init__(self, encoder_layer, num_layers, norm=None): 55 | super(TransformerEncoder, self).__init__() 56 | # self.layers = _get_clones(encoder_layer, num_layers) 57 | self.layers = [encoder_layer] 58 | self.num_layers = num_layers 59 | self.norm = norm 60 | 61 | def forward(self, src, src_mask=None, pos_embed=None): 62 | output = src 63 | pos_embed = pos_embed.clone().detach() if pos_embed is not None else pos_embed 64 | for layer in self.layers: 65 | output = layer(output, src_mask=src_mask, pos_embed=pos_embed) 66 | 67 | if self.norm is not None: 68 | output = self.norm(output) 69 | 70 | return output 71 | 72 | 73 | class TransformerDecoder(nn.Module): 74 | 75 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): 76 | super().__init__() 77 | self.layers = _get_clones(decoder_layer, num_layers) 78 | self.num_layers = num_layers 79 | self.norm = norm 80 | self.return_intermediate = return_intermediate 81 | 82 | def forward(self, tgt, memory, 83 | tgt_mask: Optional[Tensor] = None, 84 | memory_mask: Optional[Tensor] = None, 85 | tgt_key_padding_mask: Optional[Tensor] = None, 86 | memory_key_padding_mask: Optional[Tensor] = None, 87 | pos: Optional[Tensor] = None, 88 | query_pos: Optional[Tensor] = None): 89 | output = tgt 90 | 91 | intermediate = [] 92 | 93 | for layer in self.layers: 94 | output = layer(output, memory, tgt_mask=tgt_mask, 95 | memory_mask=memory_mask, 96 | tgt_key_padding_mask=tgt_key_padding_mask, 97 | memory_key_padding_mask=memory_key_padding_mask, 98 | pos=pos, query_pos=query_pos) 99 | if self.return_intermediate: 100 | intermediate.append(self.norm(output)) 101 | 102 | if self.norm is not None: 103 | output = self.norm(output) 104 | if self.return_intermediate: 105 | intermediate.pop() 106 | intermediate.append(output) 107 | 108 | if self.return_intermediate: 109 | return torch.stack(intermediate) 110 | 111 | return output.unsqueeze(0) 112 | 113 | 114 | class TransformerEncoderLayer(nn.Module): 115 | 116 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 117 | activation="relu", normalize_before=False): 118 | super().__init__() 119 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 120 | # Implementation of Feedforward model 121 | self.linear1 = nn.Linear(d_model, dim_feedforward) 122 | self.dropout = nn.Dropout(dropout) 123 | self.linear2 = nn.Linear(dim_feedforward, d_model) 124 | 125 | self.norm1 = nn.LayerNorm(d_model) 126 | self.norm2 = nn.LayerNorm(d_model) 127 | self.dropout1 = nn.Dropout(dropout) 128 | self.dropout2 = nn.Dropout(dropout) 129 | 130 | self.activation = _get_activation_fn(activation) 131 | self.normalize_before = normalize_before 132 | 133 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 134 | return tensor if pos is None else tensor + pos 135 | 136 | def forward_post(self, 137 | src, 138 | src_mask: Optional[Tensor] = None, 
139 | src_key_padding_mask: Optional[Tensor] = None, 140 | pos: Optional[Tensor] = None): 141 | q = k = self.with_pos_embed(src, pos) 142 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, 143 | key_padding_mask=src_key_padding_mask)[0] 144 | src = src + self.dropout1(src2) 145 | src = self.norm1(src) 146 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 147 | src = src + self.dropout2(src2) 148 | src = self.norm2(src) 149 | return src 150 | 151 | def forward_pre(self, src, 152 | src_mask: Optional[Tensor] = None, 153 | src_key_padding_mask: Optional[Tensor] = None, 154 | pos: Optional[Tensor] = None): 155 | src2 = self.norm1(src) 156 | q = k = self.with_pos_embed(src2, pos) 157 | src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, 158 | key_padding_mask=src_key_padding_mask)[0] 159 | src = src + self.dropout1(src2) 160 | src2 = self.norm2(src) 161 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) 162 | src = src + self.dropout2(src2) 163 | return src 164 | 165 | def forward(self, src, 166 | src_mask: Optional[Tensor] = None, 167 | src_key_padding_mask: Optional[Tensor] = None, 168 | pos: Optional[Tensor] = None): 169 | if self.normalize_before: 170 | return self.forward_pre(src, src_mask, src_key_padding_mask, pos) 171 | return self.forward_post(src, src_mask, src_key_padding_mask, pos) 172 | 173 | 174 | class TransformerDecoderLayer(nn.Module): 175 | 176 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 177 | activation="relu", normalize_before=False): 178 | super().__init__() 179 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 180 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 181 | # Implementation of Feedforward model 182 | self.linear1 = nn.Linear(d_model, dim_feedforward) 183 | self.dropout = nn.Dropout(dropout) 184 | self.linear2 = nn.Linear(dim_feedforward, d_model) 185 | 186 | self.norm1 = nn.LayerNorm(d_model) 187 | self.norm2 = nn.LayerNorm(d_model) 188 | self.norm3 = nn.LayerNorm(d_model) 189 | self.dropout1 = nn.Dropout(dropout) 190 | self.dropout2 = nn.Dropout(dropout) 191 | self.dropout3 = nn.Dropout(dropout) 192 | 193 | self.activation = _get_activation_fn(activation) 194 | self.normalize_before = normalize_before 195 | 196 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 197 | return tensor if pos is None else tensor + pos 198 | 199 | def forward_post(self, tgt, memory, 200 | tgt_mask: Optional[Tensor] = None, 201 | memory_mask: Optional[Tensor] = None, 202 | tgt_key_padding_mask: Optional[Tensor] = None, 203 | memory_key_padding_mask: Optional[Tensor] = None, 204 | pos: Optional[Tensor] = None, 205 | query_pos: Optional[Tensor] = None): 206 | q = k = self.with_pos_embed(tgt, query_pos) 207 | tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, 208 | key_padding_mask=tgt_key_padding_mask)[0] 209 | tgt = tgt + self.dropout1(tgt2) 210 | tgt = self.norm1(tgt) 211 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 212 | key=self.with_pos_embed(memory, pos), 213 | value=memory, attn_mask=memory_mask, 214 | key_padding_mask=memory_key_padding_mask)[0] 215 | tgt = tgt + self.dropout2(tgt2) 216 | tgt = self.norm2(tgt) 217 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 218 | tgt = tgt + self.dropout3(tgt2) 219 | tgt = self.norm3(tgt) 220 | return tgt 221 | 222 | def forward_pre(self, tgt, memory, 223 | tgt_mask: Optional[Tensor] = None, 224 | memory_mask: Optional[Tensor] = None, 
225 | tgt_key_padding_mask: Optional[Tensor] = None, 226 | memory_key_padding_mask: Optional[Tensor] = None, 227 | pos: Optional[Tensor] = None, 228 | query_pos: Optional[Tensor] = None): 229 | tgt2 = self.norm1(tgt) 230 | q = k = self.with_pos_embed(tgt2, query_pos) 231 | tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, 232 | key_padding_mask=tgt_key_padding_mask)[0] 233 | tgt = tgt + self.dropout1(tgt2) 234 | tgt2 = self.norm2(tgt) 235 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 236 | key=self.with_pos_embed(memory, pos), 237 | value=memory, attn_mask=memory_mask, 238 | key_padding_mask=memory_key_padding_mask)[0] 239 | tgt = tgt + self.dropout2(tgt2) 240 | tgt2 = self.norm3(tgt) 241 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) 242 | tgt = tgt + self.dropout3(tgt2) 243 | return tgt 244 | 245 | def forward(self, tgt, memory, 246 | tgt_mask: Optional[Tensor] = None, 247 | memory_mask: Optional[Tensor] = None, 248 | tgt_key_padding_mask: Optional[Tensor] = None, 249 | memory_key_padding_mask: Optional[Tensor] = None, 250 | pos: Optional[Tensor] = None, 251 | query_pos: Optional[Tensor] = None): 252 | if self.normalize_before: 253 | return self.forward_pre(tgt, memory, tgt_mask, memory_mask, 254 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 255 | return self.forward_post(tgt, memory, tgt_mask, memory_mask, 256 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 257 | 258 | 259 | def _get_clones(module, N): 260 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 261 | 262 | 263 | def build_transformer(args): 264 | return Transformer( 265 | d_model=args.hidden_dim, 266 | dropout=args.dropout, 267 | nhead=args.nheads, 268 | dim_feedforward=args.dim_feedforward, 269 | num_encoder_layers=args.enc_layers, 270 | num_decoder_layers=args.dec_layers, 271 | normalize_before=args.pre_norm, 272 | return_intermediate_dec=True, 273 | ) 274 | 275 | 276 | def _get_activation_fn(activation): 277 | """Return an activation function given a string""" 278 | if activation == "relu": 279 | return F.relu 280 | if activation == "gelu": 281 | return F.gelu 282 | if activation == "glu": 283 | return F.glu 284 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 285 | 286 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/dn_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from omdet.utils.box_ops import xywh2xyxy, xyxy2xywh 3 | 4 | 5 | def get_cdn_group(batch, 6 | num_classes, 7 | num_queries, 8 | class_embed, 9 | num_dn=100, 10 | cls_noise_ratio=0.5, 11 | box_noise_scale=1.0, 12 | training=False, 13 | amp=False): 14 | """ 15 | Get contrastive denoising training group. This function creates a contrastive denoising training group with 16 | positive and negative samples from the ground truths (gt). It applies noise to the class labels and bounding 17 | box coordinates, and returns the modified labels, bounding boxes, attention mask and meta information. 18 | 19 | Args: 20 | batch (dict): A dict that includes 'gt_cls' (torch.Tensor with shape [num_gts, ]), 'gt_bboxes' 21 | (torch.Tensor with shape [num_gts, 4]), 'gt_groups' (List(int)) which is a list of batch size length 22 | indicating the number of gts of each image. 23 | num_classes (int): Number of classes. 24 | num_queries (int): Number of queries. 
25 | class_embed (torch.Tensor): Embedding weights to map class labels to embedding space. 26 | num_dn (int, optional): Number of denoising. Defaults to 100. 27 | cls_noise_ratio (float, optional): Noise ratio for class labels. Defaults to 0.5. 28 | box_noise_scale (float, optional): Noise scale for bounding box coordinates. Defaults to 1.0. 29 | training (bool, optional): If it's in training mode. Defaults to False. 30 | 31 | Returns: 32 | (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Dict]]): The modified class embeddings, 33 | bounding boxes, attention mask and meta information for denoising. If not in training mode or 'num_dn' 34 | is less than or equal to 0, the function returns None for all elements in the tuple. 35 | """ 36 | 37 | if (not training) or num_dn <= 0: 38 | return None, None, None, None 39 | gt_groups = batch['gt_groups'] 40 | total_num = sum(gt_groups) 41 | max_nums = max(gt_groups) 42 | if max_nums == 0: 43 | return None, None, None, None 44 | 45 | num_group = num_dn // max_nums 46 | num_group = 1 if num_group == 0 else num_group 47 | # pad gt to max_num of a batch 48 | bs = len(gt_groups) 49 | gt_cls = batch['cls'] # (bs*num, ) 50 | gt_bbox = batch['bboxes'] # bs*num, 4 51 | b_idx = batch['batch_idx'] 52 | 53 | # each group has positive and negative queries. 54 | dn_cls = gt_cls.repeat(2 * num_group) # (2*num_group*bs*num, ) 55 | dn_bbox = gt_bbox.repeat(2 * num_group, 1) # 2*num_group*bs*num, 4 56 | dn_b_idx = b_idx.repeat(2 * num_group).view(-1).to(dn_cls.device) # (2*num_group*bs*num, ) 57 | 58 | # positive and negative mask 59 | # (bs*num*num_group, ), the second total_num*num_group part as negative samples 60 | neg_idx = torch.arange(total_num * num_group, dtype=torch.long, device=gt_bbox.device) + num_group * total_num 61 | 62 | if cls_noise_ratio > 0: 63 | # half of bbox prob 64 | mask = torch.rand(dn_cls.shape) < (cls_noise_ratio * 0.5) 65 | idx = torch.nonzero(mask).squeeze(-1) 66 | # randomly put a new one here 67 | new_label = torch.randint_like(idx, 0, num_classes, dtype=dn_cls.dtype, device=dn_cls.device) 68 | dn_cls[idx] = new_label 69 | 70 | if box_noise_scale > 0: 71 | known_bbox = xywh2xyxy(dn_bbox) 72 | 73 | diff = (dn_bbox[..., 2:] * 0.5).repeat(1, 2) * box_noise_scale # 2*num_group*bs*num, 4 74 | 75 | rand_sign = torch.randint_like(dn_bbox, 0, 2) * 2.0 - 1.0 76 | rand_part = torch.rand_like(dn_bbox) 77 | rand_part[neg_idx] += 1.0 78 | rand_part *= rand_sign 79 | known_bbox += rand_part * diff 80 | known_bbox.clip_(min=0.0, max=1.0) 81 | dn_bbox = xyxy2xywh(known_bbox) 82 | dn_bbox = inverse_sigmoid(dn_bbox) 83 | 84 | # total denoising queries 85 | num_dn = int(max_nums * 2 * num_group) 86 | # class_embed = torch.cat([class_embed, torch.zeros([1, class_embed.shape[-1]], device=class_embed.device)]) 87 | dn_cls_embed = class_embed[dn_cls] # bs*num * 2 * num_group, 256 88 | if amp: 89 | data_type = torch.bfloat16 90 | else: 91 | data_type = torch.float32 92 | padding_cls = torch.zeros(bs, num_dn, dn_cls_embed.shape[-1], device=gt_cls.device, dtype=data_type) 93 | padding_bbox = torch.zeros(bs, num_dn, 4, device=gt_bbox.device) 94 | 95 | map_indices = torch.cat([torch.tensor(range(num), dtype=torch.long, device=gt_cls.device) for num in gt_groups]) 96 | pos_idx = torch.stack([map_indices + max_nums * i for i in range(num_group)], dim=0) 97 | 98 | map_indices = torch.cat([map_indices + max_nums * i for i in range(2 * num_group)]) 99 | fix_class = dn_cls.dim() == 2 100 | if fix_class: 101 | padding_cls[(dn_b_idx, map_indices)] = 
dn_cls_embed 102 | else: 103 | padding_cls[(dn_b_idx.long(), map_indices)] = dn_cls_embed.transpose(1,0)[(dn_b_idx.long(), map_indices)] 104 | padding_bbox[(dn_b_idx.long(), map_indices)] = dn_bbox 105 | 106 | tgt_size = num_dn + num_queries 107 | attn_mask = torch.zeros([tgt_size, tgt_size], dtype=torch.bool) 108 | # match query cannot see the reconstruct 109 | attn_mask[num_dn:, :num_dn] = True 110 | # reconstruct cannot see each other 111 | for i in range(num_group): 112 | if i == 0: 113 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), max_nums * 2 * (i + 1):num_dn] = True 114 | if i == num_group - 1: 115 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), :max_nums * i * 2] = True 116 | else: 117 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), max_nums * 2 * (i + 1):num_dn] = True 118 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), :max_nums * 2 * i] = True 119 | dn_meta = { 120 | 'dn_pos_idx': [p.reshape(-1) for p in pos_idx.cpu().split(list(gt_groups), dim=1)], 121 | 'dn_num_group': num_group, 122 | 'dn_num_split': [num_dn, num_queries]} 123 | 124 | return padding_cls.to(class_embed.device), padding_bbox.to(class_embed.device), attn_mask.to( 125 | class_embed.device), dn_meta 126 | 127 | 128 | def inverse_sigmoid(x, eps=1e-6): 129 | """Inverse sigmoid function.""" 130 | x = x.clip(min=0., max=1.) 131 | return torch.log(x / (1 - x + eps) + eps) 132 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/ela_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .torch_utils import BaseConv, linear_init_ 5 | from .block import RepC3 6 | from .detr_torch import TransformerEncoder 7 | from .build_components import TRANSFORMER_ENCODER_REGISTRY 8 | 9 | __all__ = ['ELAEncoder'] 10 | 11 | 12 | class TransformerLayer(nn.Module): 13 | def __init__(self, 14 | d_model=256, 15 | nhead=8, 16 | dim_feedforward=1024, 17 | dropout=0., 18 | activation="relu", 19 | attn_dropout=None, 20 | act_dropout=None, 21 | normalize_before=False): 22 | super(TransformerLayer, self).__init__() 23 | attn_dropout = dropout if attn_dropout is None else attn_dropout 24 | act_dropout = dropout if act_dropout is None else act_dropout 25 | self.normalize_before = normalize_before 26 | 27 | self.self_attn = torch.nn.MultiheadAttention(d_model, nhead, attn_dropout, batch_first=True) 28 | # Implementation of Feedforward model 29 | self.linear1 = nn.Linear(d_model, dim_feedforward) 30 | self.dropout = nn.Dropout(act_dropout) 31 | self.linear2 = nn.Linear(dim_feedforward, d_model) 32 | 33 | self.norm1 = nn.LayerNorm(d_model) 34 | self.norm2 = nn.LayerNorm(d_model) 35 | self.dropout1 = nn.Dropout(dropout) 36 | self.dropout2 = nn.Dropout(dropout) 37 | self.activation = getattr(F, activation) 38 | self._reset_parameters() 39 | 40 | def _reset_parameters(self): 41 | linear_init_(self.linear1) 42 | linear_init_(self.linear2) 43 | 44 | @staticmethod 45 | def with_pos_embed(tensor, pos_embed): 46 | return tensor if pos_embed is None else tensor + pos_embed 47 | 48 | def forward(self, src, src_mask=None, pos_embed=None): 49 | residual = src 50 | if self.normalize_before: 51 | src = self.norm1(src) 52 | q = k = self.with_pos_embed(src, pos_embed) 53 | src = self.self_attn(q, k, value=src, attn_mask=src_mask) 54 | #print(src[1].shape, src[0].shape) 55 | src = src[0] 56 | src = residual + self.dropout1(src) 57 | if not self.normalize_before: 58 | src 
= self.norm1(src) 59 | 60 | residual = src 61 | if self.normalize_before: 62 | src = self.norm2(src) 63 | src = self.linear2(self.dropout(self.activation(self.linear1(src)))) 64 | src = residual + self.dropout2(src) 65 | if not self.normalize_before: 66 | src = self.norm2(src) 67 | return src 68 | 69 | 70 | @TRANSFORMER_ENCODER_REGISTRY.register() 71 | class ELAEncoder(nn.Module): 72 | # __shared__ = ['depth_mult', 'act', 'trt', 'eval_size'] 73 | # __inject__ = ['encoder_layer'] 74 | 75 | def __init__(self, 76 | in_channels=[128, 256, 512], 77 | feat_strides=[8, 16, 32], 78 | hidden_dim=256, 79 | use_encoder_idx=[2], 80 | num_encoder_layers=1, 81 | encoder_layer='TransformerLayer', 82 | pe_temperature=10000, 83 | expansion=1.0, 84 | depth_mult=1.0, 85 | act='silu', 86 | trt=False, 87 | dim_feedforward=1024, 88 | eval_size=None): 89 | super(ELAEncoder, self).__init__() 90 | self.in_channels = in_channels 91 | self.feat_strides = feat_strides 92 | self.hidden_dim = hidden_dim 93 | self.use_encoder_idx = use_encoder_idx 94 | self.num_encoder_layers = num_encoder_layers 95 | self.pe_temperature = pe_temperature 96 | self.eval_size = eval_size 97 | 98 | self.encoder_layer = TransformerLayer(dim_feedforward=dim_feedforward) 99 | 100 | # channel projection 101 | self.input_proj = nn.ModuleList() 102 | for in_channel in self.in_channels: 103 | self.input_proj.append( 104 | nn.Sequential( 105 | nn.Conv2d( 106 | in_channel, hidden_dim, kernel_size=(1, 1), bias=False), 107 | nn.BatchNorm2d( 108 | hidden_dim))) 109 | # encoder transformer 110 | self.encoder = nn.ModuleList([ 111 | TransformerEncoder(self.encoder_layer, num_encoder_layers) 112 | for _ in range(len(use_encoder_idx)) 113 | ]) 114 | 115 | # act = get_act_fn( 116 | # act, trt=trt) if act is None or isinstance(act, 117 | # (str, dict)) else act 118 | # top-down fpn 119 | self.lateral_convs = nn.ModuleList() 120 | self.fpn_blocks = nn.ModuleList() 121 | for idx in range(len(self.in_channels) - 1, 0, -1): 122 | self.lateral_convs.append( 123 | BaseConv( 124 | hidden_dim, hidden_dim, 1, 1, act=act)) 125 | self.fpn_blocks.append( 126 | RepC3( 127 | hidden_dim * 2, 128 | hidden_dim, 129 | round(3 * depth_mult), 130 | e=1.0)) 131 | 132 | # bottom-up pan 133 | self.downsample_convs = nn.ModuleList() 134 | self.pan_blocks = nn.ModuleList() 135 | for idx in range(len(self.in_channels) - 1): 136 | self.downsample_convs.append( 137 | BaseConv( 138 | hidden_dim, hidden_dim, 3, stride=2, act=act)) 139 | self.pan_blocks.append( 140 | RepC3( 141 | hidden_dim * 2, 142 | hidden_dim, 143 | round(3 * depth_mult), 144 | e=1.0)) 145 | 146 | # self._reset_parameters() 147 | # 148 | # def _reset_parameters(self): 149 | # if self.eval_size: 150 | # for idx in self.use_encoder_idx: 151 | # stride = self.feat_strides[idx] 152 | # pos_embed = self.build_2d_sincos_position_embedding( 153 | # self.eval_size[1] // stride, self.eval_size[0] // stride, 154 | # self.hidden_dim, self.pe_temperature) 155 | # setattr(self, f'pos_embed{idx}', pos_embed) 156 | 157 | @staticmethod 158 | def build_2d_sincos_position_embedding(w, 159 | h, 160 | embed_dim=256, 161 | temperature=10000.): 162 | grid_w = torch.arange(int(w), dtype=torch.float32) 163 | grid_h = torch.arange(int(h), dtype=torch.float32) 164 | grid_w, grid_h = torch.meshgrid(grid_w, grid_h) 165 | assert embed_dim % 4 == 0, \ 166 | 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' 167 | pos_dim = embed_dim // 4 168 | omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim 169 | omega = 1. 
/ (temperature**omega) 170 | 171 | out_w = grid_w.flatten()[..., None] @omega[None] 172 | out_h = grid_h.flatten()[..., None] @omega[None] 173 | 174 | return torch.cat( 175 | [ 176 | torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), 177 | torch.cos(out_h) 178 | ], 179 | dim=1)[None, :, :] 180 | @classmethod 181 | def from_config(cls, cfg): 182 | enc_cfg = cfg.MODEL.ELAEncoder 183 | return { 184 | 'in_channels': enc_cfg.in_channels, 185 | 'feat_strides': enc_cfg.feat_strides, 186 | 'hidden_dim': enc_cfg.hidden_dim, 187 | 'use_encoder_idx': enc_cfg.use_encoder_idx, 188 | 'num_encoder_layers': enc_cfg.num_encoder_layers, 189 | 'encoder_layer': enc_cfg.encoder_layer, 190 | 'pe_temperature': enc_cfg.pe_temperature, 191 | 'expansion': enc_cfg.expansion, 192 | 'depth_mult': enc_cfg.depth_mult, 193 | 'act': enc_cfg.act, 194 | 'eval_size': enc_cfg.eval_size, 195 | 'dim_feedforward': enc_cfg.dim_feedforward 196 | } 197 | 198 | def forward(self, feats, for_mot=False): 199 | assert len(feats) == len(self.in_channels) 200 | # get projection features 201 | proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] 202 | # encoder 203 | if self.num_encoder_layers > 0: 204 | for i, enc_ind in enumerate(self.use_encoder_idx): 205 | h, w = proj_feats[enc_ind].shape[2:] 206 | # flatten [B, C, H, W] to [B, HxW, C] 207 | src_flatten = proj_feats[enc_ind].flatten(start_dim=2).transpose(1, 2) 208 | if self.training or self.eval_size is None: 209 | pos_embed = self.build_2d_sincos_position_embedding( 210 | w, h, self.hidden_dim, self.pe_temperature).to(src_flatten.device) 211 | else: 212 | pos_embed = getattr(self, f'pos_embed{enc_ind}', None) 213 | memory = self.encoder[i](src_flatten, pos_embed=pos_embed) 214 | proj_feats[enc_ind] = memory.transpose(1, 2).reshape((-1, self.hidden_dim, h, w)) 215 | 216 | # top-down fpn 217 | inner_outs = [proj_feats[-1]] 218 | for idx in range(len(self.in_channels) - 1, 0, -1): 219 | feat_heigh = inner_outs[0] 220 | feat_low = proj_feats[idx - 1] 221 | feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( 222 | feat_heigh) 223 | inner_outs[0] = feat_heigh 224 | 225 | upsample_feat = F.interpolate( 226 | feat_heigh, scale_factor=2., mode="nearest") 227 | inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( 228 | torch.cat( 229 | [upsample_feat, feat_low], dim=1)) 230 | inner_outs.insert(0, inner_out) 231 | 232 | # bottom-up pan 233 | outs = [inner_outs[0]] 234 | for idx in range(len(self.in_channels) - 1): 235 | feat_low = outs[-1] 236 | feat_height = inner_outs[idx + 1] 237 | downsample_feat = self.downsample_convs[idx](feat_low) 238 | out = self.pan_blocks[idx](torch.cat( 239 | [downsample_feat, feat_height], dim=1)) 240 | outs.append(out) 241 | 242 | return outs 243 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/head.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from .build_components import DETR_HEAD_REGISTRY 3 | 4 | 5 | __all__ = ['DINOHead'] 6 | @DETR_HEAD_REGISTRY.register() 7 | class DINOHead(nn.Module): 8 | def __init__(self, device="cuda"): 9 | super(DINOHead, self).__init__() 10 | 11 | def forward(self, out_transformer, inputs=None): 12 | (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, 13 | dn_meta) = out_transformer 14 | 15 | return (dec_out_bboxes[-1], dec_out_logits[-1], None) 16 | 17 | @classmethod 18 | def from_config(cls, cfg, *args, **kwargs): 19 | return { 20 | "device": 
cfg.MODEL.DEVICE 21 | } 22 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/infer_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import copy 6 | from typing import Tuple 7 | 8 | import numpy as np 9 | # import open_clip 10 | from detectron2.structures import Boxes, ImageList, Instances 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from detectron2.modeling import detector_postprocess 15 | from detectron2.layers import batched_nms 16 | from detectron2.modeling import build_backbone 17 | from omdet.omdet_v2_turbo.build_components import build_encoder_model, build_decoder_model, build_detr_head 18 | from detectron2.config import configurable 19 | from omdet.modeling.language_backbone import build_language_backbone 20 | from detectron2.utils.logger import setup_logger 21 | from ..modeling.language_backbone.clip.models import clip as clip 22 | from .torch_utils import bbox_cxcywh_to_xyxy 23 | __all__ = ['OmDetV2TurboInfer'] 24 | 25 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 26 | 27 | from ..utils.cache import LRUCache 28 | 29 | from huggingface_hub import PyTorchModelHubMixin 30 | 31 | 32 | @META_ARCH_REGISTRY.register() 33 | class OmDetV2TurboInfer(nn.Module, PyTorchModelHubMixin): 34 | 35 | @configurable 36 | def __init__(self, cfg): 37 | super(OmDetV2TurboInfer, self).__init__() 38 | self.cfg = cfg 39 | self.logger = setup_logger(name=__name__) 40 | 41 | self.backbone = build_backbone(cfg) 42 | self.decoder = build_decoder_model(cfg) 43 | self.neck = build_encoder_model(cfg) 44 | self.device = cfg.MODEL.DEVICE 45 | 46 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) 47 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) 48 | normalizer = lambda x: (x - pixel_mean) / pixel_std 49 | self.normalizer = normalizer 50 | 51 | self.size_divisibility = self.backbone.size_divisibility 52 | self.nms_test_th = 0.0 53 | self.conf_test_th = 0.0 54 | self.loss_type = 'FOCAL' 55 | self.use_language_cache = True 56 | self.language_encoder_type = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 57 | self.num_proposals = cfg.MODEL.ELADecoder.num_queries 58 | 59 | 60 | @classmethod 61 | def from_config(cls, cfg, *args, **kwargs): 62 | return { 63 | 'cfg': cfg 64 | } 65 | 66 | def forward(self, x, label_feats, task_feats, task_mask): 67 | 68 | body_feats = self.backbone(x) 69 | 70 | if type(body_feats) is dict: 71 | body_feats = [body_feats[i] for i in body_feats.keys()] 72 | encoder_feats = self.neck(body_feats) 73 | box_pred, box_cls, _, _, _ = self.decoder(encoder_feats, label_feats, task_feats, task_mask) 74 | 75 | return box_pred, box_cls 76 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/torch_utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.init import uniform_ 9 | 10 | __all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid' 11 | 12 | 13 | def _get_clones(module, n): 14 | return nn.ModuleList([copy.deepcopy(module) for _ in range(n)]) 15 | 16 | 17 | def bias_init_with_prob(prior_prob=0.01): 18 | """initialize 
conv/fc bias value according to a given probability value.""" 19 | return float(-np.log((1 - prior_prob) / prior_prob)) # return bias_init 20 | 21 | 22 | def linear_init_(module): 23 | bound = 1 / math.sqrt(module.weight.shape[0]) 24 | uniform_(module.weight, -bound, bound) 25 | if hasattr(module, 'bias') and module.bias is not None: 26 | uniform_(module.bias, -bound, bound) 27 | 28 | 29 | def inverse_sigmoid(x, eps=1e-5): 30 | x = x.clamp(min=0, max=1) 31 | x1 = x.clamp(min=eps) 32 | x2 = (1 - x).clamp(min=eps) 33 | return torch.log(x1 / x2) 34 | 35 | 36 | def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor, 37 | sampling_locations: torch.Tensor, 38 | attention_weights: torch.Tensor) -> torch.Tensor: 39 | """ 40 | Multi-scale deformable attention. 41 | https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py 42 | """ 43 | 44 | bs, _, num_heads, embed_dims = value.shape 45 | _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for level, (H_, W_) in enumerate(value_spatial_shapes): 50 | # bs, H_*W_, num_heads, embed_dims -> 51 | # bs, H_*W_, num_heads*embed_dims -> 52 | # bs, num_heads*embed_dims, H_*W_ -> 53 | # bs*num_heads, embed_dims, H_, W_ 54 | value_l_ = (value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)) 55 | # bs, num_queries, num_heads, num_points, 2 -> 56 | # bs, num_heads, num_queries, num_points, 2 -> 57 | # bs*num_heads, num_queries, num_points, 2 58 | sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) 59 | # bs*num_heads, embed_dims, num_queries, num_points 60 | sampling_value_l_ = F.grid_sample(value_l_, 61 | sampling_grid_l_, 62 | mode='bilinear', 63 | padding_mode='zeros', 64 | align_corners=False) 65 | sampling_value_list.append(sampling_value_l_) 66 | # (bs, num_queries, num_heads, num_levels, num_points) -> 67 | # (bs, num_heads, num_queries, num_levels, num_points) -> 68 | # (bs, num_heads, 1, num_queries, num_levels*num_points) 69 | attention_weights = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries, 70 | num_levels * num_points) 71 | output = ((torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view( 72 | bs, num_heads * embed_dims, num_queries)) 73 | return output.transpose(1, 2).contiguous() 74 | 75 | 76 | def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7): 77 | """ 78 | Calculate Intersection over Union (IoU) of box1(1, 4) to box2(n, 4). 79 | 80 | Args: 81 | box1 (torch.Tensor): A tensor representing a single bounding box with shape (1, 4). 82 | box2 (torch.Tensor): A tensor representing n bounding boxes with shape (n, 4). 83 | xywh (bool, optional): If True, input boxes are in (x, y, w, h) format. If False, input boxes are in 84 | (x1, y1, x2, y2) format. Defaults to True. 85 | GIoU (bool, optional): If True, calculate Generalized IoU. Defaults to False. 86 | DIoU (bool, optional): If True, calculate Distance IoU. Defaults to False. 87 | CIoU (bool, optional): If True, calculate Complete IoU. Defaults to False. 88 | eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7. 89 | 90 | Returns: 91 | (torch.Tensor): IoU, GIoU, DIoU, or CIoU values depending on the specified flags. 
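    A minimal usage sketch (the boxes and the quoted value are purely illustrative, using the
    default eps and plain IoU):

        >>> import torch
        >>> box1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])  # a 2x2 box in xyxy format
        >>> box2 = torch.tensor([[1.0, 1.0, 3.0, 3.0]])  # a shifted 2x2 box in xyxy format
        >>> iou = bbox_iou(box1, box2, xywh=False)       # intersection 1, union 7
        >>> round(float(iou), 3)
        0.143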
92 | """ 93 | 94 | # Get the coordinates of bounding boxes 95 | if xywh: # transform from xywh to xyxy 96 | (x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1) 97 | w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2 98 | b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_ 99 | b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_ 100 | else: # x1, y1, x2, y2 = box1 101 | b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1) 102 | b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1) 103 | w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps 104 | w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps 105 | 106 | # Intersection area 107 | inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp_(0) * \ 108 | (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp_(0) 109 | 110 | # Union Area 111 | union = w1 * h1 + w2 * h2 - inter + eps 112 | 113 | # IoU 114 | iou = inter / union 115 | if CIoU or DIoU or GIoU: 116 | cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1) # convex (smallest enclosing box) width 117 | ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1) # convex height 118 | if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1 119 | c2 = cw ** 2 + ch ** 2 + eps # convex diagonal squared 120 | rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4 # center dist ** 2 121 | if CIoU: # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47 122 | v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2) 123 | with torch.no_grad(): 124 | alpha = v / (v - iou + (1 + eps)) 125 | return iou - (rho2 / c2 + v * alpha) # CIoU 126 | return iou - rho2 / c2 # DIoU 127 | c_area = cw * ch + eps # convex area 128 | return iou - (c_area - union) / c_area # GIoU https://arxiv.org/pdf/1902.09630.pdf 129 | return iou # IoU 130 | 131 | def cls_score(cls_type, cls_feature, class_proj, logit_scale): 132 | if cls_type == 'cosine': 133 | class_logits = _b_cosine(cls_feature, class_proj, logit_scale) # 4 100 256 4 256 20 134 | elif cls_type == 'dot': 135 | class_logits = torch.bmm(cls_feature, class_proj) # 4 100 20 136 | else: 137 | raise Exception("Unknown cls type {}".format(cls_type)) 138 | return class_logits 139 | 140 | def _norm(f, dim=-1): 141 | return f / f.norm(dim=dim, keepdim=True).clamp_min(1e-12) 142 | 143 | 144 | def _b_cosine(a, b, logit_scale): 145 | """ 146 | a: B x K x H 147 | b: B x H x K 148 | """ 149 | a = _norm(a, dim=2) 150 | b = _norm(b, dim=1) 151 | # Calculating the Loss 152 | logit_scale = logit_scale.exp() 153 | logits_per_image = logit_scale * torch.bmm(a, b) 154 | return logits_per_image 155 | 156 | ########################### 157 | def bbox_cxcywh_to_xyxy(x): 158 | cxcy, wh = torch.split(x, 2, dim=-1) 159 | return torch.cat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], dim=-1) 160 | 161 | def bbox_xyxy2cxcywh(x): 162 | x0, y0, x1, y1 = torch.split(x, 1, dim=-1) 163 | return torch.cat([(x1+x0)/2, (y1+y0)/2, x1-x0, y1-y0], dim=-1) 164 | 165 | class SiLU(nn.Module): 166 | def __init__(self): 167 | super(SiLU, self).__init__() 168 | 169 | def forward(self, x): 170 | return x * torch.sigmoid(x) 171 | 172 | class BaseConv(nn.Module): 173 | def __init__(self, 174 | in_channels, 175 | out_channels, 176 | ksize, 177 | stride, 178 | groups=1, 179 | bias=False, 180 | act="silu"): 181 | super(BaseConv, self).__init__() 182 | self.conv = nn.Conv2d( 183 | in_channels, 184 | out_channels, 185 | kernel_size=ksize, 186 | stride=stride, 187 | padding=(ksize - 1) // 2, 188 | groups=groups, 
189 | bias=bias) 190 | self.bn = nn.BatchNorm2d( 191 | out_channels, 192 | # epsilon=1e-3, # for amp(fp16), set in ppdet/engine/trainer.py 193 | # momentum=0.97, 194 | # weight_attr=ParamAttr(regularizer=L2Decay(0.0)), 195 | # bias_attr=ParamAttr(regularizer=L2Decay(0.0)) 196 | ) 197 | 198 | if act == 'silu': 199 | self.act = SiLU() 200 | elif act == 'gelu': 201 | self.act = nn.GELU() 202 | # self._init_weights() 203 | # 204 | # def _init_weights(self): 205 | # conv_init_(self.conv) 206 | 207 | def forward(self, x): 208 | x = self.bn(self.conv(x)) 209 | if self.training: 210 | y = self.act(x) 211 | else: 212 | if isinstance(self.act, nn.SiLU): 213 | self.act = SiLU() 214 | y = self.act(x) 215 | return y 216 | 217 | import random 218 | import torchvision 219 | 220 | class BatchResize(): 221 | def __init__(self, mode="training"): 222 | self.mode = mode 223 | if mode == "training": 224 | self.size = int(random.choice(np.arange(480, 801, step=32))) 225 | else: 226 | self.size = 640 227 | self.resize = torchvision.transforms.Resize((self.size, self.size)) 228 | 229 | def __call__(self, batch_inputs): 230 | for i, b in enumerate(batch_inputs): 231 | h, w = batch_inputs[i]["image"].shape[1:] 232 | batch_inputs[i]["image"] = self.resize(batch_inputs[i]["image"]) 233 | new_h, new_w = (self.size, self.size) 234 | if self.mode: 235 | batch_inputs[i]["instances"].gt_boxes.tensor *= torch.tensor([new_w/w, new_h/h]).repeat(1, 2) 236 | batch_inputs[i]["instances"]._image_size = (new_h, new_w) 237 | 238 | return batch_inputs 239 | 240 | 241 | def get_contrastive_denoising_training_group(targets, 242 | num_classes, 243 | num_queries, 244 | class_embed, 245 | num_denoising=100, 246 | label_noise_ratio=0.5, 247 | box_noise_scale=1.0): 248 | """ 249 | targets: [targets] that contains labels, bboxes, etc 250 | num_classes: the size of labels 251 | num_queries: 300 252 | class_embed: num_class x batch_size x label_dim OR num_class x batch_size (in the old case) 253 | """ 254 | if num_denoising <= 0: 255 | return None, None, None, None 256 | # number of gt_bboxes in each batch sample 257 | num_gts = [len(t["labels"]) for t in targets] 258 | max_gt_num = max(num_gts) 259 | if max_gt_num == 0: 260 | return None, None, None, None 261 | 262 | num_group = num_denoising // max_gt_num # the number of denoising group given num_denoising 263 | num_group = 1 if num_group == 0 else num_group 264 | # pad gt to max_num of a batch 265 | bs = len(targets) 266 | input_query_class = torch.full((bs, max_gt_num), num_classes, dtype=torch.int32) # batch_size x max_gt_num (initialized with num_class) 267 | input_query_bbox = torch.zeros((bs, max_gt_num, 4)) # batch_size x max_gt_num x 4 268 | pad_gt_mask = torch.zeros((bs, max_gt_num)) 269 | for i in range(bs): 270 | num_gt = num_gts[i] 271 | if num_gt > 0: 272 | input_query_class[i, :num_gt] = targets[i]["labels"].squeeze(-1) 273 | input_query_bbox[i, :num_gt] = targets[i]["boxes"] 274 | pad_gt_mask[i, :num_gt] = 1 275 | # each group has positive and negative queries. 
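    # The padded per-image GT tensors (max_gt_num entries each) are tiled 2 * num_group times:
    # within every block of 2 * max_gt_num queries, the first half serves as the "positive" copy
    # (box noise magnitude in [0, 1) * diff when box_noise_scale > 0) and the second half as the
    # "negative" copy (noise magnitude pushed into [1, 2) * diff via negative_gt_mask).
    # For example, with max_gt_num = 3 and num_denoising = 100, num_group = 100 // 3 = 33,
    # so each image ends up with 3 * 2 * 33 = 198 denoising queries.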
276 | input_query_class = input_query_class.tile([1, 2 * num_group]) # batch_size x (max_gt_num*2*num_group) 277 | input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) 278 | pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) 279 | # positive and negative mask 280 | negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1]) # bs x max_gt_num*2 x 1 281 | negative_gt_mask[:, max_gt_num:] = 1 # set the second half to be NEGATIVE 282 | negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) # bs x max_gt_num*2*num_group x 1 283 | positive_gt_mask = 1 - negative_gt_mask 284 | # contrastive denoising training positive index 285 | positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask 286 | dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1] 287 | dn_positive_idx = torch.split(dn_positive_idx, 288 | [n * num_group for n in num_gts]) # split by batch+soze 289 | # total denoising queries 290 | num_denoising = int(max_gt_num * 2 * num_group) 291 | 292 | if label_noise_ratio > 0: 293 | input_query_class = input_query_class.flatten() # (batch_size*max_gt_num*2*num_group) * 1 294 | pad_gt_mask = pad_gt_mask.flatten() 295 | # half of bbox prob 296 | mask = torch.rand(input_query_class.shape) < (label_noise_ratio * 0.5) 297 | chosen_idx = torch.nonzero(mask * pad_gt_mask).squeeze(-1) 298 | # randomly put a new one here 299 | new_label = torch.randint_like( 300 | chosen_idx, 0, num_classes, dtype=input_query_class.dtype) 301 | input_query_class.scatter_(0, chosen_idx, new_label) 302 | input_query_class = input_query_class.reshape(bs, num_denoising) 303 | pad_gt_mask = pad_gt_mask.reshape(bs, num_denoising) 304 | 305 | if box_noise_scale > 0: 306 | known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox) 307 | 308 | diff = torch.tile(input_query_bbox[..., 2:] * 0.5, 309 | [1, 1, 2]) * box_noise_scale 310 | 311 | rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 312 | rand_part = torch.rand(input_query_bbox.shape) 313 | rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * ( 314 | 1 - negative_gt_mask) 315 | rand_part *= rand_sign 316 | known_bbox += rand_part * diff 317 | known_bbox.clip_(min=0.0, max=1.0) 318 | input_query_bbox = bbox_xyxy2cxcywh(known_bbox) 319 | input_query_bbox = inverse_sigmoid(input_query_bbox) 320 | 321 | fixed_class = class_embed.dim() == 2 322 | if fixed_class: # fixed class embedding. 
num_class * hidden_dim 323 | class_embed = torch.cat( 324 | [class_embed, torch.zeros([1, class_embed.shape[-1]], device=class_embed.device)]) # (num_class+1) * hidden_dim 325 | else: 326 | assert class_embed.dim() == 3 327 | # (num_class+1) x batch_size x hidden_dim 328 | class_embed = torch.cat( 329 | [class_embed, torch.zeros([1, class_embed.shape[-2], class_embed.shape[-1]], device=class_embed.device)]) 330 | 331 | if fixed_class: 332 | input_query_class_index = input_query_class.view(input_query_class.shape[0], -1)\ 333 | .long().flatten().reshape(-1,1).repeat(1, class_embed.shape[-1]) 334 | input_query_class = torch.gather(class_embed.to(input_query_class_index.device), 335 | dim=0, 336 | index=input_query_class_index).reshape([bs, num_denoising, -1]) 337 | else: 338 | temp = [] 339 | input_query_class_index = input_query_class.view(input_query_class.shape[0], -1) \ 340 | .long().flatten().reshape(-1, 1).repeat(1, class_embed.shape[-1]).reshape([bs, num_denoising, -1]) 341 | for b_id in range(bs): 342 | t = torch.gather(class_embed[:, b_id].to(input_query_class_index.device), 343 | dim=0, index=input_query_class_index[b_id]) 344 | temp.append(t) 345 | input_query_class = torch.cat(temp, dim=0).reshape([bs, num_denoising, -1]) 346 | 347 | tgt_size = num_denoising + num_queries 348 | attn_mask = torch.ones([tgt_size, tgt_size]) < 0 349 | # match query cannot see the reconstruction 350 | attn_mask[num_denoising:, :num_denoising] = True 351 | # reconstruct cannot see each other 352 | for i in range(num_group): 353 | if i == 0: 354 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * 355 | 2 * (i + 1):num_denoising] = True 356 | if i == num_group - 1: 357 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * 358 | i * 2] = True 359 | else: 360 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * 361 | 2 * (i + 1):num_denoising] = True 362 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * 363 | 2 * i] = True 364 | attn_mask = ~attn_mask 365 | dn_meta = { 366 | "dn_positive_idx": dn_positive_idx, 367 | "dn_num_group": num_group, 368 | "dn_num_split": [num_denoising, num_queries] 369 | } 370 | 371 | return input_query_class, input_query_bbox, attn_mask, dn_meta 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | -------------------------------------------------------------------------------- /omdet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Linker Tech, Inc. and its affiliates. 
All Rights Reserved 2 | -------------------------------------------------------------------------------- /omdet/utils/analyze_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import Counter 3 | 4 | import numpy as np 5 | from detectron2.checkpoint import DetectionCheckpointer 6 | from detectron2.config import CfgNode, instantiate 7 | from detectron2.data import build_detection_test_loader 8 | from detectron2.modeling import build_model 9 | from detectron2.utils.analysis import FlopCountAnalysis 10 | from fvcore.nn import flop_count_table 11 | 12 | __all__=["do_flop"] 13 | 14 | logger = logging.getLogger("detectron2") 15 | 16 | def do_flop(cfg): 17 | if isinstance(cfg, CfgNode): 18 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TRAIN[0]) 19 | model = build_model(cfg) 20 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 21 | else: 22 | data_loader = instantiate(cfg.dataloader.test) 23 | model = instantiate(cfg.model) 24 | model.to(cfg.train.device) 25 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 26 | model.eval() 27 | 28 | counts = Counter() 29 | total_flops = [] 30 | for idx, data in zip(range(10), data_loader): # noqa 31 | flops = FlopCountAnalysis(model, data) 32 | if idx > 0: 33 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 34 | counts += flops.by_operator() 35 | total_flops.append(flops.total()) 36 | 37 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 38 | logger.info( 39 | "Average GFlops for each type of operators:\n" 40 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 41 | ) 42 | logger.info( 43 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 44 | ) 45 | -------------------------------------------------------------------------------- /omdet/utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Linker Tech, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 
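Boxes are handled in two conventions, corner form (x1, y1, x2, y2) and center form
(cx, cy, w, h); the helpers below convert between them and compute (generalized) IoU.

A minimal round-trip sketch (the coordinates are illustrative only):

    >>> import torch
    >>> b = torch.tensor([[10., 10., 30., 50.]])      # xyxy
    >>> box_xyxy_to_cxcywh(b)
    tensor([[20., 30., 20., 40.]])
    >>> box_cxcywh_to_xyxy(box_xyxy_to_cxcywh(b))     # back to xyxy
    tensor([[10., 10., 30., 50.]])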
4 | """ 5 | import numpy as np 6 | import torch 7 | from torchvision.ops.boxes import box_area 8 | 9 | 10 | def box_cxcywh_to_xyxy(x): 11 | x_c, y_c, w, h = x.unbind(-1) 12 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 13 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 14 | return torch.stack(b, dim=-1) 15 | 16 | 17 | def box_xyxy_to_cxcywh(x): 18 | x0, y0, x1, y1 = x.unbind(-1) 19 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 20 | (x1 - x0), (y1 - y0)] 21 | return torch.stack(b, dim=-1) 22 | 23 | 24 | # modified from torchvision to also return the union 25 | def box_iou(boxes1, boxes2): 26 | area1 = box_area(boxes1) 27 | area2 = box_area(boxes2) 28 | 29 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 30 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 31 | 32 | wh = (rb - lt).clamp(min=0) # [N,M,2] 33 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 34 | 35 | union = area1[:, None] + area2 - inter 36 | 37 | iou = inter / union 38 | return iou, union 39 | 40 | 41 | def generalized_box_iou(boxes1, boxes2): 42 | """ 43 | Generalized IoU from https://giou.stanford.edu/ 44 | 45 | The boxes should be in [x0, y0, x1, y1] format 46 | 47 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 48 | and M = len(boxes2) 49 | """ 50 | # degenerate boxes gives inf / nan results 51 | # so do an early check 52 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 53 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 54 | iou, union = box_iou(boxes1, boxes2) 55 | 56 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 57 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 58 | 59 | wh = (rb - lt).clamp(min=0) # [N,M,2] 60 | area = wh[:, :, 0] * wh[:, :, 1] 61 | 62 | return iou - (area - union) / area 63 | 64 | 65 | def masks_to_boxes(masks): 66 | """Compute the bounding boxes around the provided masks 67 | 68 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 69 | 70 | Returns a [N, 4] tensors, with the boxes in xyxy format 71 | """ 72 | if masks.numel() == 0: 73 | return torch.zeros((0, 4), device=masks.device) 74 | 75 | h, w = masks.shape[-2:] 76 | 77 | y = torch.arange(0, h, dtype=torch.float) 78 | x = torch.arange(0, w, dtype=torch.float) 79 | y, x = torch.meshgrid(y, x) 80 | 81 | x_mask = (masks * x.unsqueeze(0)) 82 | x_max = x_mask.flatten(1).max(-1)[0] 83 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 84 | 85 | y_mask = (masks * y.unsqueeze(0)) 86 | y_max = y_mask.flatten(1).max(-1)[0] 87 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 88 | 89 | return torch.stack([x_min, y_min, x_max, y_max], 1) 90 | 91 | 92 | def xyxy2xywh(x): 93 | """ 94 | Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height) format where (x1, y1) is the 95 | top-left corner and (x2, y2) is the bottom-right corner. 96 | 97 | Args: 98 | x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x1, y1, x2, y2) format. 99 | 100 | Returns: 101 | y (np.ndarray | torch.Tensor): The bounding box coordinates in (x, y, width, height) format. 
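    Example (illustrative sketch; the array values are made up, and the same call also works
    on a torch.Tensor):
        >>> import numpy as np
        >>> boxes = np.array([[10., 10., 30., 50.]])  # (x1, y1, x2, y2)
        >>> xyxy2xywh(boxes)                          # -> (x center, y center, width, height)
        array([[20., 30., 20., 40.]])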
102 | """ 103 | assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}" 104 | y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy 105 | y[..., 0] = (x[..., 0] + x[..., 2]) / 2 # x center 106 | y[..., 1] = (x[..., 1] + x[..., 3]) / 2 # y center 107 | y[..., 2] = x[..., 2] - x[..., 0] # width 108 | y[..., 3] = x[..., 3] - x[..., 1] # height 109 | return y 110 | 111 | 112 | def xywh2xyxy(x): 113 | """ 114 | Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the 115 | top-left corner and (x2, y2) is the bottom-right corner. 116 | 117 | Args: 118 | x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format. 119 | 120 | Returns: 121 | y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format. 122 | """ 123 | assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}" 124 | y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy 125 | dw = x[..., 2] / 2 # half-width 126 | dh = x[..., 3] / 2 # half-height 127 | y[..., 0] = x[..., 0] - dw # top left x 128 | y[..., 1] = x[..., 1] - dh # top left y 129 | y[..., 2] = x[..., 0] + dw # bottom right x 130 | y[..., 3] = x[..., 1] + dh # bottom right y 131 | return y -------------------------------------------------------------------------------- /omdet/utils/cache.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import lmdb 3 | from collections import OrderedDict 4 | 5 | 6 | class LRUCache: 7 | # initialising capacity 8 | def __init__(self, capacity: int): 9 | self.cache = OrderedDict() 10 | self.capacity = capacity 11 | 12 | def has(self, key) -> bool: 13 | return key in self.cache 14 | 15 | # we return the value of the key 16 | # that is queried in O(1) and return -1 if we 17 | # don't find the key in out dict / cache. 18 | # And also move the key to the end 19 | # to show that it was recently used. 20 | def get(self, key): 21 | if key not in self.cache: 22 | return None 23 | else: 24 | self.cache.move_to_end(key) 25 | return self.cache[key] 26 | 27 | # first, we add / update the key by conventional methods. 28 | # And also move the key to the end to show that it was recently used. 
29 | # But here we will also check whether the length of our 30 | # ordered dictionary has exceeded our capacity, 31 | # If so we remove the first key (least recently used) 32 | def put(self, key, value) -> None: 33 | self.cache[key] = value 34 | self.cache.move_to_end(key) 35 | if len(self.cache) > self.capacity: 36 | self.cache.popitem(last=False) 37 | 38 | def pop(self, key, value): 39 | self.cache.pop(key, None) 40 | 41 | 42 | class LmdbReader: 43 | def __init__(self, path): 44 | self.path = path 45 | self.env = self.init_lmdb(path) 46 | 47 | def init_lmdb(self, l_path): 48 | env = lmdb.open( 49 | l_path, readonly=True, 50 | create=False, lock=False) # readahead=not _check_distributed() 51 | txn = env.begin(buffers=True) 52 | return txn 53 | 54 | def read(self, _id): 55 | try: 56 | value = self.env.get(str(_id).encode("utf-8")) 57 | value = pkl.loads(value) 58 | return value 59 | except Exception as e: 60 | print("Error in reading {} from {}".format(_id, self.path)) 61 | raise e 62 | -------------------------------------------------------------------------------- /omdet/utils/plots.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import cv2 5 | import matplotlib 6 | import numpy as np 7 | import torch 8 | from PIL import Image, ImageDraw, ImageFont, ImageOps 9 | import platform 10 | import math 11 | 12 | def is_writeable(dir, test=False): 13 | # Return True if directory has write permissions, test opening a file with write permissions if test=True 14 | if test: # method 1 15 | file = Path(dir) / 'tmp.txt' 16 | try: 17 | with open(file, 'w'): # open file with write permissions 18 | pass 19 | file.unlink() # remove file 20 | return True 21 | except IOError: 22 | return False 23 | else: # method 2 24 | return os.access(dir, os.R_OK) # possible issues on Windows 25 | 26 | def user_config_dir(dir='Ultralytics', env_var='YOLOV5_CONFIG_DIR'): 27 | # Return path of user configuration directory. Prefer environment variable if exists. Make dir if required. 28 | env = os.getenv(env_var) 29 | if env: 30 | path = Path(env) # use environment variable 31 | else: 32 | cfg = {'Windows': 'AppData/Roaming', 'Linux': '.config', 'Darwin': 'Library/Application Support'} # 3 OS dirs 33 | path = Path.home() / cfg.get(platform.system(), '') # OS-specific config dir 34 | path = (path if is_writeable(path) else Path('/tmp')) / dir # GCP and AWS lambda fix, only /tmp is writeable 35 | path.mkdir(exist_ok=True) # make if required 36 | return path 37 | 38 | # Settings 39 | CONFIG_DIR = user_config_dir() # Ultralytics settings dir 40 | RANK = int(os.getenv('RANK', -1)) 41 | matplotlib.rc('font', **{'size': 11}) 42 | matplotlib.use('Agg') # for writing to files only 43 | 44 | def check_font(font='Arial.ttf', size=10): 45 | # Return a PIL TrueType Font, downloading to CONFIG_DIR if necessary 46 | font = Path(font) 47 | font = font if font.exists() else (CONFIG_DIR / font.name) 48 | try: 49 | return ImageFont.truetype(str(font) if font.exists() else font.name, size) 50 | except Exception as e: # download if missing 51 | url = "https://ultralytics.com/assets/" + font.name 52 | print(f'Downloading {url} to {font}...') 53 | torch.hub.download_url_to_file(url, str(font), progress=False) 54 | return ImageFont.truetype(str(font), size) 55 | 56 | def is_ascii(s=''): 57 | # Is string composed of all ASCII (no UTF) characters? 58 | s = str(s) # convert list, tuple, None, etc. 
to str
59 |     return len(s.encode().decode('ascii', 'ignore')) == len(s)
60 | 
61 | 
62 | class Annotator:
63 |     # if RANK in (-1, 0):
64 |     #     check_font()  # download TTF if necessary
65 | 
66 |     # YOLOv5 Annotator for train/val mosaics and jpgs and detect/hub inference annotations
67 |     def __init__(self, im, line_width=None, font_size=None, font='Arial.ttf', pil=True):
68 |         assert im.data.contiguous, 'Image not contiguous. Apply np.ascontiguousarray(im) to Annotator() input images.'
69 |         self.pil = pil
70 |         self.offset = 0
71 |         self.lw = line_width or max(round(sum(im.shape) / 2 * 0.003), 2)  # line width, used by both PIL and cv2 branches
72 |         if self.pil:  # use PIL
73 |             self.im = im if isinstance(im, Image.Image) else Image.fromarray(im)
74 |             self.im = ImageOps.expand(self.im, border=self.offset, fill=(255, 255, 255))
75 |             self.draw = ImageDraw.Draw(self.im)
76 |             self.font = check_font(font, size=font_size or max(round(sum(self.im.size) / 2 * 0.035), 12))
77 |             self.fh = 5  # font height
78 |         else:  # use cv2
79 |             self.im = im
80 | 
81 |     def _offset_box(self, box):
82 |         return (np.array(box) + self.offset).tolist()
83 | 
84 |     def draw_arrow(self, ptA, ptB, width=1, color=(0, 255, 0)):
85 |         """Draw line from ptA to ptB with arrowhead at ptB"""
86 |         # Get drawing context
87 |         # Draw the line without arrows
88 |         self.draw.line((ptA, ptB), width=width, fill=color)
89 | 
90 |         # Now work out the arrowhead
91 |         # = it will be a triangle with one vertex at ptB
92 |         # - it will start at 95% of the length of the line
93 |         # - it will extend 8 pixels either side of the line
94 |         x0, y0 = ptA
95 |         x1, y1 = ptB
96 |         # Now we can work out the x,y coordinates of the bottom of the arrowhead triangle
97 |         xb = 0.95 * (x1 - x0) + x0
98 |         yb = 0.95 * (y1 - y0) + y0
99 | 
100 |         # Work out the other two vertices of the triangle
101 |         # Check if line is vertical
102 |         if x0 == x1:
103 |             vtx0 = (xb - 5, yb)
104 |             vtx1 = (xb + 5, yb)
105 |         # Check if line is horizontal
106 |         elif y0 == y1:
107 |             vtx0 = (xb, yb + 5)
108 |             vtx1 = (xb, yb - 5)
109 |         else:
110 |             alpha = math.atan2(y1 - y0, x1 - x0) - 90 * math.pi / 180
111 |             a = 8 * math.cos(alpha)
112 |             b = 8 * math.sin(alpha)
113 |             vtx0 = (xb + a, yb + b)
114 |             vtx1 = (xb - a, yb - b)
115 | 
116 |         # draw.point((xb,yb), fill=(255,0,0))  # DEBUG: draw point of base in red - comment out draw.polygon() below if using this line
117 |         # im.save('DEBUG-base.png')  # DEBUG: save
118 | 
119 |         # Now draw the arrowhead triangle
120 |         self.draw.polygon([vtx0, vtx1, ptB], fill=color)
121 | 
122 |     def box_label(self, box, label='', sub_label='', color=(128, 128, 128), txt_color=(255, 255, 255)):
123 |         # Add one xyxy box to image with label
124 |         box = self._offset_box(box)
125 |         if self.pil or not is_ascii(label):
126 |             self.draw.rectangle(box, width=self.lw, outline=color)  # box
127 |             if label:
128 |                 w, h = 2, 2  # text width
129 |                 self.draw.rectangle([box[0], box[1] - self.fh, box[0] + w + 1, box[1] + 1], fill=color)
130 |                 # self.draw.text((box[0], box[1]), label, fill=txt_color, font=self.font, anchor='ls')  # for PIL>8.0
131 |                 self.draw.text((box[0], box[1] - h), label + '\n' + sub_label, fill=txt_color, font=self.font)
132 |         else:  # cv2
133 |             c1, c2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3]))
134 |             cv2.rectangle(self.im, c1, c2, color, thickness=self.lw, lineType=cv2.LINE_AA)
135 |             if label:
136 |                 tf = max(self.lw - 1, 1)  # font thickness
137 |                 w, h = cv2.getTextSize(label, 0, fontScale=self.lw / 3, thickness=tf)[0]
138 |                 c2 = c1[0] + w, c1[1] - h - 3
139 |                 cv2.rectangle(self.im, c1, c2, color, -1, cv2.LINE_AA)  # filled
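                # NOTE: this cv2 branch is only reached for ASCII labels (see the is_ascii check above); non-ASCII text is rendered through the PIL branch with a TrueType font.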
140 |                 cv2.putText(self.im, label, (c1[0], c1[1] - 2), 0, self.lw / 3, txt_color,
141 |                             thickness=tf,
142 |                             lineType=cv2.LINE_AA)
143 | 
144 |     def tuple_label(self, src_box, dest_box, label='', src_color='red', dest_color='blue', txt_color=(255, 255, 255)):
145 |         # Add a pair of xyxy boxes with a connecting arrow and label
146 |         src_box = self._offset_box(src_box)
147 |         dest_box = self._offset_box(dest_box)
148 | 
149 |         if self.pil or not is_ascii(label):
150 |             self.draw.rectangle(src_box, width=self.lw, outline=src_color)  # box
151 |             self.draw.rectangle(dest_box, width=self.lw, outline=dest_color)  # box
152 |             src_c = (int((src_box[2] + src_box[0]) / 2), int((src_box[3] + src_box[1]) / 2))
153 |             dest_c = (int((dest_box[2] + dest_box[0]) / 2), int((dest_box[3] + dest_box[1]) / 2))
154 |             c_c = [(src_c[0] + dest_c[0]) / 2, (src_c[1] + dest_c[1]) / 2]
155 |             # self.draw.line(xy=[src_c, dest_c], fill='green')
156 |             self.draw_arrow(src_c, dest_c, color='green', width=2)
157 | 
158 |             if label:
159 |                 w, h = self.font.getsize(label)  # text width, height
160 |                 self.draw.rectangle([c_c[0], c_c[1] - self.fh, c_c[0] + w + 1, c_c[1] + 1], fill='green')
161 |                 self.draw.text((c_c[0], c_c[1] - h), label, fill=txt_color, font=self.font)
162 | 
163 |         else:  # cv2
164 |             raise Exception("CV2 is not supported yet")
165 | 
166 |     def rectangle(self, xy, fill=None, outline=None, width=1):
167 |         # Add rectangle to image (PIL-only)
168 |         self.draw.rectangle(xy, fill, outline, width)
169 | 
170 |     def text(self, xy, text, txt_color=(255, 255, 255)):
171 |         # Add text to image (PIL-only)
172 |         w, h = self.font.getsize(text)  # text width, height
173 |         self.draw.text((xy[0], xy[1] - h + 1), text, fill=txt_color, font=self.font)
174 | 
175 |     def result(self):
176 |         # Return annotated image as array
177 |         return np.asarray(self.im)
178 | 
179 | 
180 | 
181 | 
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | 
--------------------------------------------------------------------------------
/omdet/utils/registry.py:
--------------------------------------------------------------------------------
1 | def _register_generic(module_dict, module_name, module):
2 |     assert module_name not in module_dict
3 |     module_dict[module_name] = module
4 | 
5 | 
6 | class Registry(dict):
7 |     '''
8 |     A helper class for managing module registration; it extends a dictionary
9 |     and provides register functions.
10 |     Eg. creating a registry:
11 |         some_registry = Registry({"default": default_module})
12 |     There are two ways of registering new modules:
13 |     1): the normal way is just calling the register function:
14 |         def foo():
15 |             ...
16 |         some_registry.register("foo_module", foo)
17 |     2): used as a decorator when declaring the module:
18 |         @some_registry.register("foo_module")
19 |         @some_registry.register("foo_module_nickname")
20 |         def foo():
21 |             ...
22 | Access of module is just like using a dictionary, eg: 23 | f = some_registry["foo_modeul"] 24 | ''' 25 | def __init__(self, *args, **kwargs): 26 | super(Registry, self).__init__(*args, **kwargs) 27 | 28 | def register(self, module_name, module=None): 29 | # used as function call 30 | if module is not None: 31 | _register_generic(self, module_name, module) 32 | return 33 | 34 | # used as decorator 35 | def register_fn(fn): 36 | _register_generic(self, module_name, fn) 37 | return fn 38 | 39 | return register_fn -------------------------------------------------------------------------------- /omdet/utils/tools.py: -------------------------------------------------------------------------------- 1 | import io 2 | import base64 3 | import re 4 | from PIL import ImageDraw, Image 5 | import lmdb 6 | from detectron2.data import transforms as T 7 | import logging 8 | from tqdm import tqdm 9 | import os 10 | from detectron2.data import detection_utils as utils 11 | import pickle 12 | import numpy as np 13 | from detectron2.config import CfgNode 14 | from typing import Generator, Sequence 15 | from joblib import Parallel, delayed 16 | import torch 17 | import random 18 | 19 | def make_continuous_categories(cats, verbose=True): 20 | # return a continuous categord_id from 1 to num_classes 21 | diff_cnt = 0 22 | for c_id, c in enumerate(cats): 23 | if c['id'] != c_id+1: 24 | diff_cnt += 1 25 | c['id'] = c_id + 1 26 | 27 | if verbose: 28 | print("Changed {} category_id among {} cats".format(diff_cnt, len(cats))) 29 | 30 | return cats 31 | 32 | def is_overlap(a, b): 33 | if b[1] - b[0] == 0 or a[1] - a[0] == 0: 34 | return False 35 | 36 | return a[0] <= b[0] < a[1] or b[0] <= a[0] < b[1] 37 | 38 | 39 | def get_span_embedding(model, tokenizer, sent, spans, layers, device): 40 | assert len(sent) == len(spans) 41 | encoded = tokenizer.batch_encode_plus(sent, return_tensors="pt", padding=True) 42 | encoded = encoded.to(device) 43 | # token_ids_word = np.where(np.array(encoded.word_ids()) == idx) 44 | with torch.no_grad(): 45 | output = model(**encoded) 46 | 47 | # Get all hidden states 48 | states = output.hidden_states 49 | # Stack and sum all requested layers 50 | output = torch.stack([states[i] for i in layers]).sum(0).squeeze() 51 | 52 | # Only select the tokens that constitute the requested word 53 | results = [] 54 | for b_id, b_span in enumerate(spans): 55 | offsets = encoded.encodings[b_id].offsets 56 | feats = [] 57 | valid_offsets = [] 58 | for t_id, t_span in enumerate(offsets): 59 | valid = False 60 | for s in b_span: 61 | if is_overlap(t_span, s): 62 | valid = True 63 | break 64 | if valid: 65 | feats.append(output[b_id, t_id].view(1, -1)) 66 | valid_offsets.append(t_span) 67 | 68 | if len(feats) == 0: 69 | raise Exception(f"Sentence '{sent[b_id]}' ({len(sent[b_id])}) cannot find valid span for {b_span}.") 70 | 71 | res = torch.mean(torch.stack(feats, dim=0), dim=0).cpu().tolist() 72 | results.append(res[0]) 73 | return results 74 | 75 | 76 | def get_txt_embedding(model, sent): 77 | txt_embedding = model._text_encode(sent) 78 | return txt_embedding 79 | 80 | 81 | def clean_t(x, max_len, rm_sym=True, must_idx=None, return_offset=False): 82 | """ 83 | rm_sym: remove symbol _ 84 | """ 85 | s_id = 0 86 | x = x.lower() 87 | if rm_sym: 88 | x = x.replace('_', ' ').replace('-', ' ') 89 | x = ' '.join(x.split()) # remove duplicate space 90 | 91 | if must_idx is not None: 92 | min_id, max_id = must_idx 93 | if max_id >= max_len: 94 | s_id = max(0, min(min_id, int(max_id - (max_len / 2)))) 95 | e_id = 
min(len(x), int(max_id + (max_len / 2))) 96 | # print(f"Special cut ({must_idx}): from {s_id} to {e_id} for sent of len {len(x)}") 97 | x = x[s_id:e_id] 98 | else: 99 | x = x[0:max_len] 100 | if return_offset: 101 | return x, s_id 102 | else: 103 | return x 104 | 105 | def sample_true(prob): 106 | if prob <= 0: 107 | return False 108 | generated_neg_prob = random.random() 109 | valid = generated_neg_prob < prob 110 | return valid 111 | 112 | def rm_duplicates(input_list, keep_order=False): 113 | if not keep_order: 114 | return list(set(input_list)) 115 | 116 | # Create an empty set to store the items that have been seen 117 | seen = set() 118 | 119 | # Create an empty list to store the result 120 | result = [] 121 | 122 | # Iterate over the input list 123 | for item in input_list: 124 | # If the item is not already in the seen set, add it to the result list 125 | if item not in seen: 126 | result.append(item) 127 | 128 | # Add the item to the seen set 129 | seen.add(item) 130 | 131 | # Return the result list 132 | return result 133 | 134 | 135 | def chunks(l: Sequence, n: int = 5) -> Generator[Sequence, None, None]: 136 | """Yield successive n-sized chunks from l.""" 137 | for i in range(0, len(l), n): 138 | yield l[i:i + n] 139 | 140 | 141 | def encode_dump_text(model, feat_path, text_vocab, batch_size): 142 | text_keys = [] 143 | for block in tqdm(chunks(text_vocab, n=batch_size)): 144 | block_feats = [] 145 | block_keys = [] 146 | for batch in chunks(block, n=500): 147 | batch_fs = get_txt_embedding(model, batch) 148 | batch_keys = batch 149 | block_feats.extend(batch_fs) 150 | block_keys.extend(batch_keys) 151 | 152 | text_keys.extend(block_keys) 153 | write_lmdb_from_id_data_pairs( 154 | id_data_pairs=[(key, embed) for key, embed in zip(block_keys, block_feats)], 155 | lmdb_save_dir=feat_path 156 | ) 157 | return text_keys 158 | 159 | 160 | def cropbox(xmin, ymin, xmax, ymax, img_size, ratio=1.5, make_square=False): 161 | if xmin < 0 or ymin < 0 or xmax < 0 or ymax < 0: 162 | raise Exception 163 | w, h = img_size 164 | if xmin > w or ymin > h or xmax > w or ymax > h: 165 | raise Exception 166 | 167 | xc = xmin + (xmax - xmin) / 2 168 | yc = ymin + (ymax - ymin) / 2 169 | w = xmax - xmin 170 | h = ymax - ymin 171 | nw = w * ratio 172 | nh = h * ratio 173 | 174 | if make_square: 175 | if nw > nh: 176 | nh = nw 177 | else: 178 | nw = nh 179 | 180 | nxmin = max(xc - (nw / 2), 0) 181 | nymin = max(yc - (nh / 2), 0) 182 | 183 | nxmax = min(xc + (nw / 2), img_size[0]) 184 | nymax = min(yc + (nh / 2), img_size[1]) 185 | 186 | return nxmin, nymin, nxmax, nymax 187 | 188 | 189 | def image_to_base64(img): 190 | output_buffer = io.BytesIO() 191 | img.save(output_buffer, format='JPEG') 192 | byte_data = output_buffer.getvalue() 193 | base64_str = base64.b64encode(byte_data) 194 | return base64_str 195 | 196 | 197 | def base64_to_image(base64_str): 198 | return Image.open(io.BytesIO(base64.b64decode(base64_str))) 199 | 200 | 201 | def draw_bounding_box_on_image(image, xmin, ymin, xmax, ymax, 202 | color='red', 203 | text='', 204 | thickness=4): 205 | draw = ImageDraw.Draw(image) 206 | draw.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=thickness) 207 | draw.text((xmin, ymin), text) 208 | return image 209 | 210 | 211 | def build_transform_gen(cfg, is_train): 212 | """ 213 | Create a list of :class:`TransformGen` from config. 
214 | Returns: 215 | list[TransformGen] 216 | """ 217 | if is_train: 218 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 219 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 220 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 221 | else: 222 | min_size = cfg.INPUT.MIN_SIZE_TEST 223 | max_size = cfg.INPUT.MAX_SIZE_TEST 224 | sample_style = "choice" 225 | if sample_style == "range": 226 | assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size)) 227 | 228 | tfm_gens = [] 229 | if is_train: 230 | tfm_gens.append(T.RandomFlip()) 231 | tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 232 | # tfm_gens.append(T.Resize(min_size)) 233 | if is_train: 234 | logger = logging.getLogger(__name__) 235 | logger.info("TransformGens used in training: " + str(tfm_gens)) 236 | return tfm_gens 237 | 238 | 239 | def jp(a, b): 240 | return os.path.join(a, b) 241 | 242 | 243 | def check_img(i, img_root): 244 | # i['file_name'] = i['file_name'].split('/')[-1] 245 | try: 246 | iimage = utils.read_image(jp(img_root, i["file_name"]), format='RGB') 247 | utils.check_image_size(i, iimage) 248 | 249 | except Exception as e: 250 | print("BAD D2 IMG", i) 251 | if 'image_id' in i: 252 | return i['image_id'] 253 | else: 254 | return i['id'] 255 | 256 | return None 257 | 258 | 259 | def fix_img_size(i, img_root): 260 | try: 261 | if not "file_name" in i: 262 | i["file_name"] = i["coco_url"].split("/")[-1] 263 | img = Image.open(jp(img_root, i['file_name'])) 264 | w, h = img.size 265 | if i['width'] != w or i['height'] != h: 266 | print("Found image {} with wrong size.\n".format(i['id'])) 267 | i['width'] = w 268 | i['height'] = h 269 | 270 | return i 271 | except Exception as e: 272 | print("BAD IMG", i, e) 273 | return None 274 | 275 | 276 | def fix_data(img_root, data): 277 | if type(data) is dict: 278 | num_imgs = len(data['images']) 279 | data['images'] = Parallel(n_jobs=15, backend='threading')( 280 | delayed(fix_img_size)(i, img_root) for i in tqdm(data['images'])) 281 | data['images'] = [i for i in data['images'] if i is not None] 282 | print("First stage image fixing go from {} to {}".format(num_imgs, len(data['images']))) 283 | 284 | bad_ids = Parallel(n_jobs=15, backend='threading')(delayed(check_img)(i, img_root) for i in tqdm(data['images'])) 285 | bad_ids = [x for x in set(bad_ids) if x is not None] 286 | print("Found {} bad images with D2 checking".format(len(bad_ids))) 287 | data['images'] = [d for d in data['images'] if d['id'] not in bad_ids] 288 | print("Images go from {} to {}".format(num_imgs, len(data['images']))) 289 | 290 | prev_anno_size = len(data['annotations']) 291 | valid_imgs = {i['id'] for i in data['images']} 292 | data['annotations'] = [d for d in data['annotations'] if d['image_id'] in valid_imgs] 293 | print("Anno go from {} to {} after fixing.".format(prev_anno_size, len(data['annotations']))) 294 | else: 295 | num_imgs = len(data) 296 | data = Parallel(n_jobs=15, backend='threading')(delayed(fix_img_size)(i, img_root) for i in tqdm(data)) 297 | data = [i for i in data if i is not None] 298 | print("First stage image fixing go from {} to {}".format(num_imgs, len(data))) 299 | 300 | bad_ids = Parallel(n_jobs=15, backend='threading')(delayed(check_img)(i, img_root) for i in tqdm(data)) 301 | bad_ids = [x for x in set(bad_ids) if x is not None] 302 | print("Found {} bad images with D2 checking".format(len(bad_ids))) 303 | data = [d for d in data if d['id'] not in bad_ids] 304 | print("Images go from {} to {}".format(num_imgs, len(data))) 305 | 
return data 306 | 307 | 308 | def convert_cfg_to_dict(cfg_node, key_list): 309 | if not isinstance(cfg_node, CfgNode): 310 | return cfg_node 311 | else: 312 | cfg_dict = dict(cfg_node) 313 | for k, v in cfg_dict.items(): 314 | cfg_dict[k] = convert_cfg_to_dict(v, key_list + [k]) 315 | return cfg_dict 316 | 317 | 318 | def flatten_json(json_file): 319 | out = {} 320 | 321 | def flatten(x, name=''): 322 | if type(x) is dict: 323 | for a in x: 324 | flatten(x[a], name + a + '.') 325 | elif type(x) is list: 326 | i = 0 327 | for a in x: 328 | flatten(a, name + str(i) + '.') 329 | i += 1 330 | else: 331 | out[name[:-1]] = x 332 | 333 | flatten(json_file) 334 | return out 335 | 336 | 337 | def convert_to_float(value): 338 | if isinstance(value, float): 339 | return value 340 | try: # try pytorch 341 | return value.item() 342 | except: 343 | try: # try numpy 344 | print(value.dtype) 345 | return np.asscalar(value) 346 | except: 347 | raise ValueError('do not know how to convert this number {} to float'.format(value)) 348 | 349 | 350 | def remove_punctuation(text: str) -> str: 351 | punct = ['|', ':', ';', '@', '(', ')', '[', ']', '{', '}', '^', 352 | '\'', '\"', '’', '`', '?', '$', '%', '#', '!', '&', '*', '+', ',', '.' 353 | ] 354 | for p in punct: 355 | text = text.replace(p, '') 356 | return text.strip() -------------------------------------------------------------------------------- /outputs/000000574769.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/outputs/000000574769.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | timm==0.9.16 2 | transformers==4.21.0 3 | lmdb==1.4.1 4 | Pillow==8.4.0 5 | ftfy==6.2.0 6 | joblib==1.3.2 7 | opencv-python==4.7.0.72 8 | pydantic 9 | fastapi 10 | uvicorn -------------------------------------------------------------------------------- /run_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from omdet.inference.det_engine import DetEngine 4 | from omdet.utils.plots import Annotator 5 | from PIL import Image 6 | import numpy as np 7 | 8 | if __name__ == "__main__": 9 | engine = DetEngine(batch_size=1, device='cuda') 10 | img_paths = ['./sample_data/000000574769.jpg'] # path of images 11 | labels = ["person", "cat", "orange"] # labels to be predicted 12 | prompt = 'Detect {}.'.format(','.join(labels)) # prompt of detection task, use "Detect {}." 
as default 13 | 14 | res = engine.inf_predict('OmDet-Turbo_tiny_SWIN_T', # prefix name of the pretrained checkpoints 15 | task=prompt, 16 | data=img_paths, 17 | labels=labels, 18 | src_type='local', # type of the image_paths, "local"/"url" 19 | conf_threshold=0.30, 20 | nms_threshold=0.5 21 | ) 22 | print(res) 23 | 24 | out_folder = './outputs' 25 | for idx, img_path in enumerate(img_paths): 26 | im = Image.open(img_path) 27 | a = Annotator(np.ascontiguousarray(im), font_size=12, line_width=1, pil=True, font='sample_data/simsun.ttc') 28 | for R in res[idx]: 29 | a.box_label([R['xmin'], R['ymin'], R['xmax'], R['ymax']], 30 | label=f"{R['label']} {str(int(R['conf'] * 100))}%", 31 | color='red') 32 | 33 | if not os.path.exists(out_folder): 34 | os.mkdir(out_folder) 35 | 36 | image = a.result() 37 | img = Image.fromarray(image) 38 | img.save('outputs/'+img_path.split('/')[-1]) -------------------------------------------------------------------------------- /run_wsgi.py: -------------------------------------------------------------------------------- 1 | import time 2 | import uvicorn 3 | from fastapi import FastAPI 4 | from omdet.inference.det_engine import DetEngine 5 | from pydantic import BaseModel 6 | from typing import List, Dict, Union 7 | 8 | 9 | class InfDetectBody(BaseModel): 10 | model_id: str 11 | data: List[str] 12 | src_type: str = "url" 13 | task: str 14 | labels: List[str] 15 | threshold: float = 0.1 16 | nms_threshold: float = 0.5 17 | 18 | 19 | class Object(BaseModel): 20 | xmin: float 21 | ymin: float 22 | xmax: float 23 | ymax: float 24 | conf: float 25 | label: str 26 | 27 | 28 | class DetectionRes(BaseModel): 29 | took: int 30 | objects: List[List[Object]] = [] 31 | 32 | 33 | app = FastAPI() 34 | 35 | 36 | @app.on_event("startup") 37 | async def startup_event(): 38 | app.state.detector = DetEngine(model_dir="resources/", device="cuda", batch_size=10) 39 | 40 | 41 | @app.post( 42 | "/inf_predict", 43 | response_model=DetectionRes, 44 | name="Detect objects with Inf Possibilities", 45 | ) 46 | async def detect_urls( 47 | body: InfDetectBody = None, 48 | ) -> DetectionRes: 49 | s_time = time.time() 50 | out = app.state.detector.inf_predict( 51 | body.model_id, 52 | task=body.task, 53 | labels=body.labels, 54 | data=body.data, 55 | src_type=body.src_type, 56 | conf_threshold=body.threshold, 57 | nms_threshold=body.nms_threshold, 58 | ) 59 | 60 | resp = DetectionRes(took=int((time.time() - s_time) * 1000), objects=out) 61 | return resp 62 | 63 | 64 | if __name__ == "__main__": 65 | uvicorn.run("run_wsgi:app", host="0.0.0.0", port=8000) 66 | -------------------------------------------------------------------------------- /sample_data/000000574769.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/sample_data/000000574769.jpg -------------------------------------------------------------------------------- /sample_data/simsun.ttc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/sample_data/simsun.ttc --------------------------------------------------------------------------------
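
For reference, a minimal client sketch for the /inf_predict endpoint exposed by run_wsgi.py. It assumes the service has been started with `python run_wsgi.py` (listening on port 8000 with the OmDet-Turbo_tiny_SWIN_T checkpoint available under resources/), that the `requests` package is installed (it is not listed in requirements.txt), and that the image URL below is a reachable placeholder; adjust these to your setup.

import requests

# Request body mirrors InfDetectBody in run_wsgi.py; the prompt follows the
# "Detect {}." convention used in run_demo.py.
payload = {
    "model_id": "OmDet-Turbo_tiny_SWIN_T",
    "data": ["https://example.com/000000574769.jpg"],  # placeholder image URL
    "src_type": "url",
    "task": "Detect person,cat,orange.",
    "labels": ["person", "cat", "orange"],
    "threshold": 0.3,
    "nms_threshold": 0.5,
}

resp = requests.post("http://localhost:8000/inf_predict", json=payload, timeout=300)
resp.raise_for_status()
res = resp.json()  # DetectionRes: {"took": <ms>, "objects": [[{xmin, ymin, xmax, ymax, conf, label}, ...]]}
print(f"took {res['took']} ms")
for obj in res["objects"][0]:
    print(obj["label"], round(obj["conf"], 3), obj["xmin"], obj["ymin"], obj["xmax"], obj["ymax"])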