├── .gitignore ├── LICENSE ├── README.md ├── add_hf.py ├── configs └── OmDet-Turbo_tiny_SWIN_T.yaml ├── docs ├── cvt_grounding_dino-en.md ├── cvt_grounding_dino-zh.md ├── main_results.png ├── speed_compare.jpeg └── turbo_model.jpeg ├── export.py ├── install.md ├── omdet ├── __init__.py ├── inference │ ├── __init__.py │ ├── base_engine.py │ └── det_engine.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── bifpn.py │ │ ├── config.py │ │ ├── convnext.py │ │ ├── dlafpn.py │ │ └── swint.py │ ├── common.py │ ├── language_backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── clip │ │ │ ├── __init__.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ ├── clip.py │ │ │ │ └── model.py │ │ │ └── simple_tokenizer.py │ │ └── word_utils.py │ └── registry.py ├── omdet_v2_turbo │ ├── __init__.py │ ├── block.py │ ├── build_components.py │ ├── config.py │ ├── conv.py │ ├── detector.py │ ├── detr_torch.py │ ├── dn_ops.py │ ├── ela_decoder.py │ ├── ela_encoder.py │ ├── head.py │ ├── infer_model.py │ └── torch_utils.py └── utils │ ├── __init__.py │ ├── analyze_model.py │ ├── box_ops.py │ ├── cache.py │ ├── plots.py │ ├── registry.py │ └── tools.py ├── outputs └── 000000574769.jpg ├── requirements.txt ├── run_demo.py ├── run_wsgi.py └── sample_data ├── 000000574769.jpg └── simsun.ttc /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OmDet-Turbo 2 | 3 |

4 | [Paper 📄] [Model 🗂️] 5 |

6 |

7 | Fast and accurate open-vocabulary end-to-end object detection 8 |

9 | 10 | *** 11 | ## 🗓️ Updates 12 | * 09/26/2024: OmDet-Turbo has been integrated into Transformers version 4.45.0. The code is available [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models/omdet_turbo), and the Hugging Face model is available [here](https://huggingface.co/omlab/omdet-turbo-swin-tiny-hf). 13 | * 07/05/2024: Our new open-source project, [OmAgent: A multimodal agent framework for solving complex tasks](https://github.com/om-ai-lab/OmAgent), is available! OmDet has also been integrated into it as an OVD tool. Feel free to explore our multimodal agent framework. 14 | * 06/24/2024: Added guidance for [converting OmDet-Turbo to ONNX](https://github.com/om-ai-lab/OmDet#:~:text=How%20To%20Export%20ONNX%20Model). 15 | * 03/25/2024: Inference code and a pretrained OmDet-Turbo-Tiny model released. 16 | * 03/12/2024: GitHub open-source project created. 17 | 18 | *** 19 | ## 🔗 Related Works 20 | If you are interested in our research, we welcome you to explore our other projects: 21 | 22 | 🔆 [How to Evaluate the Generalization of Detection? A Benchmark for Comprehensive Open-Vocabulary Detection](https://arxiv.org/abs/2308.13177) (AAAI24)  🏠 [Github Repository](https://github.com/om-ai-lab/OVDEval/tree/main) 23 | 24 | 🔆 [OmDet: Large-scale vision-language multi-dataset pre-training with multimodal detection network](https://ietresearch.onlinelibrary.wiley.com/doi/full/10.1049/cvi2.12268) (IET Computer Vision) 25 | 26 | *** 27 | ## 📖 Introduction 28 | This repository is the official PyTorch implementation of **OmDet-Turbo**, a fast transformer-based open-vocabulary object detection model. 29 | 30 | **⭐️ Highlights** 31 | 1. **OmDet-Turbo** is a transformer-based real-time open-vocabulary 32 | detector that combines strong OVD capabilities with fast inference speed. 33 | It addresses the challenge of efficient detection in open-vocabulary 34 | scenarios while maintaining high detection performance. 35 | 2. We introduce the **Efficient Fusion Head**, a swift multimodal fusion module 36 | designed to alleviate the computational burden on the encoder and reduce 37 | the time consumption of the ROI-based head. 38 | 3. The OmDet-Turbo-Base model achieves state-of-the-art zero-shot performance on the ODinW and OVDEval datasets, with AP scores 39 | of **30.1** and **26.86**, respectively. 40 | 4. The inference speed of OmDet-Turbo-Base on the COCO val2017 dataset reaches **100.2** FPS on an A100 GPU. 41 | 42 | For more details, check out our paper **[Real-time Transformer-based Open-Vocabulary Detection with Efficient Fusion Head](https://arxiv.org/abs/2403.06892)**. 43 | ![model_structure](docs/turbo_model.jpeg) 44 | 45 | 46 | *** 47 | ## ⚡️ Inference Speed 48 | Comparison of inference speeds for each component in the tiny-size model. 49 | ![speed](docs/speed_compare.jpeg) 50 | 51 | *** 52 | ## 🛠️ How To Install 53 | Follow the [Installation Instructions](install.md) to set up the environment for OmDet-Turbo. 54 | 55 | *** 56 | ## 🚀 How To Run 57 | ### Local Inference 58 | 1. Download our pretrained model and the [CLIP](https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt?download=true) checkpoints. 59 | 2. Create a folder named **resources** and put the downloaded models into it. 60 | 3. Run **run_demo.py**; the images with predicted results will be saved in the **./outputs** folder (a minimal usage sketch is shown below).
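For reference, here is a minimal sketch of what local inference looks like when calling the `DetEngine` API from `omdet/inference/det_engine.py` directly; the threshold values below are illustrative, and the exact contents of **run_demo.py** may differ (it additionally draws and saves the annotated images to **./outputs**).

```python
from omdet.inference.det_engine import DetEngine

# Build the engine; configs are read from ./configs and checkpoints from ./resources.
engine = DetEngine(model_dir='resources/', device='cuda', batch_size=1)

labels = ["person", "cat", "orange"]              # open-vocabulary classes to detect
prompt = 'Detect {}.'.format(','.join(labels))    # task prompt, "Detect {}." by convention

results = engine.inf_predict(
    'OmDet-Turbo_tiny_SWIN_T',                    # model id: configs/<id>.yaml and resources/<id>.pth
    data=['./sample_data/000000574769.jpg'],      # local image paths
    task=prompt,
    labels=labels,
    src_type='local',                             # 'url' and 'base64' sources are also supported
    conf_threshold=0.3,
    nms_threshold=0.5,
)

# inf_predict returns one list of detections per input image; each detection holds
# the box corners, a confidence score, and the matched label.
for det in results[0]:
    print(det['label'], det['conf'], det['xmin'], det['ymin'], det['xmax'], det['ymax'])
```

Detections below `conf_threshold` are filtered out inside the engine, and the box coordinates are returned as integers.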
61 | ### Run as an API Server 62 | 1. Download our pretrained model and the [CLIP](https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt?download=true) checkpoints. 63 | 2. Create a folder named **resources** and put the downloaded models into it. 64 | 3. Run **run_wsgi.py**; the API server will start at **http://host_ip:8000/inf_predict**, and you can open **http://host_ip:8000/docs** to try it out. 65 | 66 | Language caching is already enabled when running inference with **run_demo.py**. For more details, please check the **run_demo.py** script. 67 | 68 | 69 | *** 70 | ## ⚙️ How To Export ONNX Model 71 | 1. Replace **OmDetV2Turbo** in **OmDet-Turbo_tiny_SWIN_T.yaml** with **OmDetV2TurboInfer**. 72 | 2. Run **export.py**, and **omdet.onnx** will be exported. 73 | 74 | In the above example, post-processing is not included in the ONNX model, and all input sizes are fixed. You can add post-processing and change the input sizes according to your needs. 75 | 76 | 77 | *** 78 | ## 📦 Model Zoo 79 | The performance on COCO and LVIS is evaluated under the zero-shot setting. 80 | 81 | Model | Backbone | Pre-Train Data | COCO | LVIS | FPS (pytorch/trt) | Weight 82 | -- |--------|-----------------| -- | -- |-------------------| -- 83 | OmDet-Turbo-Tiny| Swin-T | O365,GoldG | 42.5 | 30.3 | 21.5/140.0 | [weight](https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/tree/main) 84 | 85 | *** 86 | ## 📝 Main Results 87 | ![main_result](docs/main_results.png) 88 | 89 | *** 90 | ## Citation 91 | Please consider citing our papers if you use our projects: 92 | 93 | ``` 94 | @article{zhao2024real, 95 | title={Real-time Transformer-based Open-Vocabulary Detection with Efficient Fusion Head}, 96 | author={Zhao, Tiancheng and Liu, Peng and He, Xuan and Zhang, Lu and Lee, Kyusong}, 97 | journal={arXiv preprint arXiv:2403.06892}, 98 | year={2024} 99 | } 100 | ``` 101 | 102 | ``` 103 | @article{zhao2024omdet, 104 | title={OmDet: Large-scale vision-language multi-dataset pre-training with multimodal detection network}, 105 | author={Zhao, Tiancheng and Liu, Peng and Lee, Kyusong}, 106 | journal={IET Computer Vision}, 107 | year={2024}, 108 | publisher={Wiley Online Library} 109 | } 110 | ``` 111 | -------------------------------------------------------------------------------- /add_hf.py: -------------------------------------------------------------------------------- 1 | from omdet.inference.det_engine import DetEngine 2 | from omdet.omdet_v2_turbo.detector import OmDetV2Turbo 3 | 4 | 5 | if __name__ == "__main__": 6 | engine = DetEngine(batch_size=1, device='cuda') 7 | img_paths = ['./sample_data/000000574769.jpg'] # path of images 8 | labels = ["person", "cat", "orange"] # labels to be predicted 9 | prompt = 'Detect {}.'.format(','.join(labels)) # prompt of detection task, use "Detect {}."
as default 10 | 11 | model_id = 'OmDet-Turbo_tiny_SWIN_T' 12 | model, cfg = engine._load_model(model_id) 13 | 14 | # push to hub 15 | model.push_to_hub("nielsr/omde-v2-turbo-tiny-swin-tiny") 16 | 17 | # reload 18 | model = OmDetV2Turbo.from_pretrained("nielsr/omde-v2-turbo-tiny-swin-tiny") -------------------------------------------------------------------------------- /configs/OmDet-Turbo_tiny_SWIN_T.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: OmDetV2Turbo 3 | DEPLOY_MODE: true 4 | SWIN: 5 | OUT_FEATURES: 6 | - 1 7 | - 2 8 | - 3 9 | SIZE: T 10 | USE_CHECKPOINT: false 11 | BACKBONE: 12 | NAME: build_swintransformer_backbone 13 | LANGUAGE_BACKBONE: 14 | MODEL_TYPE: "clip" 15 | LANG_DIM: 512 16 | DEVICE: cuda 17 | FUSE_TYPE: merged_attn 18 | TRANSFORMER_DECODER: ELADecoder 19 | TRANSFORMER_ENCODER: ELAEncoder 20 | HEAD: DINOHead 21 | ELAEncoder: 22 | act: gelu 23 | depth_mult: 1.0 24 | dim_feedforward: 2048 25 | encoder_layer: TransformerLayer 26 | eval_size: null 27 | expansion: 1.0 28 | feat_strides: 29 | - 8 30 | - 16 31 | - 32 32 | hidden_dim: 256 33 | in_channels: 34 | - 192 35 | - 384 36 | - 768 37 | num_encoder_layers: 1 38 | pe_temperature: 10000 39 | use_encoder_idx: 40 | - 2 41 | PIXEL_MEAN: 42 | - 123.675 43 | - 116.28 44 | - 103.53 45 | PIXEL_STD: 46 | - 58.395 47 | - 57.12 48 | - 57.375 49 | ELADecoder: 50 | activation: relu 51 | backbone_feat_channels: 52 | - 256 53 | - 256 54 | - 256 55 | box_noise_scale: 1.0 56 | cls_type: cosine 57 | dim_feedforward: 2048 58 | dropout: 0.0 59 | eps: 0.01 60 | eval_idx: -1 61 | eval_size: null 62 | feat_strides: 63 | - 8 64 | - 16 65 | - 32 66 | hidden_dim: 256 67 | label_noise_ratio: 0.5 68 | learnt_init_query: false 69 | nhead: 8 70 | num_decoder_layers: 6 71 | num_decoder_points: 4 72 | num_denoising: 100 73 | num_levels: 3 74 | num_queries: 900 75 | position_embed_type: sine 76 | WEIGHTS: resources/swin_tiny_patch4_window7_224.pkl 77 | INPUT: 78 | FORMAT: RGB 79 | MAX_SIZE_TEST: 640 80 | MIN_SIZE_TEST: 640 81 | -------------------------------------------------------------------------------- /docs/cvt_grounding_dino-en.md: -------------------------------------------------------------------------------- 1 | # Grounding DINO to TensorRT Conversion 2 | 3 | Given that many people are interested about how to convert Grounding DINO mentioned in our paper to TensorRT, here is a brief introduction to our previous conversion approach. Additionally, while organizing the TRT conversion, we discovered a minor issue with the previous Grounding-DINO-T conversion. The correct FP16 speed after proper conversion should be approximately 27 FPS. 4 | 5 | ## Converting PyTorch Model to ONNX Model 6 | The original Grounding DINO code requires slight modifications to be converted to an ONNX model. However, when converting the ONNX model to a TensorRT model, various errors may occur. To avoid errors during ONNX to TensorRT conversion, some additional changes must be made when converting to the ONNX model. 7 | 8 | - Comment out the statements using checkpoints in the backbone. 9 | - Rewrite the NestedTensor in the code; avoid using the NestedTensor data structure. NestedTensor is mainly concentrated in the visual part. Use Tensor directly instead. 10 | - Rewrite the Joiner class in `backbone.py` as shown in the example below. The rewritten class should inherit from `nn.Module` instead of `nn.Sequential`. 
This might be the key to avoiding issues when converting the ONNX model to a TensorRT model. Some content in the `build_backbone` function can be moved to the rewritten Joiner class. 11 | - Treat the tokenizer as data preprocessing and place it outside the model; the output should be directly passed as input to the model's forward function. 12 | - The special handling in the `nested_tensor_from_tensor_list` function for ONNX conversion needs to be retained. 13 | - Make other necessary changes due to the above modifications. 14 | 15 | ```python 16 | class Joiner(nn.Module): 17 | def __init__(self): 18 | self.backbone = xxxx 19 | self.position_embedding = xxx 20 | 21 | def forward(self): 22 | pass 23 | ``` 24 | 25 | ## Converting ONNX Model to TensorRT Model 26 | The ONNX model converted according to the above suggestions can be smoothly converted to a TensorRT model. 27 | 28 | - It is recommended to use the latest version of TensorRT; it is indeed very fast. 29 | - Fixing the input dimensions can provide certain advantages. The speed tests for Grounding DINO in Omdet are based on fixed input dimensions. 30 | - F32 is almost lossless. When converting to FP16, there is a significant loss of precision, and some layers with substantial losses need extra handling. The speed tests for Grounding DINO in Omdet are based on FP16 models. FP32 is about 25-30% slower than FP16. 31 | -------------------------------------------------------------------------------- /docs/cvt_grounding_dino-zh.md: -------------------------------------------------------------------------------- 1 | # Grounding DINO 转TensorRT 2 | 鉴于不少同学提问想知道我们Paper提到的Grounding DINO的TRT是如何转换,所以在这里简单介绍一下我们之前的转换思路。此外,我们在整理TRT转换时也发现之前的Grounding-DINO-T转换得有点小问题,实际正确转换之后的FP16速度应该为~27FPS。 3 | 4 | ## pytorch模型 转换成 onnx模型 5 | 原始的Grounding DINO代码稍作修改就能转换成onnx模型, 但是转换成onnx模型后再转换成TensorRT模型时,会有各式各样的花式报错。为了避免onnx 转TensorRT时的报错,必须在转onnx模型时做一些额外的改动。 6 | 7 | - 注释掉backbone中使用checkpoint的语句 8 | - 将代码中的 NestedTensor 进行改写,不要使用NestedTensor数据结构。NestedTensor主要集中在视觉部分。直接使用Tensor即可 9 | - 将backbone.py 中的Joiner类改写成下面示例。改写后的类要继承nn.Module, 而不是nn.Sequential类。这可能是避免onnx转TensorRT模型出现问题的关键。build_backbone函数里面的部分内容可以移动到改写后的Joint类中 10 | - 将tokenizer 当成数据预处理放在模型的外面,输出直接作为forward函数的输入传入模型 11 | - nested_tensor_from_tensor_list 函数中针对转onnx做的特殊处理需要保留 12 | - 其他一些因为上述改动导致的必要改动 13 | 14 | ```python 15 | class Joiner(nn.Module): 16 | def __init__(self): 17 | self.backbone = xxxx 18 | self.position_embedding = xxx 19 | 20 | def forward(self): 21 | pass 22 | 23 | ``` 24 | 25 | 26 | ## onnx模型转TensorRT模型 27 | 按照上述建议转出的onnx模型可以流畅的转成TensorRT模型 28 | 29 | - 建议使用最新版本TensorRT, 真的很快 30 | - 固定输入维度,会有一定的优势。Omdet中关于Grounding DINO 的速度测试都是基于固定的输入维度 31 | - F32 几乎无损, 转换FP16的时候精度损失较大,需要对一些损失较大的层进行额外的处理。Omdet中关于Grounding DINO 的速度测试都是基于FP16模型。FP32 比 FP16 慢 25~30%左右 32 | -------------------------------------------------------------------------------- /docs/main_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/docs/main_results.png -------------------------------------------------------------------------------- /docs/speed_compare.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/docs/speed_compare.jpeg -------------------------------------------------------------------------------- /docs/turbo_model.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/docs/turbo_model.jpeg -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | from omdet.inference.det_engine import DetEngine 2 | import torch 3 | 4 | if __name__ == "__main__": 5 | 6 | model_dir = "./resources" 7 | img_tensor = torch.rand(1, 3, 640, 640) # 8 | label_feats = torch.rand(80, 1, 512) # 80 is cls num, 512 is clip dim 9 | task_feats = torch.rand(77, 1, 512) # 77 is task dim 10 | task_mask = torch.rand(1, 77) 11 | 12 | engine = DetEngine(model_dir=model_dir, batch_size=1, device='cpu') 13 | onnx_model_path = "./omdet.onnx" 14 | engine.export_onnx('OmDet-Turbo_tiny_SWIN_T', img_tensor, label_feats, task_feats, task_mask, onnx_model_path) 15 | 16 | -------------------------------------------------------------------------------- /install.md: -------------------------------------------------------------------------------- 1 | # Install 2 | ## Requirements 3 | 4 | * CUDA>=11.8 5 | 6 | * Python>=3.9 7 | 8 | Create Python environments. 9 | ```bash 10 | conda create -n omdet python=3.9 11 | ``` 12 | Activate the environment: 13 | ```bash 14 | conda activate omdet 15 | ``` 16 | 17 | * Pytorch>=2.1.0, Torchvision>=0.16.0 18 | 19 | If your CUDA version is 11.8, you can install Pytorch as following: 20 | ```bash 21 | conda install pytorch==2.1.0 torchvision==0.16.0 pytorch-cuda=11.8 -c pytorch -c nvidia 22 | ``` 23 | 24 | * detectron2>=0.6.0: 25 | 26 | Install detectron2: 27 | ```bash 28 | python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' 29 | ``` 30 | 31 | * Other requirements 32 | ```bash 33 | pip install -r requirements.txt 34 | ``` 35 | -------------------------------------------------------------------------------- /omdet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/__init__.py -------------------------------------------------------------------------------- /omdet/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/inference/__init__.py -------------------------------------------------------------------------------- /omdet/inference/base_engine.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import requests 4 | import io 5 | import base64 6 | from detectron2.data.detection_utils import _apply_exif_orientation, convert_PIL_to_numpy 7 | import numpy as np 8 | 9 | 10 | def get_output_shape(oldh: int, oldw: int, short_edge_length: int, max_size: int): 11 | """ 12 | Compute the output size given input size and target short edge length. 
13 | """ 14 | h, w = oldh, oldw 15 | size = short_edge_length * 1.0 16 | scale = size / min(h, w) 17 | if h < w: 18 | newh, neww = size, scale * w 19 | else: 20 | newh, neww = scale * h, size 21 | if max(newh, neww) > max_size: 22 | scale = max_size * 1.0 / max(newh, neww) 23 | newh = newh * scale 24 | neww = neww * scale 25 | neww = int(neww + 0.5) 26 | newh = int(newh + 0.5) 27 | return (newh, neww) 28 | 29 | 30 | class BaseEngine(object): 31 | def _load_data(self, src_type, cfg, data, return_transform=False): 32 | if src_type == 'local': 33 | image_data = [Image.open(x) for x in data] 34 | 35 | elif src_type == 'url': 36 | image_data = [] 37 | for x in data: 38 | temp = Image.open(io.BytesIO(requests.get(x).content)) 39 | image_data.append(temp) 40 | 41 | elif src_type == "base64": 42 | image_data = [] 43 | for x in data: 44 | temp = Image.open(io.BytesIO(base64.b64decode(x))).convert("RGB") 45 | image_data.append(temp) 46 | 47 | else: 48 | raise Exception("Unknown mode {}.".format(src_type)) 49 | 50 | input_data = [] 51 | transforms = [] 52 | for x in image_data: 53 | width, height = x.size 54 | pil_image = x.resize((cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST), Image.BILINEAR) 55 | image = convert_PIL_to_numpy(pil_image, cfg.INPUT.FORMAT) 56 | 57 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 58 | input_data.append({"image": image, "height": height, "width": width}) 59 | 60 | if return_transform: 61 | return input_data, transforms 62 | else: 63 | return input_data -------------------------------------------------------------------------------- /omdet/inference/det_engine.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from typing import List, Union, Dict 4 | from omdet.utils.tools import chunks 5 | from detectron2.checkpoint import DetectionCheckpointer 6 | from detectron2.config import get_cfg 7 | from detectron2.engine import DefaultTrainer as Trainer 8 | from omdet.utils.cache import LRUCache 9 | from omdet.inference.base_engine import BaseEngine 10 | from detectron2.utils.logger import setup_logger 11 | from omdet.omdet_v2_turbo.config import add_omdet_v2_turbo_config 12 | 13 | 14 | class DetEngine(BaseEngine): 15 | def __init__(self, model_dir='resources/', device='cpu', batch_size=10): 16 | self.model_dir = model_dir 17 | self._models = LRUCache(10) 18 | self.device = device 19 | self.batch_size = batch_size 20 | self.logger = setup_logger(name=__name__) 21 | 22 | def _init_cfg(self, cfg, model_id): 23 | cfg.MODEL.WEIGHTS = os.path.join(self.model_dir, model_id+'.pth') 24 | cfg.MODEL.DEVICE = self.device 25 | cfg.INPUT.MAX_SIZE_TEST = 640 26 | cfg.INPUT.MIN_SIZE_TEST = 640 27 | cfg.MODEL.DEPLOY_MODE = True 28 | cfg.freeze() 29 | return cfg 30 | 31 | def count_parameters(self, model): 32 | return sum(p.numel() for p in model.parameters()) 33 | 34 | def _load_model(self, model_id): 35 | if not self._models.has(model_id): 36 | cfg = get_cfg() 37 | add_omdet_v2_turbo_config(cfg) 38 | cfg.merge_from_file(os.path.join('configs', model_id+'.yaml')) 39 | cfg = self._init_cfg(cfg, model_id) 40 | model = Trainer.build_model(cfg) 41 | self.logger.info("Model:\n{}".format(model)) 42 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 43 | print("Loading a OmDet model {}".format(cfg.MODEL.WEIGHTS)) 44 | model.eval() 45 | model.to(cfg.MODEL.DEVICE) 46 | print("Total parameters: {}".format(self.count_parameters(model))) 47 | self._models.put(model_id, (model, cfg)) 48 | 49 | return 
self._models.get(model_id) 50 | 51 | def inf_predict(self, model_id, 52 | data: List, 53 | task: Union[str, List], 54 | labels: List[str], 55 | src_type: str = 'local', 56 | conf_threshold: float = 0.5, 57 | nms_threshold: float = 0.5 58 | ): 59 | 60 | if len(task) == 0: 61 | raise Exception("Task cannot be empty.") 62 | 63 | model, cfg = self._load_model(model_id) 64 | 65 | resp = [] 66 | flat_labels = labels 67 | 68 | with torch.no_grad(): 69 | for batch in chunks(data, self.batch_size): 70 | batch_image = self._load_data(src_type, cfg, batch) 71 | for img in batch_image: 72 | img['label_set'] = labels 73 | img['tasks'] = task 74 | 75 | batch_y = model(batch_image, score_thresh=conf_threshold, nms_thresh=nms_threshold) 76 | 77 | for z in batch_y: 78 | temp = [] 79 | instances = z['instances'].to('cpu') 80 | instances = instances[instances.scores > conf_threshold] 81 | 82 | for idx, pred in enumerate(zip(instances.pred_boxes, instances.scores, instances.pred_classes)): 83 | (x, y, xx, yy), conf, cls = pred 84 | conf = float(conf) 85 | cls = flat_labels[int(cls)] 86 | 87 | temp.append({'xmin': int(x), 88 | 'ymin': int(y), 89 | 'xmax': int(xx), 90 | 'ymax': int(yy), 91 | 'conf': conf, 92 | 'label': cls}) 93 | resp.append(temp) 94 | 95 | return resp 96 | 97 | def export_onnx(self, model_id, img_tensor, label_feats, task_feats, task_mask, onnx_model_path): 98 | 99 | model, _ = self._load_model(model_id) 100 | model.to("cpu") 101 | model.eval() 102 | inputs = (img_tensor, label_feats, task_feats, task_mask) 103 | 104 | print("start cvt onnx...") 105 | torch.onnx.export(model, # model being run 106 | inputs, # model input (or a tuple for multiple inputs) 107 | onnx_model_path, # where to save the model (can be a file or file-like object) 108 | export_params=True, # store the trained parameter weights inside the model file 109 | opset_version=17, # the ONNX version to export the model to 110 | do_constant_folding=True, # whether to execute constant folding for optimization 111 | input_names=['img_tensor', "label_feats", "task_feats", "task_mask"], 112 | ) -------------------------------------------------------------------------------- /omdet/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/__init__.py -------------------------------------------------------------------------------- /omdet/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from omdet.modeling.backbone import (convnext, swint) -------------------------------------------------------------------------------- /omdet/modeling/backbone/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode as CN 2 | 3 | 4 | def add_backbone_config(cfg): 5 | add_convnext_config(cfg) 6 | add_swint_config(cfg) 7 | 8 | 9 | def add_convnext_config(cfg): 10 | # extra configs for convnext 11 | cfg.MODEL.CONVNEXT = CN() 12 | cfg.MODEL.CONVNEXT.SIZE = "T" 13 | cfg.MODEL.CONVNEXT.DEPTHS= [3, 3, 9, 3] 14 | cfg.MODEL.CONVNEXT.DIMS= [96, 192, 384, 768] 15 | cfg.MODEL.CONVNEXT.DROP_PATH_RATE= 0.2 16 | cfg.MODEL.CONVNEXT.LAYER_SCALE_INIT_VALUE= 1e-6 17 | cfg.MODEL.CONVNEXT.OUT_FEATURES= [0, 1, 2, 3] 18 | cfg.SOLVER.WEIGHT_DECAY_RATE= 0.95 19 | 20 | 21 | def add_swint_config(cfg): 22 | cfg.MODEL.SWIN = CN() 23 | cfg.MODEL.SWIN.SIZE = 'T' # 'T', 'S', 'B' 24 | 
cfg.MODEL.SWIN.USE_CHECKPOINT = False 25 | cfg.MODEL.SWIN.OUT_FEATURES = (0, 1, 2, 3) # FPN stride 8 - 32 26 | 27 | 28 | -------------------------------------------------------------------------------- /omdet/modeling/backbone/convnext.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from timm.models.layers import trunc_normal_, DropPath 6 | from detectron2.modeling.backbone import Backbone 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | from detectron2.modeling.backbone.fpn import FPN 9 | from detectron2.layers import ShapeSpec 10 | 11 | 12 | class Block(nn.Module): 13 | r""" ConvNeXt Block. There are two equivalent implementations: 14 | (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) 15 | (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back 16 | We use (2) as we find it slightly faster in PyTorch 17 | 18 | Args: 19 | dim (int): Number of input channels. 20 | drop_path (float): Stochastic depth rate. Default: 0.0 21 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 22 | """ 23 | 24 | def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): 25 | super().__init__() 26 | self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv 27 | self.norm = LayerNorm(dim, eps=1e-6) 28 | self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers 29 | self.act = nn.GELU() 30 | self.pwconv2 = nn.Linear(4 * dim, dim) 31 | self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), 32 | requires_grad=True) if layer_scale_init_value > 0 else None 33 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 34 | 35 | def forward(self, x): 36 | input = x 37 | x = self.dwconv(x) 38 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 39 | x = self.norm(x) 40 | x = self.pwconv1(x) 41 | x = self.act(x) 42 | x = self.pwconv2(x) 43 | if self.gamma is not None: 44 | x = self.gamma * x 45 | x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 46 | 47 | x = input + self.drop_path(x) 48 | return x 49 | 50 | 51 | class LayerNorm(nn.Module): 52 | r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. 53 | The ordering of the dimensions in the inputs. channels_last corresponds to inputs with 54 | shape (batch_size, height, width, channels) while channels_first corresponds to inputs 55 | with shape (batch_size, channels, height, width). 
56 | """ 57 | 58 | def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): 59 | super().__init__() 60 | self.weight = nn.Parameter(torch.ones(normalized_shape)) 61 | self.bias = nn.Parameter(torch.zeros(normalized_shape)) 62 | self.eps = eps 63 | self.data_format = data_format 64 | if self.data_format not in ["channels_last", "channels_first"]: 65 | raise NotImplementedError 66 | self.normalized_shape = (normalized_shape,) 67 | 68 | def forward(self, x): 69 | if self.data_format == "channels_last": 70 | return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) 71 | elif self.data_format == "channels_first": 72 | u = x.mean(1, keepdim=True) 73 | s = (x - u).pow(2).mean(1, keepdim=True) 74 | x = (x - u) / torch.sqrt(s + self.eps) 75 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 76 | return x 77 | 78 | 79 | class ConvNeXt(Backbone): 80 | r""" ConvNeXt 81 | A PyTorch impl of : `A ConvNet for the 2020s` - 82 | https://arxiv.org/pdf/2201.03545.pdf 83 | Args: 84 | in_chans (int): Number of input image channels. Default: 3 85 | num_classes (int): Number of classes for classification head. Default: 1000 86 | depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] 87 | dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] 88 | drop_path_rate (float): Stochastic depth rate. Default: 0. 89 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 90 | head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. 91 | out_features (tuple(int)): Stage numbers of the outputs given to the Neck. 92 | """ 93 | 94 | def __init__(self, in_chans=3, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], 95 | drop_path_rate=0., layer_scale_init_value=1e-6, out_features=None): 96 | super().__init__() 97 | 98 | self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers 99 | stem = nn.Sequential( 100 | nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), 101 | LayerNorm(dims[0], eps=1e-6, data_format="channels_first") 102 | ) 103 | 104 | self.downsample_layers.append(stem) 105 | for i in range(3): 106 | downsample_layer = nn.Sequential( 107 | LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), 108 | nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2), 109 | ) 110 | self.downsample_layers.append(downsample_layer) 111 | 112 | self.num_layers = len(depths) 113 | num_features = [int(dims[i] * 2 ** i) for i in range(self.num_layers)] 114 | self.num_features = num_features 115 | self._out_features = out_features 116 | 117 | self._out_feature_strides = {} 118 | self._out_feature_channels = {} 119 | 120 | self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks 121 | dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] 122 | cur = 0 123 | strides = [4, 4, 4, 4] 124 | for i in range(4): 125 | stage = nn.Sequential( 126 | *[Block(dim=dims[i], drop_path=dp_rates[cur + j], 127 | layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] 128 | ) 129 | self.stages.append(stage) 130 | cur += depths[i] 131 | 132 | self._out_feature_channels[i] = dims[i] 133 | self._out_feature_strides[i] = strides[i] * 2 ** i 134 | 135 | norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first") 136 | for i_layer in range(4): 137 | layer = norm_layer(dims[i_layer]) 138 | layer_name = f'norm{i_layer}' 139 | self.add_module(layer_name, layer) 140 | 141 | 
self.apply(self._init_weights) 142 | 143 | def _init_weights(self, m): 144 | if isinstance(m, (nn.Conv2d, nn.Linear)): 145 | trunc_normal_(m.weight, std=.02) 146 | nn.init.constant_(m.bias, 0) 147 | 148 | def init_weights(self, pretrained=None): 149 | """Initialize the weights in backbone. 150 | Args: 151 | pretrained (str, optional): Path to pre-trained weights. 152 | Defaults to None. 153 | """ 154 | 155 | def _init_weights(m): 156 | if isinstance(m, nn.Linear): 157 | trunc_normal_(m.weight, std=.02) 158 | if isinstance(m, nn.Linear) and m.bias is not None: 159 | nn.init.constant_(m.bias, 0) 160 | elif isinstance(m, nn.LayerNorm): 161 | nn.init.constant_(m.bias, 0) 162 | nn.init.constant_(m.weight, 1.0) 163 | 164 | self.apply(_init_weights) 165 | 166 | def forward_features(self, x): 167 | outs = {} 168 | for i in range(4): 169 | x = self.downsample_layers[i](x) 170 | x = self.stages[i](x) 171 | if i in self._out_features: 172 | norm_layer = getattr(self, f'norm{i}') 173 | x_out = norm_layer(x) 174 | out = x_out.contiguous() 175 | stage_name = i 176 | outs[stage_name] = out 177 | 178 | return outs # {"stage%d" % (i+2,): out for i, out in enumerate(outs)} #tuple(outs) 179 | 180 | def forward(self, x): 181 | x = self.forward_features(x) 182 | return x 183 | 184 | 185 | model_urls = { 186 | "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", 187 | "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth", 188 | "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth", 189 | "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth", 190 | "convnext_tiny_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth", 191 | "convnext_small_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth", 192 | "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", 193 | "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", 194 | "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", 195 | } 196 | 197 | size2config = { 198 | "N": { 199 | "DEPTHS": [2, 2, 8, 2], 200 | "DIMS": [80, 160, 320, 640] 201 | }, 202 | 'T': { 203 | "DEPTHS": [3, 3, 9, 3], 204 | "DIMS": [96, 192, 384, 768] 205 | }, 206 | 'S': { 207 | "DEPTHS": [3, 3, 27, 3], 208 | "DIMS": [96, 192, 384, 768] 209 | }, 210 | 'B': { 211 | "DEPTHS": [3, 3, 27, 3], 212 | "DIMS": [128, 256, 512, 1024] 213 | }, 214 | 'L': { 215 | "DEPTHS": [3, 3, 27, 3], 216 | "DIMS": [192, 384, 768, 1536] 217 | }, 218 | 'XL': { 219 | "DEPTHS": [3, 3, 27, 3], 220 | "DIMS": [256, 512, 1024, 2048] 221 | } 222 | } 223 | 224 | 225 | @BACKBONE_REGISTRY.register() 226 | def build_convnext_backbone(cfg, input_shape): 227 | """ 228 | Create a ConvNeXt instance from config. 229 | 230 | Returns: 231 | VoVNet: a :class:`VoVNet` instance. 
232 | """ 233 | size = cfg.MODEL.CONVNEXT.SIZE 234 | if size in size2config: 235 | depth = size2config[size]['DEPTHS'] 236 | dims = size2config[size]['DIMS'] 237 | else: 238 | depth = cfg.MODEL.CONVNEXT.DEPTHS 239 | dims = cfg.MODEL.CONVNEXT.DIMS 240 | 241 | return ConvNeXt( 242 | in_chans=input_shape.channels, 243 | depths=depth, 244 | dims=dims, 245 | drop_path_rate=cfg.MODEL.CONVNEXT.DROP_PATH_RATE, 246 | layer_scale_init_value=cfg.MODEL.CONVNEXT.LAYER_SCALE_INIT_VALUE, 247 | out_features=cfg.MODEL.CONVNEXT.OUT_FEATURES 248 | ) 249 | 250 | 251 | @BACKBONE_REGISTRY.register() 252 | def build_convnext_fpn_backbone(cfg, input_shape: ShapeSpec): 253 | """ 254 | Args: 255 | cfg: a detectron2 CfgNode 256 | 257 | Returns: 258 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 259 | """ 260 | bottom_up = build_convnext_backbone(cfg, input_shape) 261 | in_features = cfg.MODEL.FPN.IN_FEATURES 262 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 263 | backbone = FPN( 264 | bottom_up=bottom_up, 265 | in_features=in_features, 266 | out_channels=out_channels, 267 | norm=cfg.MODEL.FPN.NORM, 268 | top_block=None, 269 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 270 | ) 271 | return backbone 272 | -------------------------------------------------------------------------------- /omdet/modeling/common.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn, Tensor 4 | import copy 5 | import torch.nn.functional as F 6 | 7 | 8 | class PositionalEncoding(nn.Module): 9 | 10 | def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): 11 | super().__init__() 12 | self.dropout = nn.Dropout(p=dropout) 13 | 14 | position = torch.arange(max_len).unsqueeze(1) 15 | div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)) 16 | pe = torch.zeros(max_len, 1, d_model) 17 | pe[:, 0, 0::2] = torch.sin(position * div_term) 18 | pe[:, 0, 1::2] = torch.cos(position * div_term) 19 | self.register_buffer('pe', pe) 20 | 21 | def forward(self, x: Tensor) -> Tensor: 22 | """ 23 | Args: 24 | x: Tensor, shape [seq_len, batch_size, embedding_dim] 25 | """ 26 | x = x + self.pe[:x.size(0)] 27 | return self.dropout(x) 28 | 29 | 30 | class AbsPositionalEncoding(nn.Module): 31 | 32 | def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): 33 | super().__init__() 34 | self.dropout = nn.Dropout(p=dropout) 35 | self.pe = nn.Embedding(max_len, d_model) 36 | 37 | def forward(self, x: Tensor) -> Tensor: 38 | """ 39 | Args: 40 | x: Tensor, shape [seq_len, batch_size, embedding_dim] 41 | """ 42 | seq_len = x.size(0) 43 | position = torch.arange(seq_len, device=x.device).unsqueeze(1) 44 | pos_emb = self.pe(position) 45 | x = x + pos_emb 46 | return self.dropout(x) 47 | 48 | 49 | class ResMultiHeadAttention(nn.Module): 50 | def __init__(self, d_q, d_k, d_v, nhead, dropout): 51 | super().__init__() 52 | self.self_attn = nn.MultiheadAttention(d_q, nhead, dropout=dropout, kdim=d_k, vdim=d_v) 53 | self.norm1 = nn.LayerNorm(d_q) 54 | self.dropout = nn.Dropout(dropout) 55 | 56 | def forward(self, q, k=None, v=None, attn_mask=None): 57 | """ 58 | """ 59 | if k is None: 60 | k = q 61 | 62 | if v is None: 63 | v = q 64 | 65 | q1 = self.self_attn(query=q, key=k, value=v, attn_mask=attn_mask)[0] 66 | q = q + self.dropout(q1) 67 | q = self.norm1(q) 68 | return q 69 | 70 | 71 | class DistilMLP(nn.Module): 72 | def __init__(self, input_size, output_size, dropout=0.1): 73 | super(DistilMLP, 
self).__init__() 74 | self.squash = nn.GELU() 75 | self.LayerNorm = nn.LayerNorm(input_size, eps=1e-12) 76 | self.intermediate = nn.Linear(input_size, input_size) 77 | self.dropout = nn.Dropout(dropout) 78 | self.dense = nn.Linear(input_size, output_size) 79 | 80 | def forward(self, word_emb): 81 | word_emb = self.squash(word_emb) 82 | word_emb = self.LayerNorm(word_emb) 83 | word_emb = self.dropout(word_emb) 84 | word_emb = self.dense(word_emb) 85 | return word_emb 86 | 87 | 88 | class ResidualLayer(nn.Module): 89 | """ 90 | A residual connection followed by a layer norm. 91 | """ 92 | def __init__(self, size, dropout): 93 | super(ResidualLayer, self).__init__() 94 | self.norm1 = nn.LayerNorm(size) 95 | self.dropout = nn.Dropout(dropout) 96 | 97 | def forward(self, x, y): 98 | "Apply residual connection to any sublayer with the same size." 99 | return self.norm1(x + self.dropout(y)) 100 | 101 | 102 | class ResidualMLP(nn.Module): 103 | def __init__(self, d_m, dropout, d_hidden=1024, activation='relu'): 104 | super(ResidualMLP, self).__init__() 105 | self.mlp = MLP(d_m, d_m, d_hidden, dropout, activation) 106 | self.res1 = ResidualLayer(d_m, dropout) 107 | 108 | def forward(self, x): 109 | mlp_out = self.mlp(x) 110 | x = self.res1(x, mlp_out) 111 | return x 112 | 113 | 114 | class MLP(nn.Module): 115 | def __init__(self, d_input, d_output, d_hidden=1024, dropout=0.1, activation='relu'): 116 | super(MLP, self).__init__() 117 | self.linear1 = nn.Linear(d_input, d_hidden) 118 | self.activation = _get_activation_fn(activation) 119 | self.dropout = nn.Dropout(dropout) 120 | self.linear2 = nn.Linear(d_hidden, d_output) 121 | 122 | def forward(self, x): 123 | return self.linear2(self.dropout(self.activation(self.linear1(x)))) 124 | 125 | 126 | def apply_deltas(deltas, boxes, bbox_weights, scale_clamp): 127 | """ 128 | Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`. 129 | 130 | Args: 131 | deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1. 132 | deltas[i] represents k potentially different class-specific 133 | box transformations for the single box boxes[i]. 
134 | boxes (Tensor): boxes to transform, of shape (N, 4) 135 | """ 136 | boxes = boxes.to(deltas.dtype) 137 | 138 | widths = boxes[:, 2] - boxes[:, 0] 139 | heights = boxes[:, 3] - boxes[:, 1] 140 | ctr_x = boxes[:, 0] + 0.5 * widths 141 | ctr_y = boxes[:, 1] + 0.5 * heights 142 | 143 | wx, wy, ww, wh = bbox_weights 144 | dx = deltas[:, 0::4] / wx 145 | dy = deltas[:, 1::4] / wy 146 | dw = deltas[:, 2::4] / ww 147 | dh = deltas[:, 3::4] / wh 148 | 149 | # Prevent sending too large values into torch.exp() 150 | dw = torch.clamp(dw, max=scale_clamp) 151 | dh = torch.clamp(dh, max=scale_clamp) 152 | 153 | pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] 154 | pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] 155 | pred_w = torch.exp(dw) * widths[:, None] 156 | pred_h = torch.exp(dh) * heights[:, None] 157 | 158 | pred_boxes = torch.zeros_like(deltas) 159 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1 160 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1 161 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2 162 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2 163 | 164 | return pred_boxes 165 | 166 | 167 | def _get_clones(module, N): 168 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 169 | 170 | 171 | def _get_activation_fn(activation): 172 | """Return an activation function given a string""" 173 | if activation == "relu": 174 | return F.relu 175 | if activation == "gelu": 176 | return F.gelu 177 | if activation == "glu": 178 | return F.glu 179 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 180 | 181 | 182 | def _norm(f, dim=-1): 183 | return f / f.norm(dim=dim, keepdim=True).clamp_min(1e-12) 184 | 185 | 186 | def _b_cosine(a, b, logit_scale): 187 | """ 188 | a: B x K x H 189 | b: B x H x K 190 | """ 191 | a = _norm(a, dim=2) 192 | b = _norm(b, dim=1) 193 | # Calculating the Loss 194 | logit_scale = logit_scale.exp() 195 | logits_per_image = logit_scale * torch.bmm(a, b) 196 | return logits_per_image 197 | 198 | def _cosine(a, b, logit_scale): 199 | """ 200 | a: ?/1 x K x H 201 | b: ?/1 x H x 1 202 | """ 203 | a = _norm(a, dim=2) 204 | b = _norm(b, dim=1) 205 | # Calculating the Loss 206 | logit_scale = logit_scale.exp() 207 | logits_per_image = logit_scale * torch.matmul(a, b) 208 | return logits_per_image -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_language_backbone 2 | #from .build import build_tokenizer 3 | 4 | # from .hfpt_tokenizer import HFPTTokenizer 5 | # from .simple_tokenizer import SimpleTokenizer 6 | -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/backbone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from omdet.modeling import registry 3 | from omdet.modeling.language_backbone.clip.models import clip as clip 4 | 5 | 6 | @registry.LANGUAGE_BACKBONES.register("clip") 7 | def build_clip_backbone(cfg): 8 | model, _ = clip.load("resources/ViT-B-16.pt", device=torch.device(cfg.MODEL.DEVICE), jit=False) 9 | model.visual = None # delete the vision part 10 | model.logit_scale = None 11 | return model 12 | 13 | 14 | def build_language_backbone(cfg): 15 | print ("cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE", cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE) 16 | assert cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE in 
registry.LANGUAGE_BACKBONES, \ 17 | "cfg.MODEL.LANGUAGE_BACKBONE.TYPE: {} is not registered in registry".format( 18 | cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 19 | ) 20 | return registry.LANGUAGE_BACKBONES[cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE](cfg) 21 | 22 | 23 | if __name__ == "__main__": 24 | a = build_clip_backbone('') 25 | print(a) -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/clip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/language_backbone/clip/__init__.py -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/clip/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/language_backbone/clip/models/__init__.py -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/clip/models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/language_backbone/clip/models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/clip/models/clip.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import urllib 4 | import warnings 5 | from typing import Union, List 6 | 7 | import torch 8 | from PIL import Image 9 | from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize 10 | from tqdm import tqdm 11 | 12 | from omdet.modeling.language_backbone.clip.models.model import build_model 13 | from omdet.modeling.language_backbone.clip.simple_tokenizer import SimpleTokenizer as _Tokenizer 14 | 15 | try: 16 | from torchvision.transforms import InterpolationMode 17 | BICUBIC = InterpolationMode.BICUBIC 18 | except ImportError: 19 | BICUBIC = Image.BICUBIC 20 | 21 | 22 | __all__ = ["available_models", "load", "tokenize"] 23 | _tokenizer = _Tokenizer() 24 | 25 | 26 | _MODELS = { 27 | "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", 28 | "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt", 29 | "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt", 30 | "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt", 31 | "ViT-B-32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", 32 | "ViT-B-16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt", 33 | } 34 | 35 | 36 | def _download(url: str, root: str): 37 | os.makedirs(root, exist_ok=True) 38 | filename = os.path.basename(url) 39 | 40 | expected_sha256 = url.split("/")[-2] 41 | download_target = os.path.join(root, filename) 42 | 43 | if os.path.exists(download_target) and 
not os.path.isfile(download_target): 44 | raise RuntimeError(f"{download_target} exists and is not a regular file") 45 | 46 | if os.path.isfile(download_target): 47 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256: 48 | return download_target 49 | else: 50 | warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") 51 | 52 | with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: 53 | with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop: 54 | while True: 55 | buffer = source.read(8192) 56 | if not buffer: 57 | break 58 | 59 | output.write(buffer) 60 | loop.update(len(buffer)) 61 | 62 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256: 63 | raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match") 64 | 65 | return download_target 66 | 67 | 68 | def _transform(n_px): 69 | return Compose([ 70 | Resize(n_px, interpolation=BICUBIC), 71 | CenterCrop(n_px), 72 | lambda image: image.convert("RGB"), 73 | ToTensor(), 74 | Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 75 | ]) 76 | 77 | 78 | def available_models() -> List[str]: 79 | """Returns the names of available CLIP rclip""" 80 | return list(_MODELS.keys()) 81 | 82 | 83 | def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", 84 | jit: bool = False, download_root: str = None): 85 | """Load a CLIP model 86 | 87 | Parameters 88 | ---------- 89 | name : str 90 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 91 | 92 | device : Union[str, torch.device] 93 | The device to put the loaded model 94 | 95 | jit : bool 96 | Whether to load the optimized JIT model or more hackable non-JIT model (default). 97 | 98 | download_root: str 99 | path to download the model files; by default, it uses "~/.cache/clip" 100 | 101 | Returns 102 | ------- 103 | model : torch.nn.Module 104 | The CLIP model 105 | 106 | preprocess : Callable[[PIL.Image], torch.Tensor] 107 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 108 | """ 109 | if name in _MODELS: 110 | model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip")) 111 | elif os.path.isfile(name): 112 | model_path = name 113 | else: 114 | raise RuntimeError(f"Model {name} not found; available rclip = {available_models()}") 115 | 116 | try: 117 | # loading JIT archive 118 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 119 | state_dict = None 120 | except RuntimeError: 121 | # loading saved state dict 122 | if jit: 123 | warnings.warn(f"File {model_path} is not a JIT archive. 
Loading as a state dict instead") 124 | jit = False 125 | state_dict = torch.load(model_path, map_location="cpu") 126 | 127 | if not jit: 128 | model = build_model(state_dict or model.state_dict()).to(device) 129 | if str(device) == "cpu": 130 | model.float() 131 | return model, _transform(model.visual.input_resolution) 132 | 133 | # patch the device names 134 | device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) 135 | device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] 136 | 137 | def patch_device(module): 138 | try: 139 | graphs = [module.graph] if hasattr(module, "graph") else [] 140 | except RuntimeError: 141 | graphs = [] 142 | 143 | if hasattr(module, "forward1"): 144 | graphs.append(module.forward1.graph) 145 | 146 | for graph in graphs: 147 | for node in graph.findAllNodes("prim::Constant"): 148 | if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): 149 | node.copyAttributes(device_node) 150 | 151 | model.apply(patch_device) 152 | patch_device(model.encode_image) 153 | patch_device(model.encode_text) 154 | 155 | # patch dtype to float32 on CPU 156 | if str(device) == "cpu": 157 | float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) 158 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 159 | float_node = float_input.node() 160 | 161 | def patch_float(module): 162 | try: 163 | graphs = [module.graph] if hasattr(module, "graph") else [] 164 | except RuntimeError: 165 | graphs = [] 166 | 167 | if hasattr(module, "forward1"): 168 | graphs.append(module.forward1.graph) 169 | 170 | for graph in graphs: 171 | for node in graph.findAllNodes("aten::to"): 172 | inputs = list(node.inputs()) 173 | for i in [1, 2]: # dtype can be the second or third argument to aten::to() 174 | if inputs[i].node()["value"] == 5: 175 | inputs[i].node().copyAttributes(float_node) 176 | 177 | model.apply(patch_float) 178 | patch_float(model.encode_image) 179 | patch_float(model.encode_text) 180 | 181 | model.float() 182 | 183 | return model, _transform(model.input_resolution.item()) 184 | 185 | 186 | def tokenize(texts: Union[str, List[str]], context_length: int = 77, 187 | truncate: bool = False) -> torch.LongTensor: 188 | """ 189 | Returns the tokenized representation of given input string(s) 190 | 191 | Parameters 192 | ---------- 193 | texts : Union[str, List[str]] 194 | An input string or a list of input strings to tokenize 195 | 196 | context_length : int 197 | The context length to use; all CLIP rclip use 77 as the context length 198 | 199 | truncate: bool 200 | Whether to truncate the text in case its encoding is longer than the context length 201 | 202 | Returns 203 | ------- 204 | A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] 205 | """ 206 | if isinstance(texts, str): 207 | texts = [texts] 208 | 209 | sot_token = _tokenizer.encoder["<|startoftext|>"] 210 | eot_token = _tokenizer.encoder["<|endoftext|>"] 211 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] 212 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 213 | 214 | for i, tokens in enumerate(all_tokens): 215 | if len(tokens) > context_length: 216 | if truncate: 217 | tokens = tokens[:context_length] 218 | tokens[-1] = eot_token 219 | else: 220 | raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") 221 | result[i, 
:len(tokens)] = torch.tensor(tokens) 222 | 223 | return result 224 | -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "models/bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a significant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'</w>' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 79 | 80 | def bpe(self, token): 81 | if token in self.cache: 82 | return self.cache[token] 83 | word = tuple(token[:-1]) + ( token[-1] + '</w>',) 84 | pairs = get_pairs(word) 85 | 86 | if not pairs: 87 | return token+'</w>' 88 | 89 | while True: 90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 91 | if bigram not in self.bpe_ranks: 92 | break 93 | first, second = bigram 94 | new_word = [] 95 | i = 0 96 | while i <
len(word): 97 | try: 98 | j = word.index(first, i) 99 | new_word.extend(word[i:j]) 100 | i = j 101 | except: 102 | new_word.extend(word[i:]) 103 | break 104 | 105 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 106 | new_word.append(first+second) 107 | i += 2 108 | else: 109 | new_word.append(word[i]) 110 | i += 1 111 | new_word = tuple(new_word) 112 | word = new_word 113 | if len(word) == 1: 114 | break 115 | else: 116 | pairs = get_pairs(word) 117 | word = ' '.join(word) 118 | self.cache[token] = word 119 | return word 120 | 121 | def encode(self, text): 122 | bpe_tokens = [] 123 | text = whitespace_clean(basic_clean(text)).lower() 124 | for token in re.findall(self.pat, text): 125 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 127 | return bpe_tokens 128 | 129 | def decode(self, tokens): 130 | text = ''.join([self.decoder[token] for token in tokens]) 131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ') 132 | return text 133 | -------------------------------------------------------------------------------- /omdet/modeling/language_backbone/word_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language-related data loading helper functions and class wrappers. 3 | """ 4 | 5 | import re 6 | import torch 7 | import codecs 8 | 9 | UNK_TOKEN = '' 10 | PAD_TOKEN = '' 11 | END_TOKEN = '' 12 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 13 | 14 | 15 | class Dictionary(object): 16 | def __init__(self): 17 | self.word2idx = {} 18 | self.idx2word = [] 19 | 20 | def add_word(self, word): 21 | if word not in self.word2idx: 22 | self.idx2word.append(word) 23 | self.word2idx[word] = len(self.idx2word) - 1 24 | return self.word2idx[word] 25 | 26 | def __len__(self): 27 | return len(self.idx2word) 28 | 29 | def __getitem__(self, a): 30 | if isinstance(a, int): 31 | return self.idx2word[a] 32 | elif isinstance(a, list): 33 | return [self.idx2word[x] for x in a] 34 | elif isinstance(a, str): 35 | return self.word2idx[a] 36 | else: 37 | raise TypeError("Query word/index argument must be int or str") 38 | 39 | def __contains__(self, word): 40 | return word in self.word2idx 41 | 42 | 43 | class Corpus(object): 44 | def __init__(self): 45 | self.dictionary = Dictionary() 46 | 47 | def set_max_len(self, value): 48 | self.max_len = value 49 | 50 | def load_file(self, filename): 51 | with codecs.open(filename, 'r', 'utf-8') as f: 52 | for line in f: 53 | line = line.strip() 54 | self.add_to_corpus(line) 55 | self.dictionary.add_word(UNK_TOKEN) 56 | self.dictionary.add_word(PAD_TOKEN) 57 | 58 | def add_to_corpus(self, line): 59 | """Tokenizes a text line.""" 60 | # Add words to the dictionary 61 | words = line.split() 62 | # tokens = len(words) 63 | for word in words: 64 | word = word.lower() 65 | self.dictionary.add_word(word) 66 | 67 | def tokenize(self, line, max_len=20): 68 | # Tokenize line contents 69 | words = SENTENCE_SPLIT_REGEX.split(line.strip()) 70 | # words = [w.lower() for w in words if len(w) > 0] 71 | words = [w.lower() for w in words if (len(w) > 0 and w != ' ')] ## do not include space as a token 72 | 73 | if words[-1] == '.': 74 | words = words[:-1] 75 | 76 | if max_len > 0: 77 | if len(words) > max_len: 78 | words = words[:max_len] 79 | elif len(words) < max_len: 80 | # words = [PAD_TOKEN] * (max_len - len(words)) + words 81 | words = words +
[END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1) 82 | 83 | tokens = len(words) ## for end token 84 | ids = torch.LongTensor(tokens) 85 | token = 0 86 | for word in words: 87 | if word not in self.dictionary: 88 | word = UNK_TOKEN 89 | if type(word) != type('a'): 90 | print(word, type(word), word.encode('ascii', 'ignore').decode('ascii'), 91 | type(word.encode('ascii', 'ignore').decode('ascii'))) 92 | word = word.encode('ascii', 'ignore').decode('ascii') 93 | ids[token] = self.dictionary[word] 94 | token += 1 95 | # ids[token] = self.dictionary[END_TOKEN] 96 | return ids 97 | 98 | def __len__(self): 99 | return len(self.dictionary) 100 | -------------------------------------------------------------------------------- /omdet/modeling/registry.py: -------------------------------------------------------------------------------- 1 | from omdet.utils.registry import Registry 2 | 3 | LANGUAGE_BACKBONES = Registry() 4 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import add_omdet_v2_turbo_config 2 | from .detector import OmDetV2Turbo 3 | from .ela_encoder import ELAEncoder 4 | from .ela_decoder import ELADecoder 5 | from .head import DINOHead 6 | from .infer_model import OmDetV2TurboInfer 7 | 8 | 9 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/block.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .conv import Conv, DWConv, GhostConv, LightConv, RepConv 6 | 7 | __all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost', 8 | 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3') 9 | 10 | 11 | class DFL(nn.Module): 12 | """ 13 | Integral module of Distribution Focal Loss (DFL). 14 | Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391 15 | """ 16 | 17 | def __init__(self, c1=16): 18 | """Initialize a convolutional layer with a given number of input channels.""" 19 | super().__init__() 20 | self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False) 21 | x = torch.arange(c1, dtype=torch.float) 22 | self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1)) 23 | self.c1 = c1 24 | 25 | def forward(self, x): 26 | """Applies a transformer layer on input tensor 'x' and returns a tensor.""" 27 | b, c, a = x.shape # batch, channels, anchors 28 | return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a) 29 | # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a) 30 | 31 | 32 | class Proto(nn.Module): 33 | """YOLOv8 mask Proto module for segmentation models.""" 34 | 35 | def __init__(self, c1, c_=256, c2=32): # ch_in, number of protos, number of masks 36 | super().__init__() 37 | self.cv1 = Conv(c1, c_, k=3) 38 | self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True) # nn.Upsample(scale_factor=2, mode='nearest') 39 | self.cv2 = Conv(c_, c_, k=3) 40 | self.cv3 = Conv(c_, c2) 41 | 42 | def forward(self, x): 43 | """Performs a forward pass through layers using an upsampled input image.""" 44 | return self.cv3(self.cv2(self.upsample(self.cv1(x)))) 45 | 46 | 47 | class HGStem(nn.Module): 48 | """StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d. 
49 | https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py 50 | """ 51 | 52 | def __init__(self, c1, cm, c2): 53 | super().__init__() 54 | self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU()) 55 | self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU()) 56 | self.stem2b = Conv(cm // 2, cm, 2, 1, 0, act=nn.ReLU()) 57 | self.stem3 = Conv(cm * 2, cm, 3, 2, act=nn.ReLU()) 58 | self.stem4 = Conv(cm, c2, 1, 1, act=nn.ReLU()) 59 | self.pool = nn.MaxPool2d(kernel_size=2, stride=1, padding=0, ceil_mode=True) 60 | 61 | def forward(self, x): 62 | """Forward pass of a PPHGNetV2 backbone layer.""" 63 | x = self.stem1(x) 64 | x = F.pad(x, [0, 1, 0, 1]) 65 | x2 = self.stem2a(x) 66 | x2 = F.pad(x2, [0, 1, 0, 1]) 67 | x2 = self.stem2b(x2) 68 | x1 = self.pool(x) 69 | x = torch.cat([x1, x2], dim=1) 70 | x = self.stem3(x) 71 | x = self.stem4(x) 72 | return x 73 | 74 | 75 | class HGBlock(nn.Module): 76 | """HG_Block of PPHGNetV2 with 2 convolutions and LightConv. 77 | https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py 78 | """ 79 | 80 | def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()): 81 | super().__init__() 82 | block = LightConv if lightconv else Conv 83 | self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n)) 84 | self.sc = Conv(c1 + n * cm, c2 // 2, 1, 1, act=act) # squeeze conv 85 | self.ec = Conv(c2 // 2, c2, 1, 1, act=act) # excitation conv 86 | self.add = shortcut and c1 == c2 87 | 88 | def forward(self, x): 89 | """Forward pass of a PPHGNetV2 backbone layer.""" 90 | y = [x] 91 | y.extend(m(y[-1]) for m in self.m) 92 | y = self.ec(self.sc(torch.cat(y, 1))) 93 | return y + x if self.add else y 94 | 95 | 96 | class SPP(nn.Module): 97 | """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729.""" 98 | 99 | def __init__(self, c1, c2, k=(5, 9, 13)): 100 | """Initialize the SPP layer with input/output channels and pooling kernel sizes.""" 101 | super().__init__() 102 | c_ = c1 // 2 # hidden channels 103 | self.cv1 = Conv(c1, c_, 1, 1) 104 | self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1) 105 | self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]) 106 | 107 | def forward(self, x): 108 | """Forward pass of the SPP layer, performing spatial pyramid pooling.""" 109 | x = self.cv1(x) 110 | return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1)) 111 | 112 | 113 | class SPPF(nn.Module): 114 | """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.""" 115 | 116 | def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13)) 117 | super().__init__() 118 | c_ = c1 // 2 # hidden channels 119 | self.cv1 = Conv(c1, c_, 1, 1) 120 | self.cv2 = Conv(c_ * 4, c2, 1, 1) 121 | self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) 122 | 123 | def forward(self, x): 124 | """Forward pass through Ghost Convolution block.""" 125 | x = self.cv1(x) 126 | y1 = self.m(x) 127 | y2 = self.m(y1) 128 | return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1)) 129 | 130 | 131 | class C1(nn.Module): 132 | """CSP Bottleneck with 1 convolution.""" 133 | 134 | def __init__(self, c1, c2, n=1): # ch_in, ch_out, number 135 | super().__init__() 136 | self.cv1 = Conv(c1, c2, 1, 1) 137 | self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n))) 138 | 139 | def forward(self, x): 140 | """Applies cross-convolutions to input in the C3 module.""" 141 | y = self.cv1(x) 142 | return self.m(y) + y 143 | 144 | 145 | 
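# Most of the CSP-style blocks in this file (C1, C2, C2f, C3, SPP, SPPF, Bottleneck) keep the
# spatial size of the feature map and only change the channel count; HGStem and Proto are the
# exceptions (they down- and up-sample respectively). A quick shape check, kept as a comment so
# it does not run at import time; the channel sizes and the 80x80 map are illustrative
# assumptions, not values taken from the OmDet-Turbo configs:
#
#     import torch
#     from omdet.omdet_v2_turbo.block import C1, C2f, SPPF
#
#     x = torch.randn(2, 64, 80, 80)                          # (batch, c1, H, W)
#     assert C1(64, 128)(x).shape == (2, 128, 80, 80)         # single-conv CSP bottleneck
#     assert C2f(64, 128, n=2)(x).shape == (2, 128, 80, 80)   # split/concat CSP variant
#     assert SPPF(64, 128, k=5)(x).shape == (2, 128, 80, 80)  # fast spatial pyramid pooling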
class C2(nn.Module): 146 | """CSP Bottleneck with 2 convolutions.""" 147 | 148 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 149 | super().__init__() 150 | self.c = int(c2 * e) # hidden channels 151 | self.cv1 = Conv(c1, 2 * self.c, 1, 1) 152 | self.cv2 = Conv(2 * self.c, c2, 1) # optional act=FReLU(c2) 153 | # self.attention = ChannelAttention(2 * self.c) # or SpatialAttention() 154 | self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))) 155 | 156 | def forward(self, x): 157 | """Forward pass through the CSP bottleneck with 2 convolutions.""" 158 | a, b = self.cv1(x).chunk(2, 1) 159 | return self.cv2(torch.cat((self.m(a), b), 1)) 160 | 161 | 162 | class C2f(nn.Module): 163 | """CSP Bottleneck with 2 convolutions.""" 164 | 165 | def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 166 | super().__init__() 167 | self.c = int(c2 * e) # hidden channels 168 | self.cv1 = Conv(c1, 2 * self.c, 1, 1) 169 | self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) 170 | self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) 171 | 172 | def forward(self, x): 173 | """Forward pass through C2f layer.""" 174 | y = list(self.cv1(x).chunk(2, 1)) 175 | y.extend(m(y[-1]) for m in self.m) 176 | return self.cv2(torch.cat(y, 1)) 177 | 178 | def forward_split(self, x): 179 | """Forward pass using split() instead of chunk().""" 180 | y = list(self.cv1(x).split((self.c, self.c), 1)) 181 | y.extend(m(y[-1]) for m in self.m) 182 | return self.cv2(torch.cat(y, 1)) 183 | 184 | 185 | class C3(nn.Module): 186 | """CSP Bottleneck with 3 convolutions.""" 187 | 188 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 189 | super().__init__() 190 | c_ = int(c2 * e) # hidden channels 191 | self.cv1 = Conv(c1, c_, 1, 1) 192 | self.cv2 = Conv(c1, c_, 1, 1) 193 | self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2) 194 | self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n))) 195 | 196 | def forward(self, x): 197 | """Forward pass through the CSP bottleneck with 2 convolutions.""" 198 | return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) 199 | 200 | 201 | class C3x(C3): 202 | """C3 module with cross-convolutions.""" 203 | 204 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): 205 | """Initialize C3TR instance and set default parameters.""" 206 | super().__init__(c1, c2, n, shortcut, g, e) 207 | self.c_ = int(c2 * e) 208 | self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n))) 209 | 210 | 211 | class RepC3(nn.Module): 212 | """Rep C3.""" 213 | 214 | def __init__(self, c1, c2, n=3, e=1.0): 215 | super().__init__() 216 | c_ = int(c2 * e) # hidden channels 217 | self.cv1 = Conv(c1, c2, 1, 1) 218 | self.cv2 = Conv(c1, c2, 1, 1) 219 | self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)]) 220 | self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity() 221 | 222 | def forward(self, x): 223 | """Forward pass of RT-DETR neck layer.""" 224 | return self.cv3(self.m(self.cv1(x)) + self.cv2(x)) 225 | 226 | # 227 | # class C3TR(C3): 228 | # """C3 module with TransformerBlock().""" 229 | # 230 | # def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): 231 | # """Initialize C3Ghost module 
with GhostBottleneck().""" 232 | # super().__init__(c1, c2, n, shortcut, g, e) 233 | # c_ = int(c2 * e) 234 | # self.m = TransformerBlock(c_, c_, 4, n) 235 | 236 | 237 | class C3Ghost(C3): 238 | """C3 module with GhostBottleneck().""" 239 | 240 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): 241 | """Initialize 'SPP' module with various pooling sizes for spatial pyramid pooling.""" 242 | super().__init__(c1, c2, n, shortcut, g, e) 243 | c_ = int(c2 * e) # hidden channels 244 | self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n))) 245 | 246 | 247 | class GhostBottleneck(nn.Module): 248 | """Ghost Bottleneck https://github.com/huawei-noah/ghostnet.""" 249 | 250 | def __init__(self, c1, c2, k=3, s=1): # ch_in, ch_out, kernel, stride 251 | super().__init__() 252 | c_ = c2 // 2 253 | self.conv = nn.Sequential( 254 | GhostConv(c1, c_, 1, 1), # pw 255 | DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw 256 | GhostConv(c_, c2, 1, 1, act=False)) # pw-linear 257 | self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, 258 | act=False)) if s == 2 else nn.Identity() 259 | 260 | def forward(self, x): 261 | """Applies skip connection and concatenation to input tensor.""" 262 | return self.conv(x) + self.shortcut(x) 263 | 264 | 265 | class Bottleneck(nn.Module): 266 | """Standard bottleneck.""" 267 | 268 | def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): # ch_in, ch_out, shortcut, groups, kernels, expand 269 | super().__init__() 270 | c_ = int(c2 * e) # hidden channels 271 | self.cv1 = Conv(c1, c_, k[0], 1) 272 | self.cv2 = Conv(c_, c2, k[1], 1, g=g) 273 | self.add = shortcut and c1 == c2 274 | 275 | def forward(self, x): 276 | """'forward()' applies the YOLOv5 FPN to input data.""" 277 | return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) 278 | 279 | 280 | class BottleneckCSP(nn.Module): 281 | """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.""" 282 | 283 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 284 | super().__init__() 285 | c_ = int(c2 * e) # hidden channels 286 | self.cv1 = Conv(c1, c_, 1, 1) 287 | self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) 288 | self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) 289 | self.cv4 = Conv(2 * c_, c2, 1, 1) 290 | self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) 291 | self.act = nn.SiLU() 292 | self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) 293 | 294 | def forward(self, x): 295 | """Applies a CSP bottleneck with 3 convolutions.""" 296 | y1 = self.cv3(self.m(self.cv1(x))) 297 | y2 = self.cv2(x) 298 | return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1)))) 299 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/build_components.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from detectron2.utils.logger import _log_api_usage 3 | from detectron2.utils.registry import Registry 4 | 5 | TRANSFORMER_ENCODER_REGISTRY = Registry("TRANSFORMER_ENCODER") # noqa F401 isort:skip 6 | TRANSFORMER_ENCODER_REGISTRY.__doc__ = """ 7 | """ 8 | 9 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_DECODER") # noqa F401 isort:skip 10 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """ """ 11 | 12 | DETR_HEAD_REGISTRY = Registry("DETR_HEAD") # noqa F401 isort:skip 13 | DETR_HEAD_REGISTRY.__doc__ = """ """ 14 | 15 | 16 | def 
build_encoder_model(cfg): 17 | """ 18 | Build the transformer encoder defined by ``cfg.MODEL.TRANSFORMER_ENCODER``. 19 | Note that it does not load any weights from ``cfg``. 20 | """ 21 | encoder = cfg.MODEL.TRANSFORMER_ENCODER 22 | mode_class = TRANSFORMER_ENCODER_REGISTRY.get(encoder) 23 | model = mode_class(**mode_class.from_config(cfg)) 24 | # model = TRANSFORMER_ENCODER_REGISTRY.get(encoder)(cfg) 25 | model.to(torch.device(cfg.MODEL.DEVICE)) 26 | _log_api_usage("modeling.transformer_encoder." + encoder) 27 | return model 28 | 29 | 30 | def build_decoder_model(cfg): 31 | """ 32 | Build the transformer decoder defined by ``cfg.MODEL.TRANSFORMER_DECODER``. 33 | Note that it does not load any weights from ``cfg``. 34 | """ 35 | decoder = cfg.MODEL.TRANSFORMER_DECODER 36 | mode_class = TRANSFORMER_DECODER_REGISTRY.get(decoder) 37 | model = mode_class(**mode_class.from_config(cfg)) 38 | # model = TRANSFORMER_DECODER_REGISTRY.get(decoder)(cfg) 39 | model.to(torch.device(cfg.MODEL.DEVICE)) 40 | _log_api_usage("modeling.transformer_decoder." + decoder) 41 | return model 42 | 43 | 44 | def build_detr_head(cfg): 45 | """ 46 | Build the detection head defined by ``cfg.MODEL.HEAD``. 47 | Note that it does not load any weights from ``cfg``. 48 | """ 49 | head = cfg.MODEL.HEAD 50 | # model = DETR_HEAD_REGISTRY.get(head)(cfg) 51 | mode_class = DETR_HEAD_REGISTRY.get(head) 52 | model = mode_class(**mode_class.from_config(cfg)) 53 | model.to(torch.device(cfg.MODEL.DEVICE)) 54 | _log_api_usage("modeling.detr_head." + head) 55 | return model -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode as CN 2 | from omdet.modeling.backbone.config import add_backbone_config 3 | 4 | 5 | def add_omdet_v2_turbo_config(cfg): 6 | """ 7 | Add config for OmDet V2 Turbo.
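    The extra keys must be registered on ``cfg`` before any YAML file that sets them is merged,
    so call this function right after ``get_cfg()``. A minimal usage sketch following the usual
    detectron2 pattern (the YAML path is the config shipped under ``configs/`` in this repo):

        from detectron2.config import get_cfg
        from omdet.omdet_v2_turbo import add_omdet_v2_turbo_config

        cfg = get_cfg()
        add_omdet_v2_turbo_config(cfg)  # register the OmDet-Turbo keys first
        cfg.merge_from_file("configs/OmDet-Turbo_tiny_SWIN_T.yaml")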
8 | """ 9 | cfg.MODEL.HEAD = "DINOHead" 10 | cfg.MODEL.LOSS = "DINOLoss" 11 | cfg.MODEL.TRANSFORMER_ENCODER = "ELAEncoder" 12 | cfg.MODEL.TRANSFORMER_DECODER = "ELADecoder" 13 | 14 | cfg.MODEL.LANGUAGE_BACKBONE = CN() 15 | cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "clip" 16 | cfg.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 512 17 | 18 | # Task Head 19 | cfg.MODEL.ELAEncoder = CN() 20 | cfg.MODEL.ELAEncoder.in_channels = [192, 384, 768] 21 | cfg.MODEL.ELAEncoder.feat_strides = [8, 16, 32] 22 | cfg.MODEL.ELAEncoder.hidden_dim = 256 23 | cfg.MODEL.ELAEncoder.use_encoder_idx = [2] 24 | cfg.MODEL.ELAEncoder.num_encoder_layers = 1 25 | cfg.MODEL.ELAEncoder.encoder_layer = 'TransformerLayer' 26 | cfg.MODEL.ELAEncoder.pe_temperature = 10000 27 | cfg.MODEL.ELAEncoder.expansion = 1.0 28 | cfg.MODEL.ELAEncoder.depth_mult = 1.0 29 | cfg.MODEL.ELAEncoder.act = 'silu' 30 | cfg.MODEL.ELAEncoder.eval_size = None 31 | cfg.MODEL.ELAEncoder.dim_feedforward=1024 32 | 33 | cfg.MODEL.ELADecoder = CN() 34 | cfg.MODEL.ELADecoder.hidden_dim = 256 35 | cfg.MODEL.ELADecoder.num_queries = 300 36 | cfg.MODEL.ELADecoder.position_embed_type = 'sine' 37 | cfg.MODEL.ELADecoder.backbone_feat_channels = [256, 256, 256] 38 | cfg.MODEL.ELADecoder.feat_strides = [8, 16, 32] 39 | cfg.MODEL.ELADecoder.num_levels = 3 40 | cfg.MODEL.ELADecoder.num_decoder_points = 4 41 | cfg.MODEL.ELADecoder.nhead = 8 42 | cfg.MODEL.ELADecoder.num_decoder_layers = 3 43 | cfg.MODEL.ELADecoder.dim_feedforward = 1024 44 | cfg.MODEL.ELADecoder.dropout = 0.0 45 | cfg.MODEL.ELADecoder.activation = 'relu' 46 | cfg.MODEL.ELADecoder.num_denoising = 100 47 | cfg.MODEL.ELADecoder.label_noise_ratio = 0.5 48 | cfg.MODEL.ELADecoder.box_noise_scale = 1.0 49 | cfg.MODEL.ELADecoder.learnt_init_query = True 50 | cfg.MODEL.ELADecoder.eval_size = None 51 | cfg.MODEL.ELADecoder.eval_idx = -1 52 | cfg.MODEL.ELADecoder.eps = 1e-2 53 | cfg.MODEL.ELADecoder.cls_type = 'cosine' 54 | 55 | cfg.MODEL.FUSE_TYPE = None 56 | 57 | cfg.INPUT.RANDOM_CROP = None 58 | cfg.INPUT.RANDOM_CONTRAST = None 59 | cfg.INPUT.RANDOM_BRIGHTNESS = None 60 | cfg.INPUT.RANDOM_SATURATION = None 61 | 62 | cfg.MODEL.DEPLOY_MODE = False 63 | 64 | add_backbone_config(cfg) -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/conv.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | __all__ = ('Conv', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv', 8 | 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'RepConv') 9 | 10 | 11 | def autopad(k, p=None, d=1): # kernel, padding, dilation 12 | """Pad to 'same' shape outputs.""" 13 | if d > 1: 14 | k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size 15 | if p is None: 16 | p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad 17 | return p 18 | 19 | 20 | class Conv(nn.Module): 21 | """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation).""" 22 | default_act = nn.SiLU() # default activation 23 | 24 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): 25 | """Initialize Conv layer with given arguments including activation.""" 26 | super().__init__() 27 | self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) 28 | self.bn = nn.BatchNorm2d(c2) 29 | self.act = self.default_act if act is True else act if 
isinstance(act, nn.Module) else nn.Identity() 30 | 31 | def forward(self, x): 32 | """Apply convolution, batch normalization and activation to input tensor.""" 33 | return self.act(self.bn(self.conv(x))) 34 | 35 | def forward_fuse(self, x): 36 | """Perform transposed convolution of 2D data.""" 37 | return self.act(self.conv(x)) 38 | 39 | 40 | class Conv2(Conv): 41 | """Simplified RepConv module with Conv fusing.""" 42 | 43 | def __init__(self, c1, c2, k=3, s=1, p=None, g=1, d=1, act=True): 44 | """Initialize Conv layer with given arguments including activation.""" 45 | super().__init__(c1, c2, k, s, p, g=g, d=d, act=act) 46 | self.cv2 = nn.Conv2d(c1, c2, 1, s, autopad(1, p, d), groups=g, dilation=d, bias=False) # add 1x1 conv 47 | 48 | def forward(self, x): 49 | """Apply convolution, batch normalization and activation to input tensor.""" 50 | return self.act(self.bn(self.conv(x) + self.cv2(x))) 51 | 52 | def fuse_convs(self): 53 | """Fuse parallel convolutions.""" 54 | w = torch.zeros_like(self.conv.weight.data) 55 | i = [x // 2 for x in w.shape[2:]] 56 | w[:, :, i[0]:i[0] + 1, i[1]:i[1] + 1] = self.cv2.weight.data.clone() 57 | self.conv.weight.data += w 58 | self.__delattr__('cv2') 59 | 60 | 61 | class LightConv(nn.Module): 62 | """Light convolution with args(ch_in, ch_out, kernel). 63 | https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py 64 | """ 65 | 66 | def __init__(self, c1, c2, k=1, act=nn.ReLU()): 67 | """Initialize Conv layer with given arguments including activation.""" 68 | super().__init__() 69 | self.conv1 = Conv(c1, c2, 1, act=False) 70 | self.conv2 = DWConv(c2, c2, k, act=act) 71 | 72 | def forward(self, x): 73 | """Apply 2 convolutions to input tensor.""" 74 | return self.conv2(self.conv1(x)) 75 | 76 | 77 | class DWConv(Conv): 78 | """Depth-wise convolution.""" 79 | 80 | def __init__(self, c1, c2, k=1, s=1, d=1, act=True): # ch_in, ch_out, kernel, stride, dilation, activation 81 | super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act) 82 | 83 | 84 | class DWConvTranspose2d(nn.ConvTranspose2d): 85 | """Depth-wise transpose convolution.""" 86 | 87 | def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): # ch_in, ch_out, kernel, stride, padding, padding_out 88 | super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2)) 89 | 90 | 91 | class ConvTranspose(nn.Module): 92 | """Convolution transpose 2d layer.""" 93 | default_act = nn.SiLU() # default activation 94 | 95 | def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True): 96 | """Initialize ConvTranspose2d layer with batch normalization and activation function.""" 97 | super().__init__() 98 | self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn) 99 | self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity() 100 | self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() 101 | 102 | def forward(self, x): 103 | """Applies transposed convolutions, batch normalization and activation to input.""" 104 | return self.act(self.bn(self.conv_transpose(x))) 105 | 106 | def forward_fuse(self, x): 107 | """Applies activation and convolution transpose operation to input.""" 108 | return self.act(self.conv_transpose(x)) 109 | 110 | 111 | class Focus(nn.Module): 112 | """Focus wh information into c-space.""" 113 | 114 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups 115 | super().__init__() 116 | self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act) 117 | # 
self.contract = Contract(gain=2) 118 | 119 | def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2) 120 | return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1)) 121 | # return self.conv(self.contract(x)) 122 | 123 | 124 | class GhostConv(nn.Module): 125 | """Ghost Convolution https://github.com/huawei-noah/ghostnet.""" 126 | 127 | def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups 128 | super().__init__() 129 | c_ = c2 // 2 # hidden channels 130 | self.cv1 = Conv(c1, c_, k, s, None, g, act=act) 131 | self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act) 132 | 133 | def forward(self, x): 134 | """Forward propagation through a Ghost Bottleneck layer with skip connection.""" 135 | y = self.cv1(x) 136 | return torch.cat((y, self.cv2(y)), 1) 137 | 138 | 139 | class RepConv(nn.Module): 140 | """RepConv is a basic rep-style block, including training and deploy status 141 | This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py 142 | """ 143 | default_act = nn.SiLU() # default activation 144 | 145 | def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False): 146 | super().__init__() 147 | assert k == 3 and p == 1 148 | self.g = g 149 | self.c1 = c1 150 | self.c2 = c2 151 | self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() 152 | 153 | self.bn = nn.BatchNorm2d(num_features=c1) if bn and c2 == c1 and s == 1 else None 154 | self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False) 155 | self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False) 156 | 157 | def forward_fuse(self, x): 158 | """Forward process""" 159 | return self.act(self.conv(x)) 160 | 161 | def forward(self, x): 162 | """Forward process""" 163 | id_out = 0 if self.bn is None else self.bn(x) 164 | return self.act(self.conv1(x) + self.conv2(x) + id_out) 165 | 166 | def get_equivalent_kernel_bias(self): 167 | kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) 168 | kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) 169 | kernelid, biasid = self._fuse_bn_tensor(self.bn) 170 | return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid 171 | 172 | def _avg_to_3x3_tensor(self, avgp): 173 | channels = self.c1 174 | groups = self.g 175 | kernel_size = avgp.kernel_size 176 | input_dim = channels // groups 177 | k = torch.zeros((channels, input_dim, kernel_size, kernel_size)) 178 | k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2 179 | return k 180 | 181 | def _pad_1x1_to_3x3_tensor(self, kernel1x1): 182 | if kernel1x1 is None: 183 | return 0 184 | else: 185 | return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) 186 | 187 | def _fuse_bn_tensor(self, branch): 188 | if branch is None: 189 | return 0, 0 190 | if isinstance(branch, Conv): 191 | kernel = branch.conv.weight 192 | running_mean = branch.bn.running_mean 193 | running_var = branch.bn.running_var 194 | gamma = branch.bn.weight 195 | beta = branch.bn.bias 196 | eps = branch.bn.eps 197 | elif isinstance(branch, nn.BatchNorm2d): 198 | if not hasattr(self, 'id_tensor'): 199 | input_dim = self.c1 // self.g 200 | kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32) 201 | for i in range(self.c1): 202 | kernel_value[i, i % input_dim, 1, 1] = 1 203 | self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) 204 | kernel = self.id_tensor 205 | running_mean = branch.running_mean 206 | 
running_var = branch.running_var 207 | gamma = branch.weight 208 | beta = branch.bias 209 | eps = branch.eps 210 | std = (running_var + eps).sqrt() 211 | t = (gamma / std).reshape(-1, 1, 1, 1) 212 | return kernel * t, beta - running_mean * gamma / std 213 | 214 | def fuse_convs(self): 215 | if hasattr(self, 'conv'): 216 | return 217 | kernel, bias = self.get_equivalent_kernel_bias() 218 | self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels, 219 | out_channels=self.conv1.conv.out_channels, 220 | kernel_size=self.conv1.conv.kernel_size, 221 | stride=self.conv1.conv.stride, 222 | padding=self.conv1.conv.padding, 223 | dilation=self.conv1.conv.dilation, 224 | groups=self.conv1.conv.groups, 225 | bias=True).requires_grad_(False) 226 | self.conv.weight.data = kernel 227 | self.conv.bias.data = bias 228 | for para in self.parameters(): 229 | para.detach_() 230 | self.__delattr__('conv1') 231 | self.__delattr__('conv2') 232 | if hasattr(self, 'nm'): 233 | self.__delattr__('nm') 234 | if hasattr(self, 'bn'): 235 | self.__delattr__('bn') 236 | if hasattr(self, 'id_tensor'): 237 | self.__delattr__('id_tensor') 238 | 239 | 240 | class ChannelAttention(nn.Module): 241 | """Channel-attention module https://github.com/open-mmlab/mmdetection/tree/v3.0.0rc1/configs/rtmdet.""" 242 | 243 | def __init__(self, channels: int) -> None: 244 | super().__init__() 245 | self.pool = nn.AdaptiveAvgPool2d(1) 246 | self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True) 247 | self.act = nn.Sigmoid() 248 | 249 | def forward(self, x: torch.Tensor) -> torch.Tensor: 250 | return x * self.act(self.fc(self.pool(x))) 251 | 252 | 253 | class SpatialAttention(nn.Module): 254 | """Spatial-attention module.""" 255 | 256 | def __init__(self, kernel_size=7): 257 | """Initialize Spatial-attention module with kernel size argument.""" 258 | super().__init__() 259 | assert kernel_size in (3, 7), 'kernel size must be 3 or 7' 260 | padding = 3 if kernel_size == 7 else 1 261 | self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False) 262 | self.act = nn.Sigmoid() 263 | 264 | def forward(self, x): 265 | """Apply channel and spatial attention on input for feature recalibration.""" 266 | return x * self.act(self.cv1(torch.cat([torch.mean(x, 1, keepdim=True), torch.max(x, 1, keepdim=True)[0]], 1))) 267 | 268 | 269 | class CBAM(nn.Module): 270 | """Convolutional Block Attention Module.""" 271 | 272 | def __init__(self, c1, kernel_size=7): # ch_in, kernels 273 | super().__init__() 274 | self.channel_attention = ChannelAttention(c1) 275 | self.spatial_attention = SpatialAttention(kernel_size) 276 | 277 | def forward(self, x): 278 | """Applies the forward pass through C1 module.""" 279 | return self.spatial_attention(self.channel_attention(x)) 280 | 281 | 282 | class Concat(nn.Module): 283 | """Concatenate a list of tensors along dimension.""" 284 | 285 | def __init__(self, dimension=1): 286 | """Concatenates a list of tensors along a specified dimension.""" 287 | super().__init__() 288 | self.d = dimension 289 | 290 | def forward(self, x): 291 | """Forward pass for the YOLOv8 mask Proto module.""" 292 | return torch.cat(x, self.d) 293 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/detector.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import copy 6 | from typing import Tuple 7 | 8 | import numpy as 
np 9 | # import open_clip 10 | from detectron2.structures import Boxes, ImageList, Instances 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from detectron2.modeling import detector_postprocess 15 | from detectron2.layers import batched_nms 16 | from detectron2.modeling import build_backbone 17 | from omdet.omdet_v2_turbo.build_components import build_encoder_model, build_decoder_model, build_detr_head 18 | from detectron2.config import configurable 19 | from omdet.modeling.language_backbone import build_language_backbone 20 | from detectron2.utils.logger import setup_logger 21 | from ..modeling.language_backbone.clip.models import clip as clip 22 | from .torch_utils import bbox_cxcywh_to_xyxy 23 | __all__ = ['OmDetV2Turbo'] 24 | 25 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 26 | 27 | from ..utils.cache import LRUCache 28 | 29 | from huggingface_hub import PyTorchModelHubMixin 30 | 31 | 32 | @META_ARCH_REGISTRY.register() 33 | class OmDetV2Turbo(nn.Module, PyTorchModelHubMixin): 34 | 35 | @configurable 36 | def __init__(self, cfg): 37 | super(OmDetV2Turbo, self).__init__() 38 | self.cfg = cfg 39 | self.logger = setup_logger(name=__name__) 40 | 41 | self.backbone = build_backbone(cfg) 42 | self.decoder = build_decoder_model(cfg) 43 | self.neck = build_encoder_model(cfg) 44 | self.loss_head = build_detr_head(cfg) 45 | self.device = cfg.MODEL.DEVICE 46 | 47 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) 48 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) 49 | normalizer = lambda x: (x - pixel_mean) / pixel_std 50 | self.normalizer = normalizer 51 | 52 | self.size_divisibility = self.backbone.size_divisibility 53 | self.nms_test_th = 0.0 54 | self.conf_test_th = 0.0 55 | self.loss_type = 'FOCAL' 56 | self.use_language_cache = True 57 | self.language_encoder_type = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 58 | self.num_proposals = cfg.MODEL.ELADecoder.num_queries 59 | 60 | # Build language Encoder 61 | self.language_backbone = build_language_backbone(cfg) 62 | self.language_cache_label = LRUCache(100) 63 | self.language_cache_prompt = LRUCache(100) 64 | 65 | 66 | @classmethod 67 | def from_config(cls, cfg, *args, **kwargs): 68 | return { 69 | 'cfg': cfg 70 | } 71 | 72 | def preprocess_image(self, batched_inputs): 73 | """ 74 | Normalize, pad and batch the input images. 
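        Each element of ``batched_inputs`` is a dict. The keys consumed by this class
        (see ``forward``, ``gen_output`` and ``get_language_embedding``) look roughly like
        the sketch below; the label and task strings are illustrative values only:

            {
                "image": image_tensor,                  # (3, H, W), scale matching cfg.MODEL.PIXEL_MEAN/STD
                "height": 480, "width": 640,            # optional original size, used for postprocessing
                "label_set": ["person", "dog", "cat"],  # candidate class names to ground
                "tasks": "detect person, dog and cat",  # free-form text prompt
                "ann_type": "box",                      # optional, defaults to "box"
            }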
75 | """ 76 | images = [self.normalizer(x["image"].to(self.device)) for x in batched_inputs] 77 | images = ImageList.from_tensors(images, self.size_divisibility) 78 | 79 | images_whwh = list() 80 | for bi in batched_inputs: 81 | h, w = bi["image"].shape[-2:] 82 | images_whwh.append(torch.tensor([w, h, w, h], dtype=torch.float32, device=self.device)) 83 | images_whwh = torch.stack(images_whwh) 84 | ann_types = [x["ann_type"] if "ann_type" in x else "box" for x in batched_inputs] 85 | return images, images_whwh, ann_types 86 | 87 | def gen_output(self, box_cls, box_pred, batched_inputs, images, score_thresh, nms_thresh, do_postprocess, 88 | max_num_det=None): 89 | results = self.inference(box_cls, box_pred, images.image_sizes, score_thresh, nms_thresh, max_num_det) 90 | 91 | if do_postprocess: 92 | processed_results = [] 93 | for results_per_image, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): 94 | height = input_per_image.get("height", image_size[0]) 95 | width = input_per_image.get("width", image_size[1]) 96 | r = detector_postprocess(results_per_image, height, width) 97 | processed_results.append({"instances": r}) 98 | results = processed_results 99 | return results 100 | 101 | def inference(self, box_cls, box_pred, image_sizes, score_thresh=None, nms_thresh=None, max_num_det=None): 102 | assert len(box_cls) == len(image_sizes) 103 | if score_thresh is None: 104 | score_thresh = self.conf_test_th 105 | 106 | if nms_thresh is None: 107 | nms_thresh = self.nms_test_th 108 | 109 | num_classes = box_cls.shape[2] 110 | scores, labels = self.compute_score(box_cls) 111 | results = [] 112 | if self.loss_type in {"FOCAL", "BCE"}: 113 | for i, (scores_img, box_per_img, image_size) in enumerate(zip(scores, box_pred, image_sizes 114 | )): 115 | results.append(self.inference_single_image(box_per_img, scores_img, labels, image_size, num_classes, 116 | score_thresh=score_thresh, 117 | nms_thresh=nms_thresh, 118 | max_num_det=max_num_det)) 119 | else: 120 | for i, (scores_img, label_img, box_per_img, image_size) in enumerate(zip( 121 | scores, labels, box_pred, image_sizes 122 | )): 123 | results.append( 124 | self.inference_single_image(box_per_img, scores_img, label_img, image_size, num_classes, 125 | score_thresh=score_thresh, 126 | nms_thresh=nms_thresh, 127 | max_num_det=max_num_det)) 128 | 129 | return results 130 | 131 | def inference_single_image(self, boxes, scores, labels, 132 | image_size: Tuple[int, int], 133 | num_classes: int, 134 | score_thresh: float, 135 | nms_thresh: float, 136 | max_num_det: int = None): 137 | """ 138 | Call `fast_rcnn_inference_single_image` for all images. 139 | Args: 140 | boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic 141 | boxes for each image. Element i has shape (Ri, K * 4) if doing 142 | class-specific regression, or (Ri, 4) if doing class-agnostic 143 | regression, where Ri is the number of predicted objects for image i. 144 | This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`. 145 | scores (list[Tensor]): A list of Tensors of predicted class scores for each image. 146 | Element i has shape (Ri, K + 1), where Ri is the number of predicted objects 147 | for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`. 148 | image_size (list[tuple]): A list of (width, height) tuples for each image in the batch. 149 | score_thresh (float): Only return detections with a confidence score exceeding this 150 | threshold. 
151 | nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. 152 | Returns: 153 | instances: (list[Instances]): A list of N instances, one for each image in the batch, 154 | that stores the topk most confidence detections. 155 | kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates 156 | the corresponding boxes/scores index in [0, Ri) from the input, for image i. 157 | """ 158 | # scores_per_image: num_proposal 159 | # labels_per_image: num_proposal 160 | # box_per_images: num_proposal x 4' 161 | if self.loss_type in {"FOCAL", "BCE"}: 162 | proposal_num = len(boxes) if max_num_det is None else max_num_det 163 | scores_per_image, topk_indices = scores.flatten(0, 1).topk(proposal_num, sorted=False) 164 | labels_per_image = labels[topk_indices] 165 | box_pred_per_image = boxes.view(-1, 1, 4).repeat(1, num_classes, 1).view(-1, 4) 166 | box_pred_per_image = box_pred_per_image[topk_indices] 167 | else: 168 | box_pred_per_image = boxes 169 | scores_per_image = scores 170 | labels_per_image = labels 171 | 172 | # Score filtering 173 | box_pred_per_image = bbox_cxcywh_to_xyxy(box_pred_per_image) * torch.tensor(image_size).repeat(2).to(self.device) 174 | filter_mask = scores_per_image > score_thresh # R x K 175 | score_keep = filter_mask.nonzero(as_tuple=False).view(-1) 176 | box_pred_per_image = box_pred_per_image[score_keep] 177 | scores_per_image = scores_per_image[score_keep] 178 | labels_per_image = labels_per_image[score_keep] 179 | 180 | # NMS 181 | scores_per_image.to(self.device) 182 | keep = batched_nms(box_pred_per_image, scores_per_image, labels_per_image, nms_thresh) 183 | box_pred_per_image = box_pred_per_image[keep] 184 | scores_per_image = scores_per_image[keep] 185 | labels_per_image = labels_per_image[keep] 186 | 187 | # create an instance 188 | result = Instances(image_size) 189 | result.pred_boxes = Boxes(box_pred_per_image) 190 | result.pred_boxes.clip(image_size) 191 | result.scores = scores_per_image 192 | result.pred_classes = labels_per_image 193 | 194 | return result 195 | 196 | def compute_score(self, box_cls): 197 | """ 198 | Args: 199 | box_cls: tensor of shape (batch_size, num_proposals, K). 200 | The tensor predicts the classification probability for each proposal. 201 | 202 | Returns: 203 | """ 204 | if self.loss_type in {"FOCAL", "BCE"}: 205 | num_classes = box_cls.shape[2] 206 | proposal_num = box_cls.shape[1] 207 | scores = torch.sigmoid(box_cls) 208 | labels = torch.arange(num_classes, device=self.device). 
\ 209 | unsqueeze(0).repeat(proposal_num, 1).flatten(0, 1) 210 | else: 211 | scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1) 212 | # scores: batch_size x num_proposal 213 | 214 | return scores, labels 215 | 216 | def language_encode(self, batched_inputs, encode_type="task"): 217 | texts = batched_inputs 218 | 219 | if self.language_encoder_type == "clip": 220 | text_input = clip.tokenize(texts, truncate=True).to(self.device) 221 | 222 | return self.language_backbone(text_input, encode_type == "task") 223 | 224 | def get_cached_label_emb(self, labels): 225 | self.logger.info('processing labels embeddings for {}'.format(labels)) 226 | not_cached_index = [] 227 | not_cached_labels = [] 228 | total_embs = [] 229 | for idx, l in enumerate(labels): 230 | if self.language_cache_label.has(l): 231 | total_embs.append(self.language_cache_label.get(l)) 232 | else: 233 | total_embs.append(None) 234 | not_cached_index.append(idx) 235 | not_cached_labels.append(l) 236 | 237 | self.logger.info('cached label emb num: {}, not cached num: {}'.format(len(total_embs) - len(not_cached_labels), 238 | len(not_cached_labels))) 239 | 240 | if not_cached_labels: 241 | embeddings = self.language_encode(not_cached_labels, encode_type="label") 242 | for idx, emb in enumerate(embeddings): 243 | idx_to_put = not_cached_index[idx] 244 | total_embs[idx_to_put] = emb 245 | self.language_cache_label.put(not_cached_labels[idx], emb) 246 | 247 | total_label_embs = torch.stack(total_embs).to(self.device) 248 | return total_label_embs 249 | 250 | def get_cached_prompt_emb(self, batched_tasks): 251 | self.logger.info('processing prompt embeddings for {}'.format(batched_tasks)) 252 | not_cached_index = [] 253 | not_cached_tasks = [] 254 | total_task_features = [] 255 | total_task_masks = [] 256 | for idx, t in enumerate(batched_tasks): 257 | if self.language_cache_prompt.has(t): 258 | task_feature, task_mask = self.language_cache_prompt.get(t) 259 | total_task_features.append(task_feature) 260 | total_task_masks.append(task_mask) 261 | else: 262 | total_task_features.append(None) 263 | total_task_masks.append(None) 264 | not_cached_index.append(idx) 265 | not_cached_tasks.append(t) 266 | 267 | self.logger.info( 268 | 'cached prompt emb num: {}, not cached num: {}'.format(len(total_task_features) - len(not_cached_tasks), 269 | len(not_cached_tasks))) 270 | 271 | if not_cached_tasks: 272 | embeddings, task_masks = self.language_encode(not_cached_tasks, encode_type="task") 273 | 274 | for idx in range(embeddings.shape[1]): 275 | emb = embeddings[:, [idx], :] 276 | idx_to_put = not_cached_index[idx] 277 | cur_mask = torch.unsqueeze(task_masks[idx], dim=0).to(self.device) 278 | total_task_features[idx_to_put] = emb 279 | total_task_masks[idx_to_put] = cur_mask 280 | self.language_cache_prompt.put(not_cached_tasks[idx], (emb, cur_mask)) 281 | 282 | total_prompt_features = torch.cat(total_task_features, dim=1) 283 | total_prompt_masks = torch.cat(total_task_masks, dim=0).to(self.device) 284 | 285 | return total_prompt_features, total_prompt_masks 286 | 287 | def get_language_embedding(self, batched_inputs): 288 | batched_labels = [a["label_set"] for a in batched_inputs] 289 | batched_tasks = [a['tasks'] for a in batched_inputs] 290 | 291 | max_label_size = max([len(a) for a in batched_labels]) 292 | label_features = [] 293 | for i, s_labels in enumerate(batched_labels): 294 | pad_size = max_label_size - len(s_labels) 295 | 296 | label_emb = self.get_cached_label_emb(s_labels) 297 | label_features.append(F.pad(label_emb, 
(0, 0, 0, pad_size)).unsqueeze(1).to(self.device)) 298 | 299 | label_features = torch.cat(label_features, dim=1) # num_label x batch_size x dim_size 300 | 301 | # Task Features 302 | # prompt_features: max_task_len x batch_size x dim_size 303 | # prompt_mask: batch_size x max_task_len 304 | # batched_tasks = ['detect a person', 'detect dog and cat'] 305 | prompt_features, prompt_mask = self.get_cached_prompt_emb(batched_tasks) 306 | 307 | return label_features, prompt_features, prompt_mask 308 | 309 | def forward(self, batched_inputs, do_postprocess=True, score_thresh=0.0, nms_thresh=1.0, debug=False): 310 | images, images_whwh, ann_types = self.preprocess_image(batched_inputs) 311 | 312 | # Backbone 313 | body_feats = self.backbone(images.tensor) 314 | 315 | if type(body_feats) is dict: 316 | body_feats = [body_feats[i] for i in body_feats.keys()] 317 | 318 | encoder_feats = self.neck(body_feats) 319 | 320 | if not self.training: 321 | # create label and prompt embeddings 322 | label_feats, prompt_feats, prompt_mask = self.get_language_embedding(batched_inputs) 323 | decoder_feats = self.decoder(encoder_feats, label_feats, prompt_feats, prompt_mask) 324 | box_pred, box_cls, _ = self.loss_head(decoder_feats) 325 | 326 | results = self.gen_output(box_cls, box_pred, batched_inputs, images, 327 | score_thresh, nms_thresh, do_postprocess, 328 | max_num_det=self.num_proposals) 329 | 330 | return results 331 | 332 | def print_trainable_parameters(self): 333 | """ 334 | Prints the number of trainable parameters in the model. 335 | """ 336 | trainable_params = 0 337 | all_param = 0 338 | for _, param in self.named_parameters(): 339 | num_params = param.numel() 340 | # if using DS Zero 3 and the weights are initialized empty 341 | if num_params == 0 and hasattr(param, "ds_numel"): 342 | num_params = param.ds_numel 343 | 344 | all_param += num_params 345 | if param.requires_grad: 346 | trainable_params += num_params 347 | print( 348 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" 349 | ) -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/detr_torch.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, List 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn, Tensor 7 | 8 | 9 | class Transformer(nn.Module): 10 | 11 | def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, 12 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 13 | activation="relu", normalize_before=False, 14 | return_intermediate_dec=False): 15 | super().__init__() 16 | 17 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 18 | dropout, activation, normalize_before) 19 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 20 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 21 | 22 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 23 | dropout, activation, normalize_before) 24 | decoder_norm = nn.LayerNorm(d_model) 25 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 26 | return_intermediate=return_intermediate_dec) 27 | 28 | self._reset_parameters() 29 | 30 | self.d_model = d_model 31 | self.nhead = nhead 32 | 33 | def _reset_parameters(self): 34 | for p in self.parameters(): 35 | if p.dim() > 1: 36 | nn.init.xavier_uniform_(p) 37 | 38 | def forward(self, 
src, mask, query_embed, pos_embed): 39 | # flatten NxCxHxW to HWxNxC 40 | bs, c, h, w = src.shape 41 | src = src.flatten(2).permute(2, 0, 1) 42 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1) 43 | query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) 44 | mask = mask.flatten(1) 45 | 46 | tgt = torch.zeros_like(query_embed) 47 | memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) 48 | hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, 49 | pos=pos_embed, query_pos=query_embed) 50 | return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) 51 | 52 | 53 | class TransformerEncoder(nn.Module): 54 | def __init__(self, encoder_layer, num_layers, norm=None): 55 | super(TransformerEncoder, self).__init__() 56 | # self.layers = _get_clones(encoder_layer, num_layers) 57 | self.layers = [encoder_layer] 58 | self.num_layers = num_layers 59 | self.norm = norm 60 | 61 | def forward(self, src, src_mask=None, pos_embed=None): 62 | output = src 63 | pos_embed = pos_embed.clone().detach() if pos_embed is not None else pos_embed 64 | for layer in self.layers: 65 | output = layer(output, src_mask=src_mask, pos_embed=pos_embed) 66 | 67 | if self.norm is not None: 68 | output = self.norm(output) 69 | 70 | return output 71 | 72 | 73 | class TransformerDecoder(nn.Module): 74 | 75 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): 76 | super().__init__() 77 | self.layers = _get_clones(decoder_layer, num_layers) 78 | self.num_layers = num_layers 79 | self.norm = norm 80 | self.return_intermediate = return_intermediate 81 | 82 | def forward(self, tgt, memory, 83 | tgt_mask: Optional[Tensor] = None, 84 | memory_mask: Optional[Tensor] = None, 85 | tgt_key_padding_mask: Optional[Tensor] = None, 86 | memory_key_padding_mask: Optional[Tensor] = None, 87 | pos: Optional[Tensor] = None, 88 | query_pos: Optional[Tensor] = None): 89 | output = tgt 90 | 91 | intermediate = [] 92 | 93 | for layer in self.layers: 94 | output = layer(output, memory, tgt_mask=tgt_mask, 95 | memory_mask=memory_mask, 96 | tgt_key_padding_mask=tgt_key_padding_mask, 97 | memory_key_padding_mask=memory_key_padding_mask, 98 | pos=pos, query_pos=query_pos) 99 | if self.return_intermediate: 100 | intermediate.append(self.norm(output)) 101 | 102 | if self.norm is not None: 103 | output = self.norm(output) 104 | if self.return_intermediate: 105 | intermediate.pop() 106 | intermediate.append(output) 107 | 108 | if self.return_intermediate: 109 | return torch.stack(intermediate) 110 | 111 | return output.unsqueeze(0) 112 | 113 | 114 | class TransformerEncoderLayer(nn.Module): 115 | 116 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 117 | activation="relu", normalize_before=False): 118 | super().__init__() 119 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 120 | # Implementation of Feedforward model 121 | self.linear1 = nn.Linear(d_model, dim_feedforward) 122 | self.dropout = nn.Dropout(dropout) 123 | self.linear2 = nn.Linear(dim_feedforward, d_model) 124 | 125 | self.norm1 = nn.LayerNorm(d_model) 126 | self.norm2 = nn.LayerNorm(d_model) 127 | self.dropout1 = nn.Dropout(dropout) 128 | self.dropout2 = nn.Dropout(dropout) 129 | 130 | self.activation = _get_activation_fn(activation) 131 | self.normalize_before = normalize_before 132 | 133 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 134 | return tensor if pos is None else tensor + pos 135 | 136 | def forward_post(self, 137 | src, 138 | src_mask: Optional[Tensor] = None, 
139 | src_key_padding_mask: Optional[Tensor] = None, 140 | pos: Optional[Tensor] = None): 141 | q = k = self.with_pos_embed(src, pos) 142 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, 143 | key_padding_mask=src_key_padding_mask)[0] 144 | src = src + self.dropout1(src2) 145 | src = self.norm1(src) 146 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 147 | src = src + self.dropout2(src2) 148 | src = self.norm2(src) 149 | return src 150 | 151 | def forward_pre(self, src, 152 | src_mask: Optional[Tensor] = None, 153 | src_key_padding_mask: Optional[Tensor] = None, 154 | pos: Optional[Tensor] = None): 155 | src2 = self.norm1(src) 156 | q = k = self.with_pos_embed(src2, pos) 157 | src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, 158 | key_padding_mask=src_key_padding_mask)[0] 159 | src = src + self.dropout1(src2) 160 | src2 = self.norm2(src) 161 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) 162 | src = src + self.dropout2(src2) 163 | return src 164 | 165 | def forward(self, src, 166 | src_mask: Optional[Tensor] = None, 167 | src_key_padding_mask: Optional[Tensor] = None, 168 | pos: Optional[Tensor] = None): 169 | if self.normalize_before: 170 | return self.forward_pre(src, src_mask, src_key_padding_mask, pos) 171 | return self.forward_post(src, src_mask, src_key_padding_mask, pos) 172 | 173 | 174 | class TransformerDecoderLayer(nn.Module): 175 | 176 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 177 | activation="relu", normalize_before=False): 178 | super().__init__() 179 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 180 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 181 | # Implementation of Feedforward model 182 | self.linear1 = nn.Linear(d_model, dim_feedforward) 183 | self.dropout = nn.Dropout(dropout) 184 | self.linear2 = nn.Linear(dim_feedforward, d_model) 185 | 186 | self.norm1 = nn.LayerNorm(d_model) 187 | self.norm2 = nn.LayerNorm(d_model) 188 | self.norm3 = nn.LayerNorm(d_model) 189 | self.dropout1 = nn.Dropout(dropout) 190 | self.dropout2 = nn.Dropout(dropout) 191 | self.dropout3 = nn.Dropout(dropout) 192 | 193 | self.activation = _get_activation_fn(activation) 194 | self.normalize_before = normalize_before 195 | 196 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 197 | return tensor if pos is None else tensor + pos 198 | 199 | def forward_post(self, tgt, memory, 200 | tgt_mask: Optional[Tensor] = None, 201 | memory_mask: Optional[Tensor] = None, 202 | tgt_key_padding_mask: Optional[Tensor] = None, 203 | memory_key_padding_mask: Optional[Tensor] = None, 204 | pos: Optional[Tensor] = None, 205 | query_pos: Optional[Tensor] = None): 206 | q = k = self.with_pos_embed(tgt, query_pos) 207 | tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, 208 | key_padding_mask=tgt_key_padding_mask)[0] 209 | tgt = tgt + self.dropout1(tgt2) 210 | tgt = self.norm1(tgt) 211 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 212 | key=self.with_pos_embed(memory, pos), 213 | value=memory, attn_mask=memory_mask, 214 | key_padding_mask=memory_key_padding_mask)[0] 215 | tgt = tgt + self.dropout2(tgt2) 216 | tgt = self.norm2(tgt) 217 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 218 | tgt = tgt + self.dropout3(tgt2) 219 | tgt = self.norm3(tgt) 220 | return tgt 221 | 222 | def forward_pre(self, tgt, memory, 223 | tgt_mask: Optional[Tensor] = None, 224 | memory_mask: Optional[Tensor] = None, 
225 | tgt_key_padding_mask: Optional[Tensor] = None, 226 | memory_key_padding_mask: Optional[Tensor] = None, 227 | pos: Optional[Tensor] = None, 228 | query_pos: Optional[Tensor] = None): 229 | tgt2 = self.norm1(tgt) 230 | q = k = self.with_pos_embed(tgt2, query_pos) 231 | tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, 232 | key_padding_mask=tgt_key_padding_mask)[0] 233 | tgt = tgt + self.dropout1(tgt2) 234 | tgt2 = self.norm2(tgt) 235 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 236 | key=self.with_pos_embed(memory, pos), 237 | value=memory, attn_mask=memory_mask, 238 | key_padding_mask=memory_key_padding_mask)[0] 239 | tgt = tgt + self.dropout2(tgt2) 240 | tgt2 = self.norm3(tgt) 241 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) 242 | tgt = tgt + self.dropout3(tgt2) 243 | return tgt 244 | 245 | def forward(self, tgt, memory, 246 | tgt_mask: Optional[Tensor] = None, 247 | memory_mask: Optional[Tensor] = None, 248 | tgt_key_padding_mask: Optional[Tensor] = None, 249 | memory_key_padding_mask: Optional[Tensor] = None, 250 | pos: Optional[Tensor] = None, 251 | query_pos: Optional[Tensor] = None): 252 | if self.normalize_before: 253 | return self.forward_pre(tgt, memory, tgt_mask, memory_mask, 254 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 255 | return self.forward_post(tgt, memory, tgt_mask, memory_mask, 256 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 257 | 258 | 259 | def _get_clones(module, N): 260 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 261 | 262 | 263 | def build_transformer(args): 264 | return Transformer( 265 | d_model=args.hidden_dim, 266 | dropout=args.dropout, 267 | nhead=args.nheads, 268 | dim_feedforward=args.dim_feedforward, 269 | num_encoder_layers=args.enc_layers, 270 | num_decoder_layers=args.dec_layers, 271 | normalize_before=args.pre_norm, 272 | return_intermediate_dec=True, 273 | ) 274 | 275 | 276 | def _get_activation_fn(activation): 277 | """Return an activation function given a string""" 278 | if activation == "relu": 279 | return F.relu 280 | if activation == "gelu": 281 | return F.gelu 282 | if activation == "glu": 283 | return F.glu 284 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 285 | 286 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/dn_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from omdet.utils.box_ops import xywh2xyxy, xyxy2xywh 3 | 4 | 5 | def get_cdn_group(batch, 6 | num_classes, 7 | num_queries, 8 | class_embed, 9 | num_dn=100, 10 | cls_noise_ratio=0.5, 11 | box_noise_scale=1.0, 12 | training=False, 13 | amp=False): 14 | """ 15 | Get contrastive denoising training group. This function creates a contrastive denoising training group with 16 | positive and negative samples from the ground truths (gt). It applies noise to the class labels and bounding 17 | box coordinates, and returns the modified labels, bounding boxes, attention mask and meta information. 18 | 19 | Args: 20 | batch (dict): A dict that includes 'gt_cls' (torch.Tensor with shape [num_gts, ]), 'gt_bboxes' 21 | (torch.Tensor with shape [num_gts, 4]), 'gt_groups' (List(int)) which is a list of batch size length 22 | indicating the number of gts of each image. 23 | num_classes (int): Number of classes. 24 | num_queries (int): Number of queries. 
25 | class_embed (torch.Tensor): Embedding weights to map class labels to embedding space. 26 | num_dn (int, optional): Number of denoising. Defaults to 100. 27 | cls_noise_ratio (float, optional): Noise ratio for class labels. Defaults to 0.5. 28 | box_noise_scale (float, optional): Noise scale for bounding box coordinates. Defaults to 1.0. 29 | training (bool, optional): If it's in training mode. Defaults to False. 30 | 31 | Returns: 32 | (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Dict]]): The modified class embeddings, 33 | bounding boxes, attention mask and meta information for denoising. If not in training mode or 'num_dn' 34 | is less than or equal to 0, the function returns None for all elements in the tuple. 35 | """ 36 | 37 | if (not training) or num_dn <= 0: 38 | return None, None, None, None 39 | gt_groups = batch['gt_groups'] 40 | total_num = sum(gt_groups) 41 | max_nums = max(gt_groups) 42 | if max_nums == 0: 43 | return None, None, None, None 44 | 45 | num_group = num_dn // max_nums 46 | num_group = 1 if num_group == 0 else num_group 47 | # pad gt to max_num of a batch 48 | bs = len(gt_groups) 49 | gt_cls = batch['cls'] # (bs*num, ) 50 | gt_bbox = batch['bboxes'] # bs*num, 4 51 | b_idx = batch['batch_idx'] 52 | 53 | # each group has positive and negative queries. 54 | dn_cls = gt_cls.repeat(2 * num_group) # (2*num_group*bs*num, ) 55 | dn_bbox = gt_bbox.repeat(2 * num_group, 1) # 2*num_group*bs*num, 4 56 | dn_b_idx = b_idx.repeat(2 * num_group).view(-1).to(dn_cls.device) # (2*num_group*bs*num, ) 57 | 58 | # positive and negative mask 59 | # (bs*num*num_group, ), the second total_num*num_group part as negative samples 60 | neg_idx = torch.arange(total_num * num_group, dtype=torch.long, device=gt_bbox.device) + num_group * total_num 61 | 62 | if cls_noise_ratio > 0: 63 | # half of bbox prob 64 | mask = torch.rand(dn_cls.shape) < (cls_noise_ratio * 0.5) 65 | idx = torch.nonzero(mask).squeeze(-1) 66 | # randomly put a new one here 67 | new_label = torch.randint_like(idx, 0, num_classes, dtype=dn_cls.dtype, device=dn_cls.device) 68 | dn_cls[idx] = new_label 69 | 70 | if box_noise_scale > 0: 71 | known_bbox = xywh2xyxy(dn_bbox) 72 | 73 | diff = (dn_bbox[..., 2:] * 0.5).repeat(1, 2) * box_noise_scale # 2*num_group*bs*num, 4 74 | 75 | rand_sign = torch.randint_like(dn_bbox, 0, 2) * 2.0 - 1.0 76 | rand_part = torch.rand_like(dn_bbox) 77 | rand_part[neg_idx] += 1.0 78 | rand_part *= rand_sign 79 | known_bbox += rand_part * diff 80 | known_bbox.clip_(min=0.0, max=1.0) 81 | dn_bbox = xyxy2xywh(known_bbox) 82 | dn_bbox = inverse_sigmoid(dn_bbox) 83 | 84 | # total denoising queries 85 | num_dn = int(max_nums * 2 * num_group) 86 | # class_embed = torch.cat([class_embed, torch.zeros([1, class_embed.shape[-1]], device=class_embed.device)]) 87 | dn_cls_embed = class_embed[dn_cls] # bs*num * 2 * num_group, 256 88 | if amp: 89 | data_type = torch.bfloat16 90 | else: 91 | data_type = torch.float32 92 | padding_cls = torch.zeros(bs, num_dn, dn_cls_embed.shape[-1], device=gt_cls.device, dtype=data_type) 93 | padding_bbox = torch.zeros(bs, num_dn, 4, device=gt_bbox.device) 94 | 95 | map_indices = torch.cat([torch.tensor(range(num), dtype=torch.long, device=gt_cls.device) for num in gt_groups]) 96 | pos_idx = torch.stack([map_indices + max_nums * i for i in range(num_group)], dim=0) 97 | 98 | map_indices = torch.cat([map_indices + max_nums * i for i in range(2 * num_group)]) 99 | fix_class = dn_cls.dim() == 2 100 | if fix_class: 101 | padding_cls[(dn_b_idx, map_indices)] = 
dn_cls_embed 102 | else: 103 | padding_cls[(dn_b_idx.long(), map_indices)] = dn_cls_embed.transpose(1,0)[(dn_b_idx.long(), map_indices)] 104 | padding_bbox[(dn_b_idx.long(), map_indices)] = dn_bbox 105 | 106 | tgt_size = num_dn + num_queries 107 | attn_mask = torch.zeros([tgt_size, tgt_size], dtype=torch.bool) 108 | # match query cannot see the reconstruct 109 | attn_mask[num_dn:, :num_dn] = True 110 | # reconstruct cannot see each other 111 | for i in range(num_group): 112 | if i == 0: 113 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), max_nums * 2 * (i + 1):num_dn] = True 114 | if i == num_group - 1: 115 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), :max_nums * i * 2] = True 116 | else: 117 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), max_nums * 2 * (i + 1):num_dn] = True 118 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), :max_nums * 2 * i] = True 119 | dn_meta = { 120 | 'dn_pos_idx': [p.reshape(-1) for p in pos_idx.cpu().split(list(gt_groups), dim=1)], 121 | 'dn_num_group': num_group, 122 | 'dn_num_split': [num_dn, num_queries]} 123 | 124 | return padding_cls.to(class_embed.device), padding_bbox.to(class_embed.device), attn_mask.to( 125 | class_embed.device), dn_meta 126 | 127 | 128 | def inverse_sigmoid(x, eps=1e-6): 129 | """Inverse sigmoid function.""" 130 | x = x.clip(min=0., max=1.) 131 | return torch.log(x / (1 - x + eps) + eps) 132 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/ela_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .torch_utils import BaseConv, linear_init_ 5 | from .block import RepC3 6 | from .detr_torch import TransformerEncoder 7 | from .build_components import TRANSFORMER_ENCODER_REGISTRY 8 | 9 | __all__ = ['ELAEncoder'] 10 | 11 | 12 | class TransformerLayer(nn.Module): 13 | def __init__(self, 14 | d_model=256, 15 | nhead=8, 16 | dim_feedforward=1024, 17 | dropout=0., 18 | activation="relu", 19 | attn_dropout=None, 20 | act_dropout=None, 21 | normalize_before=False): 22 | super(TransformerLayer, self).__init__() 23 | attn_dropout = dropout if attn_dropout is None else attn_dropout 24 | act_dropout = dropout if act_dropout is None else act_dropout 25 | self.normalize_before = normalize_before 26 | 27 | self.self_attn = torch.nn.MultiheadAttention(d_model, nhead, attn_dropout, batch_first=True) 28 | # Implementation of Feedforward model 29 | self.linear1 = nn.Linear(d_model, dim_feedforward) 30 | self.dropout = nn.Dropout(act_dropout) 31 | self.linear2 = nn.Linear(dim_feedforward, d_model) 32 | 33 | self.norm1 = nn.LayerNorm(d_model) 34 | self.norm2 = nn.LayerNorm(d_model) 35 | self.dropout1 = nn.Dropout(dropout) 36 | self.dropout2 = nn.Dropout(dropout) 37 | self.activation = getattr(F, activation) 38 | self._reset_parameters() 39 | 40 | def _reset_parameters(self): 41 | linear_init_(self.linear1) 42 | linear_init_(self.linear2) 43 | 44 | @staticmethod 45 | def with_pos_embed(tensor, pos_embed): 46 | return tensor if pos_embed is None else tensor + pos_embed 47 | 48 | def forward(self, src, src_mask=None, pos_embed=None): 49 | residual = src 50 | if self.normalize_before: 51 | src = self.norm1(src) 52 | q = k = self.with_pos_embed(src, pos_embed) 53 | src = self.self_attn(q, k, value=src, attn_mask=src_mask) 54 | #print(src[1].shape, src[0].shape) 55 | src = src[0] 56 | src = residual + self.dropout1(src) 57 | if not self.normalize_before: 58 | src 
= self.norm1(src) 59 | 60 | residual = src 61 | if self.normalize_before: 62 | src = self.norm2(src) 63 | src = self.linear2(self.dropout(self.activation(self.linear1(src)))) 64 | src = residual + self.dropout2(src) 65 | if not self.normalize_before: 66 | src = self.norm2(src) 67 | return src 68 | 69 | 70 | @TRANSFORMER_ENCODER_REGISTRY.register() 71 | class ELAEncoder(nn.Module): 72 | # __shared__ = ['depth_mult', 'act', 'trt', 'eval_size'] 73 | # __inject__ = ['encoder_layer'] 74 | 75 | def __init__(self, 76 | in_channels=[128, 256, 512], 77 | feat_strides=[8, 16, 32], 78 | hidden_dim=256, 79 | use_encoder_idx=[2], 80 | num_encoder_layers=1, 81 | encoder_layer='TransformerLayer', 82 | pe_temperature=10000, 83 | expansion=1.0, 84 | depth_mult=1.0, 85 | act='silu', 86 | trt=False, 87 | dim_feedforward=1024, 88 | eval_size=None): 89 | super(ELAEncoder, self).__init__() 90 | self.in_channels = in_channels 91 | self.feat_strides = feat_strides 92 | self.hidden_dim = hidden_dim 93 | self.use_encoder_idx = use_encoder_idx 94 | self.num_encoder_layers = num_encoder_layers 95 | self.pe_temperature = pe_temperature 96 | self.eval_size = eval_size 97 | 98 | self.encoder_layer = TransformerLayer(dim_feedforward=dim_feedforward) 99 | 100 | # channel projection 101 | self.input_proj = nn.ModuleList() 102 | for in_channel in self.in_channels: 103 | self.input_proj.append( 104 | nn.Sequential( 105 | nn.Conv2d( 106 | in_channel, hidden_dim, kernel_size=(1, 1), bias=False), 107 | nn.BatchNorm2d( 108 | hidden_dim))) 109 | # encoder transformer 110 | self.encoder = nn.ModuleList([ 111 | TransformerEncoder(self.encoder_layer, num_encoder_layers) 112 | for _ in range(len(use_encoder_idx)) 113 | ]) 114 | 115 | # act = get_act_fn( 116 | # act, trt=trt) if act is None or isinstance(act, 117 | # (str, dict)) else act 118 | # top-down fpn 119 | self.lateral_convs = nn.ModuleList() 120 | self.fpn_blocks = nn.ModuleList() 121 | for idx in range(len(self.in_channels) - 1, 0, -1): 122 | self.lateral_convs.append( 123 | BaseConv( 124 | hidden_dim, hidden_dim, 1, 1, act=act)) 125 | self.fpn_blocks.append( 126 | RepC3( 127 | hidden_dim * 2, 128 | hidden_dim, 129 | round(3 * depth_mult), 130 | e=1.0)) 131 | 132 | # bottom-up pan 133 | self.downsample_convs = nn.ModuleList() 134 | self.pan_blocks = nn.ModuleList() 135 | for idx in range(len(self.in_channels) - 1): 136 | self.downsample_convs.append( 137 | BaseConv( 138 | hidden_dim, hidden_dim, 3, stride=2, act=act)) 139 | self.pan_blocks.append( 140 | RepC3( 141 | hidden_dim * 2, 142 | hidden_dim, 143 | round(3 * depth_mult), 144 | e=1.0)) 145 | 146 | # self._reset_parameters() 147 | # 148 | # def _reset_parameters(self): 149 | # if self.eval_size: 150 | # for idx in self.use_encoder_idx: 151 | # stride = self.feat_strides[idx] 152 | # pos_embed = self.build_2d_sincos_position_embedding( 153 | # self.eval_size[1] // stride, self.eval_size[0] // stride, 154 | # self.hidden_dim, self.pe_temperature) 155 | # setattr(self, f'pos_embed{idx}', pos_embed) 156 | 157 | @staticmethod 158 | def build_2d_sincos_position_embedding(w, 159 | h, 160 | embed_dim=256, 161 | temperature=10000.): 162 | grid_w = torch.arange(int(w), dtype=torch.float32) 163 | grid_h = torch.arange(int(h), dtype=torch.float32) 164 | grid_w, grid_h = torch.meshgrid(grid_w, grid_h) 165 | assert embed_dim % 4 == 0, \ 166 | 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' 167 | pos_dim = embed_dim // 4 168 | omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim 169 | omega = 1. 
/ (temperature**omega) 170 | 171 | out_w = grid_w.flatten()[..., None] @omega[None] 172 | out_h = grid_h.flatten()[..., None] @omega[None] 173 | 174 | return torch.cat( 175 | [ 176 | torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), 177 | torch.cos(out_h) 178 | ], 179 | dim=1)[None, :, :] 180 | @classmethod 181 | def from_config(cls, cfg): 182 | enc_cfg = cfg.MODEL.ELAEncoder 183 | return { 184 | 'in_channels': enc_cfg.in_channels, 185 | 'feat_strides': enc_cfg.feat_strides, 186 | 'hidden_dim': enc_cfg.hidden_dim, 187 | 'use_encoder_idx': enc_cfg.use_encoder_idx, 188 | 'num_encoder_layers': enc_cfg.num_encoder_layers, 189 | 'encoder_layer': enc_cfg.encoder_layer, 190 | 'pe_temperature': enc_cfg.pe_temperature, 191 | 'expansion': enc_cfg.expansion, 192 | 'depth_mult': enc_cfg.depth_mult, 193 | 'act': enc_cfg.act, 194 | 'eval_size': enc_cfg.eval_size, 195 | 'dim_feedforward': enc_cfg.dim_feedforward 196 | } 197 | 198 | def forward(self, feats, for_mot=False): 199 | assert len(feats) == len(self.in_channels) 200 | # get projection features 201 | proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] 202 | # encoder 203 | if self.num_encoder_layers > 0: 204 | for i, enc_ind in enumerate(self.use_encoder_idx): 205 | h, w = proj_feats[enc_ind].shape[2:] 206 | # flatten [B, C, H, W] to [B, HxW, C] 207 | src_flatten = proj_feats[enc_ind].flatten(start_dim=2).transpose(1, 2) 208 | if self.training or self.eval_size is None: 209 | pos_embed = self.build_2d_sincos_position_embedding( 210 | w, h, self.hidden_dim, self.pe_temperature).to(src_flatten.device) 211 | else: 212 | pos_embed = getattr(self, f'pos_embed{enc_ind}', None) 213 | memory = self.encoder[i](src_flatten, pos_embed=pos_embed) 214 | proj_feats[enc_ind] = memory.transpose(1, 2).reshape((-1, self.hidden_dim, h, w)) 215 | 216 | # top-down fpn 217 | inner_outs = [proj_feats[-1]] 218 | for idx in range(len(self.in_channels) - 1, 0, -1): 219 | feat_heigh = inner_outs[0] 220 | feat_low = proj_feats[idx - 1] 221 | feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( 222 | feat_heigh) 223 | inner_outs[0] = feat_heigh 224 | 225 | upsample_feat = F.interpolate( 226 | feat_heigh, scale_factor=2., mode="nearest") 227 | inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( 228 | torch.cat( 229 | [upsample_feat, feat_low], dim=1)) 230 | inner_outs.insert(0, inner_out) 231 | 232 | # bottom-up pan 233 | outs = [inner_outs[0]] 234 | for idx in range(len(self.in_channels) - 1): 235 | feat_low = outs[-1] 236 | feat_height = inner_outs[idx + 1] 237 | downsample_feat = self.downsample_convs[idx](feat_low) 238 | out = self.pan_blocks[idx](torch.cat( 239 | [downsample_feat, feat_height], dim=1)) 240 | outs.append(out) 241 | 242 | return outs 243 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/head.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from .build_components import DETR_HEAD_REGISTRY 3 | 4 | 5 | __all__ = ['DINOHead'] 6 | @DETR_HEAD_REGISTRY.register() 7 | class DINOHead(nn.Module): 8 | def __init__(self, device="cuda"): 9 | super(DINOHead, self).__init__() 10 | 11 | def forward(self, out_transformer, inputs=None): 12 | (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, 13 | dn_meta) = out_transformer 14 | 15 | return (dec_out_bboxes[-1], dec_out_logits[-1], None) 16 | 17 | @classmethod 18 | def from_config(cls, cfg, *args, **kwargs): 19 | return { 20 | "device": 
cfg.MODEL.DEVICE 21 | } 22 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/infer_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import copy 6 | from typing import Tuple 7 | 8 | import numpy as np 9 | # import open_clip 10 | from detectron2.structures import Boxes, ImageList, Instances 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from detectron2.modeling import detector_postprocess 15 | from detectron2.layers import batched_nms 16 | from detectron2.modeling import build_backbone 17 | from omdet.omdet_v2_turbo.build_components import build_encoder_model, build_decoder_model, build_detr_head 18 | from detectron2.config import configurable 19 | from omdet.modeling.language_backbone import build_language_backbone 20 | from detectron2.utils.logger import setup_logger 21 | from ..modeling.language_backbone.clip.models import clip as clip 22 | from .torch_utils import bbox_cxcywh_to_xyxy 23 | __all__ = ['OmDetV2TurboInfer'] 24 | 25 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 26 | 27 | from ..utils.cache import LRUCache 28 | 29 | from huggingface_hub import PyTorchModelHubMixin 30 | 31 | 32 | @META_ARCH_REGISTRY.register() 33 | class OmDetV2TurboInfer(nn.Module, PyTorchModelHubMixin): 34 | 35 | @configurable 36 | def __init__(self, cfg): 37 | super(OmDetV2TurboInfer, self).__init__() 38 | self.cfg = cfg 39 | self.logger = setup_logger(name=__name__) 40 | 41 | self.backbone = build_backbone(cfg) 42 | self.decoder = build_decoder_model(cfg) 43 | self.neck = build_encoder_model(cfg) 44 | self.device = cfg.MODEL.DEVICE 45 | 46 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) 47 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) 48 | normalizer = lambda x: (x - pixel_mean) / pixel_std 49 | self.normalizer = normalizer 50 | 51 | self.size_divisibility = self.backbone.size_divisibility 52 | self.nms_test_th = 0.0 53 | self.conf_test_th = 0.0 54 | self.loss_type = 'FOCAL' 55 | self.use_language_cache = True 56 | self.language_encoder_type = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 57 | self.num_proposals = cfg.MODEL.ELADecoder.num_queries 58 | 59 | 60 | @classmethod 61 | def from_config(cls, cfg, *args, **kwargs): 62 | return { 63 | 'cfg': cfg 64 | } 65 | 66 | def forward(self, x, label_feats, task_feats, task_mask): 67 | 68 | body_feats = self.backbone(x) 69 | 70 | if type(body_feats) is dict: 71 | body_feats = [body_feats[i] for i in body_feats.keys()] 72 | encoder_feats = self.neck(body_feats) 73 | box_pred, box_cls, _, _, _ = self.decoder(encoder_feats, label_feats, task_feats, task_mask) 74 | 75 | return box_pred, box_cls 76 | -------------------------------------------------------------------------------- /omdet/omdet_v2_turbo/torch_utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.init import uniform_ 9 | 10 | __all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid' 11 | 12 | 13 | def _get_clones(module, n): 14 | return nn.ModuleList([copy.deepcopy(module) for _ in range(n)]) 15 | 16 | 17 | def bias_init_with_prob(prior_prob=0.01): 18 | """initialize 
conv/fc bias value according to a given probability value.""" 19 | return float(-np.log((1 - prior_prob) / prior_prob)) # return bias_init 20 | 21 | 22 | def linear_init_(module): 23 | bound = 1 / math.sqrt(module.weight.shape[0]) 24 | uniform_(module.weight, -bound, bound) 25 | if hasattr(module, 'bias') and module.bias is not None: 26 | uniform_(module.bias, -bound, bound) 27 | 28 | 29 | def inverse_sigmoid(x, eps=1e-5): 30 | x = x.clamp(min=0, max=1) 31 | x1 = x.clamp(min=eps) 32 | x2 = (1 - x).clamp(min=eps) 33 | return torch.log(x1 / x2) 34 | 35 | 36 | def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor, 37 | sampling_locations: torch.Tensor, 38 | attention_weights: torch.Tensor) -> torch.Tensor: 39 | """ 40 | Multi-scale deformable attention. 41 | https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py 42 | """ 43 | 44 | bs, _, num_heads, embed_dims = value.shape 45 | _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for level, (H_, W_) in enumerate(value_spatial_shapes): 50 | # bs, H_*W_, num_heads, embed_dims -> 51 | # bs, H_*W_, num_heads*embed_dims -> 52 | # bs, num_heads*embed_dims, H_*W_ -> 53 | # bs*num_heads, embed_dims, H_, W_ 54 | value_l_ = (value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)) 55 | # bs, num_queries, num_heads, num_points, 2 -> 56 | # bs, num_heads, num_queries, num_points, 2 -> 57 | # bs*num_heads, num_queries, num_points, 2 58 | sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) 59 | # bs*num_heads, embed_dims, num_queries, num_points 60 | sampling_value_l_ = F.grid_sample(value_l_, 61 | sampling_grid_l_, 62 | mode='bilinear', 63 | padding_mode='zeros', 64 | align_corners=False) 65 | sampling_value_list.append(sampling_value_l_) 66 | # (bs, num_queries, num_heads, num_levels, num_points) -> 67 | # (bs, num_heads, num_queries, num_levels, num_points) -> 68 | # (bs, num_heads, 1, num_queries, num_levels*num_points) 69 | attention_weights = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries, 70 | num_levels * num_points) 71 | output = ((torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view( 72 | bs, num_heads * embed_dims, num_queries)) 73 | return output.transpose(1, 2).contiguous() 74 | 75 | 76 | def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7): 77 | """ 78 | Calculate Intersection over Union (IoU) of box1(1, 4) to box2(n, 4). 79 | 80 | Args: 81 | box1 (torch.Tensor): A tensor representing a single bounding box with shape (1, 4). 82 | box2 (torch.Tensor): A tensor representing n bounding boxes with shape (n, 4). 83 | xywh (bool, optional): If True, input boxes are in (x, y, w, h) format. If False, input boxes are in 84 | (x1, y1, x2, y2) format. Defaults to True. 85 | GIoU (bool, optional): If True, calculate Generalized IoU. Defaults to False. 86 | DIoU (bool, optional): If True, calculate Distance IoU. Defaults to False. 87 | CIoU (bool, optional): If True, calculate Complete IoU. Defaults to False. 88 | eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7. 89 | 90 | Returns: 91 | (torch.Tensor): IoU, GIoU, DIoU, or CIoU values depending on the specified flags. 
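    A minimal usage sketch (the boxes and the quoted value are purely illustrative, using the
    default eps and plain IoU):

        >>> import torch
        >>> box1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])  # a 2x2 box in xyxy format
        >>> box2 = torch.tensor([[1.0, 1.0, 3.0, 3.0]])  # a shifted 2x2 box in xyxy format
        >>> iou = bbox_iou(box1, box2, xywh=False)       # intersection 1, union 7
        >>> round(float(iou), 3)
        0.143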
92 | """ 93 | 94 | # Get the coordinates of bounding boxes 95 | if xywh: # transform from xywh to xyxy 96 | (x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1) 97 | w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2 98 | b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_ 99 | b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_ 100 | else: # x1, y1, x2, y2 = box1 101 | b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1) 102 | b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1) 103 | w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps 104 | w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps 105 | 106 | # Intersection area 107 | inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp_(0) * \ 108 | (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp_(0) 109 | 110 | # Union Area 111 | union = w1 * h1 + w2 * h2 - inter + eps 112 | 113 | # IoU 114 | iou = inter / union 115 | if CIoU or DIoU or GIoU: 116 | cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1) # convex (smallest enclosing box) width 117 | ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1) # convex height 118 | if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1 119 | c2 = cw ** 2 + ch ** 2 + eps # convex diagonal squared 120 | rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4 # center dist ** 2 121 | if CIoU: # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47 122 | v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2) 123 | with torch.no_grad(): 124 | alpha = v / (v - iou + (1 + eps)) 125 | return iou - (rho2 / c2 + v * alpha) # CIoU 126 | return iou - rho2 / c2 # DIoU 127 | c_area = cw * ch + eps # convex area 128 | return iou - (c_area - union) / c_area # GIoU https://arxiv.org/pdf/1902.09630.pdf 129 | return iou # IoU 130 | 131 | def cls_score(cls_type, cls_feature, class_proj, logit_scale): 132 | if cls_type == 'cosine': 133 | class_logits = _b_cosine(cls_feature, class_proj, logit_scale) # 4 100 256 4 256 20 134 | elif cls_type == 'dot': 135 | class_logits = torch.bmm(cls_feature, class_proj) # 4 100 20 136 | else: 137 | raise Exception("Unknown cls type {}".format(cls_type)) 138 | return class_logits 139 | 140 | def _norm(f, dim=-1): 141 | return f / f.norm(dim=dim, keepdim=True).clamp_min(1e-12) 142 | 143 | 144 | def _b_cosine(a, b, logit_scale): 145 | """ 146 | a: B x K x H 147 | b: B x H x K 148 | """ 149 | a = _norm(a, dim=2) 150 | b = _norm(b, dim=1) 151 | # Calculating the Loss 152 | logit_scale = logit_scale.exp() 153 | logits_per_image = logit_scale * torch.bmm(a, b) 154 | return logits_per_image 155 | 156 | ########################### 157 | def bbox_cxcywh_to_xyxy(x): 158 | cxcy, wh = torch.split(x, 2, dim=-1) 159 | return torch.cat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], dim=-1) 160 | 161 | def bbox_xyxy2cxcywh(x): 162 | x0, y0, x1, y1 = torch.split(x, 1, dim=-1) 163 | return torch.cat([(x1+x0)/2, (y1+y0)/2, x1-x0, y1-y0], dim=-1) 164 | 165 | class SiLU(nn.Module): 166 | def __init__(self): 167 | super(SiLU, self).__init__() 168 | 169 | def forward(self, x): 170 | return x * torch.sigmoid(x) 171 | 172 | class BaseConv(nn.Module): 173 | def __init__(self, 174 | in_channels, 175 | out_channels, 176 | ksize, 177 | stride, 178 | groups=1, 179 | bias=False, 180 | act="silu"): 181 | super(BaseConv, self).__init__() 182 | self.conv = nn.Conv2d( 183 | in_channels, 184 | out_channels, 185 | kernel_size=ksize, 186 | stride=stride, 187 | padding=(ksize - 1) // 2, 188 | groups=groups, 
189 | bias=bias) 190 | self.bn = nn.BatchNorm2d( 191 | out_channels, 192 | # epsilon=1e-3, # for amp(fp16), set in ppdet/engine/trainer.py 193 | # momentum=0.97, 194 | # weight_attr=ParamAttr(regularizer=L2Decay(0.0)), 195 | # bias_attr=ParamAttr(regularizer=L2Decay(0.0)) 196 | ) 197 | 198 | if act == 'silu': 199 | self.act = SiLU() 200 | elif act == 'gelu': 201 | self.act = nn.GELU() 202 | # self._init_weights() 203 | # 204 | # def _init_weights(self): 205 | # conv_init_(self.conv) 206 | 207 | def forward(self, x): 208 | x = self.bn(self.conv(x)) 209 | if self.training: 210 | y = self.act(x) 211 | else: 212 | if isinstance(self.act, nn.SiLU): 213 | self.act = SiLU() 214 | y = self.act(x) 215 | return y 216 | 217 | import random 218 | import torchvision 219 | 220 | class BatchResize(): 221 | def __init__(self, mode="training"): 222 | self.mode = mode 223 | if mode == "training": 224 | self.size = int(random.choice(np.arange(480, 801, step=32))) 225 | else: 226 | self.size = 640 227 | self.resize = torchvision.transforms.Resize((self.size, self.size)) 228 | 229 | def __call__(self, batch_inputs): 230 | for i, b in enumerate(batch_inputs): 231 | h, w = batch_inputs[i]["image"].shape[1:] 232 | batch_inputs[i]["image"] = self.resize(batch_inputs[i]["image"]) 233 | new_h, new_w = (self.size, self.size) 234 | if self.mode: 235 | batch_inputs[i]["instances"].gt_boxes.tensor *= torch.tensor([new_w/w, new_h/h]).repeat(1, 2) 236 | batch_inputs[i]["instances"]._image_size = (new_h, new_w) 237 | 238 | return batch_inputs 239 | 240 | 241 | def get_contrastive_denoising_training_group(targets, 242 | num_classes, 243 | num_queries, 244 | class_embed, 245 | num_denoising=100, 246 | label_noise_ratio=0.5, 247 | box_noise_scale=1.0): 248 | """ 249 | targets: [targets] that contains labels, bboxes, etc 250 | num_classes: the size of labels 251 | num_queries: 300 252 | class_embed: num_class x batch_size x label_dim OR num_class x batch_size (in the old case) 253 | """ 254 | if num_denoising <= 0: 255 | return None, None, None, None 256 | # number of gt_bboxes in each batch sample 257 | num_gts = [len(t["labels"]) for t in targets] 258 | max_gt_num = max(num_gts) 259 | if max_gt_num == 0: 260 | return None, None, None, None 261 | 262 | num_group = num_denoising // max_gt_num # the number of denoising group given num_denoising 263 | num_group = 1 if num_group == 0 else num_group 264 | # pad gt to max_num of a batch 265 | bs = len(targets) 266 | input_query_class = torch.full((bs, max_gt_num), num_classes, dtype=torch.int32) # batch_size x max_gt_num (initialized with num_class) 267 | input_query_bbox = torch.zeros((bs, max_gt_num, 4)) # batch_size x max_gt_num x 4 268 | pad_gt_mask = torch.zeros((bs, max_gt_num)) 269 | for i in range(bs): 270 | num_gt = num_gts[i] 271 | if num_gt > 0: 272 | input_query_class[i, :num_gt] = targets[i]["labels"].squeeze(-1) 273 | input_query_bbox[i, :num_gt] = targets[i]["boxes"] 274 | pad_gt_mask[i, :num_gt] = 1 275 | # each group has positive and negative queries. 
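    # The padded per-image GT tensors (max_gt_num entries each) are tiled 2 * num_group times:
    # within every block of 2 * max_gt_num queries, the first half serves as the "positive" copy
    # (box noise magnitude in [0, 1) * diff when box_noise_scale > 0) and the second half as the
    # "negative" copy (noise magnitude pushed into [1, 2) * diff via negative_gt_mask).
    # For example, with max_gt_num = 3 and num_denoising = 100, num_group = 100 // 3 = 33,
    # so each image ends up with 3 * 2 * 33 = 198 denoising queries.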
276 | input_query_class = input_query_class.tile([1, 2 * num_group]) # batch_size x (max_gt_num*2*num_group) 277 | input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) 278 | pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) 279 | # positive and negative mask 280 | negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1]) # bs x max_gt_num*2 x 1 281 | negative_gt_mask[:, max_gt_num:] = 1 # set the second half to be NEGATIVE 282 | negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) # bs x max_gt_num*2*num_group x 1 283 | positive_gt_mask = 1 - negative_gt_mask 284 | # contrastive denoising training positive index 285 | positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask 286 | dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1] 287 | dn_positive_idx = torch.split(dn_positive_idx, 288 | [n * num_group for n in num_gts]) # split by batch+soze 289 | # total denoising queries 290 | num_denoising = int(max_gt_num * 2 * num_group) 291 | 292 | if label_noise_ratio > 0: 293 | input_query_class = input_query_class.flatten() # (batch_size*max_gt_num*2*num_group) * 1 294 | pad_gt_mask = pad_gt_mask.flatten() 295 | # half of bbox prob 296 | mask = torch.rand(input_query_class.shape) < (label_noise_ratio * 0.5) 297 | chosen_idx = torch.nonzero(mask * pad_gt_mask).squeeze(-1) 298 | # randomly put a new one here 299 | new_label = torch.randint_like( 300 | chosen_idx, 0, num_classes, dtype=input_query_class.dtype) 301 | input_query_class.scatter_(0, chosen_idx, new_label) 302 | input_query_class = input_query_class.reshape(bs, num_denoising) 303 | pad_gt_mask = pad_gt_mask.reshape(bs, num_denoising) 304 | 305 | if box_noise_scale > 0: 306 | known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox) 307 | 308 | diff = torch.tile(input_query_bbox[..., 2:] * 0.5, 309 | [1, 1, 2]) * box_noise_scale 310 | 311 | rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 312 | rand_part = torch.rand(input_query_bbox.shape) 313 | rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * ( 314 | 1 - negative_gt_mask) 315 | rand_part *= rand_sign 316 | known_bbox += rand_part * diff 317 | known_bbox.clip_(min=0.0, max=1.0) 318 | input_query_bbox = bbox_xyxy2cxcywh(known_bbox) 319 | input_query_bbox = inverse_sigmoid(input_query_bbox) 320 | 321 | fixed_class = class_embed.dim() == 2 322 | if fixed_class: # fixed class embedding. 
num_class * hidden_dim 323 | class_embed = torch.cat( 324 | [class_embed, torch.zeros([1, class_embed.shape[-1]], device=class_embed.device)]) # (num_class+1) * hidden_dim 325 | else: 326 | assert class_embed.dim() == 3 327 | # (num_class+1) x batch_size x hidden_dim 328 | class_embed = torch.cat( 329 | [class_embed, torch.zeros([1, class_embed.shape[-2], class_embed.shape[-1]], device=class_embed.device)]) 330 | 331 | if fixed_class: 332 | input_query_class_index = input_query_class.view(input_query_class.shape[0], -1)\ 333 | .long().flatten().reshape(-1,1).repeat(1, class_embed.shape[-1]) 334 | input_query_class = torch.gather(class_embed.to(input_query_class_index.device), 335 | dim=0, 336 | index=input_query_class_index).reshape([bs, num_denoising, -1]) 337 | else: 338 | temp = [] 339 | input_query_class_index = input_query_class.view(input_query_class.shape[0], -1) \ 340 | .long().flatten().reshape(-1, 1).repeat(1, class_embed.shape[-1]).reshape([bs, num_denoising, -1]) 341 | for b_id in range(bs): 342 | t = torch.gather(class_embed[:, b_id].to(input_query_class_index.device), 343 | dim=0, index=input_query_class_index[b_id]) 344 | temp.append(t) 345 | input_query_class = torch.cat(temp, dim=0).reshape([bs, num_denoising, -1]) 346 | 347 | tgt_size = num_denoising + num_queries 348 | attn_mask = torch.ones([tgt_size, tgt_size]) < 0 349 | # match query cannot see the reconstruction 350 | attn_mask[num_denoising:, :num_denoising] = True 351 | # reconstruct cannot see each other 352 | for i in range(num_group): 353 | if i == 0: 354 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * 355 | 2 * (i + 1):num_denoising] = True 356 | if i == num_group - 1: 357 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * 358 | i * 2] = True 359 | else: 360 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * 361 | 2 * (i + 1):num_denoising] = True 362 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * 363 | 2 * i] = True 364 | attn_mask = ~attn_mask 365 | dn_meta = { 366 | "dn_positive_idx": dn_positive_idx, 367 | "dn_num_group": num_group, 368 | "dn_num_split": [num_denoising, num_queries] 369 | } 370 | 371 | return input_query_class, input_query_bbox, attn_mask, dn_meta 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | -------------------------------------------------------------------------------- /omdet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Linker Tech, Inc. and its affiliates. 
All Rights Reserved 2 | -------------------------------------------------------------------------------- /omdet/utils/analyze_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import Counter 3 | 4 | import numpy as np 5 | from detectron2.checkpoint import DetectionCheckpointer 6 | from detectron2.config import CfgNode, instantiate 7 | from detectron2.data import build_detection_test_loader 8 | from detectron2.modeling import build_model 9 | from detectron2.utils.analysis import FlopCountAnalysis 10 | from fvcore.nn import flop_count_table 11 | 12 | __all__=["do_flop"] 13 | 14 | logger = logging.getLogger("detectron2") 15 | 16 | def do_flop(cfg): 17 | if isinstance(cfg, CfgNode): 18 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TRAIN[0]) 19 | model = build_model(cfg) 20 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 21 | else: 22 | data_loader = instantiate(cfg.dataloader.test) 23 | model = instantiate(cfg.model) 24 | model.to(cfg.train.device) 25 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 26 | model.eval() 27 | 28 | counts = Counter() 29 | total_flops = [] 30 | for idx, data in zip(range(10), data_loader): # noqa 31 | flops = FlopCountAnalysis(model, data) 32 | if idx > 0: 33 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 34 | counts += flops.by_operator() 35 | total_flops.append(flops.total()) 36 | 37 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 38 | logger.info( 39 | "Average GFlops for each type of operators:\n" 40 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 41 | ) 42 | logger.info( 43 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 44 | ) 45 | -------------------------------------------------------------------------------- /omdet/utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Linker Tech, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 
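Boxes are handled in two conventions, corner form (x1, y1, x2, y2) and center form
(cx, cy, w, h); the helpers below convert between them and compute (generalized) IoU.

A minimal round-trip sketch (the coordinates are illustrative only):

    >>> import torch
    >>> b = torch.tensor([[10., 10., 30., 50.]])      # xyxy
    >>> box_xyxy_to_cxcywh(b)
    tensor([[20., 30., 20., 40.]])
    >>> box_cxcywh_to_xyxy(box_xyxy_to_cxcywh(b))     # back to xyxy
    tensor([[10., 10., 30., 50.]])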
4 | """ 5 | import numpy as np 6 | import torch 7 | from torchvision.ops.boxes import box_area 8 | 9 | 10 | def box_cxcywh_to_xyxy(x): 11 | x_c, y_c, w, h = x.unbind(-1) 12 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 13 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 14 | return torch.stack(b, dim=-1) 15 | 16 | 17 | def box_xyxy_to_cxcywh(x): 18 | x0, y0, x1, y1 = x.unbind(-1) 19 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 20 | (x1 - x0), (y1 - y0)] 21 | return torch.stack(b, dim=-1) 22 | 23 | 24 | # modified from torchvision to also return the union 25 | def box_iou(boxes1, boxes2): 26 | area1 = box_area(boxes1) 27 | area2 = box_area(boxes2) 28 | 29 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 30 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 31 | 32 | wh = (rb - lt).clamp(min=0) # [N,M,2] 33 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 34 | 35 | union = area1[:, None] + area2 - inter 36 | 37 | iou = inter / union 38 | return iou, union 39 | 40 | 41 | def generalized_box_iou(boxes1, boxes2): 42 | """ 43 | Generalized IoU from https://giou.stanford.edu/ 44 | 45 | The boxes should be in [x0, y0, x1, y1] format 46 | 47 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 48 | and M = len(boxes2) 49 | """ 50 | # degenerate boxes gives inf / nan results 51 | # so do an early check 52 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 53 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 54 | iou, union = box_iou(boxes1, boxes2) 55 | 56 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 57 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 58 | 59 | wh = (rb - lt).clamp(min=0) # [N,M,2] 60 | area = wh[:, :, 0] * wh[:, :, 1] 61 | 62 | return iou - (area - union) / area 63 | 64 | 65 | def masks_to_boxes(masks): 66 | """Compute the bounding boxes around the provided masks 67 | 68 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 69 | 70 | Returns a [N, 4] tensors, with the boxes in xyxy format 71 | """ 72 | if masks.numel() == 0: 73 | return torch.zeros((0, 4), device=masks.device) 74 | 75 | h, w = masks.shape[-2:] 76 | 77 | y = torch.arange(0, h, dtype=torch.float) 78 | x = torch.arange(0, w, dtype=torch.float) 79 | y, x = torch.meshgrid(y, x) 80 | 81 | x_mask = (masks * x.unsqueeze(0)) 82 | x_max = x_mask.flatten(1).max(-1)[0] 83 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 84 | 85 | y_mask = (masks * y.unsqueeze(0)) 86 | y_max = y_mask.flatten(1).max(-1)[0] 87 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 88 | 89 | return torch.stack([x_min, y_min, x_max, y_max], 1) 90 | 91 | 92 | def xyxy2xywh(x): 93 | """ 94 | Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height) format where (x1, y1) is the 95 | top-left corner and (x2, y2) is the bottom-right corner. 96 | 97 | Args: 98 | x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x1, y1, x2, y2) format. 99 | 100 | Returns: 101 | y (np.ndarray | torch.Tensor): The bounding box coordinates in (x, y, width, height) format. 
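    Example (illustrative sketch; the array values are made up, and the same call also works
    on a torch.Tensor):
        >>> import numpy as np
        >>> boxes = np.array([[10., 10., 30., 50.]])  # (x1, y1, x2, y2)
        >>> xyxy2xywh(boxes)                          # -> (x center, y center, width, height)
        array([[20., 30., 20., 40.]])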
102 | """ 103 | assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}" 104 | y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy 105 | y[..., 0] = (x[..., 0] + x[..., 2]) / 2 # x center 106 | y[..., 1] = (x[..., 1] + x[..., 3]) / 2 # y center 107 | y[..., 2] = x[..., 2] - x[..., 0] # width 108 | y[..., 3] = x[..., 3] - x[..., 1] # height 109 | return y 110 | 111 | 112 | def xywh2xyxy(x): 113 | """ 114 | Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the 115 | top-left corner and (x2, y2) is the bottom-right corner. 116 | 117 | Args: 118 | x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format. 119 | 120 | Returns: 121 | y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format. 122 | """ 123 | assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}" 124 | y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy 125 | dw = x[..., 2] / 2 # half-width 126 | dh = x[..., 3] / 2 # half-height 127 | y[..., 0] = x[..., 0] - dw # top left x 128 | y[..., 1] = x[..., 1] - dh # top left y 129 | y[..., 2] = x[..., 0] + dw # bottom right x 130 | y[..., 3] = x[..., 1] + dh # bottom right y 131 | return y -------------------------------------------------------------------------------- /omdet/utils/cache.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import lmdb 3 | from collections import OrderedDict 4 | 5 | 6 | class LRUCache: 7 | # initialising capacity 8 | def __init__(self, capacity: int): 9 | self.cache = OrderedDict() 10 | self.capacity = capacity 11 | 12 | def has(self, key) -> bool: 13 | return key in self.cache 14 | 15 | # we return the value of the key 16 | # that is queried in O(1) and return -1 if we 17 | # don't find the key in out dict / cache. 18 | # And also move the key to the end 19 | # to show that it was recently used. 20 | def get(self, key): 21 | if key not in self.cache: 22 | return None 23 | else: 24 | self.cache.move_to_end(key) 25 | return self.cache[key] 26 | 27 | # first, we add / update the key by conventional methods. 28 | # And also move the key to the end to show that it was recently used. 
29 | # But here we will also check whether the length of our 30 | # ordered dictionary has exceeded our capacity, 31 | # If so we remove the first key (least recently used) 32 | def put(self, key, value) -> None: 33 | self.cache[key] = value 34 | self.cache.move_to_end(key) 35 | if len(self.cache) > self.capacity: 36 | self.cache.popitem(last=False) 37 | 38 | def pop(self, key, value): 39 | self.cache.pop(key, None) 40 | 41 | 42 | class LmdbReader: 43 | def __init__(self, path): 44 | self.path = path 45 | self.env = self.init_lmdb(path) 46 | 47 | def init_lmdb(self, l_path): 48 | env = lmdb.open( 49 | l_path, readonly=True, 50 | create=False, lock=False) # readahead=not _check_distributed() 51 | txn = env.begin(buffers=True) 52 | return txn 53 | 54 | def read(self, _id): 55 | try: 56 | value = self.env.get(str(_id).encode("utf-8")) 57 | value = pkl.loads(value) 58 | return value 59 | except Exception as e: 60 | print("Error in reading {} from {}".format(_id, self.path)) 61 | raise e 62 | -------------------------------------------------------------------------------- /omdet/utils/plots.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import cv2 5 | import matplotlib 6 | import numpy as np 7 | import torch 8 | from PIL import Image, ImageDraw, ImageFont, ImageOps 9 | import platform 10 | import math 11 | 12 | def is_writeable(dir, test=False): 13 | # Return True if directory has write permissions, test opening a file with write permissions if test=True 14 | if test: # method 1 15 | file = Path(dir) / 'tmp.txt' 16 | try: 17 | with open(file, 'w'): # open file with write permissions 18 | pass 19 | file.unlink() # remove file 20 | return True 21 | except IOError: 22 | return False 23 | else: # method 2 24 | return os.access(dir, os.R_OK) # possible issues on Windows 25 | 26 | def user_config_dir(dir='Ultralytics', env_var='YOLOV5_CONFIG_DIR'): 27 | # Return path of user configuration directory. Prefer environment variable if exists. Make dir if required. 28 | env = os.getenv(env_var) 29 | if env: 30 | path = Path(env) # use environment variable 31 | else: 32 | cfg = {'Windows': 'AppData/Roaming', 'Linux': '.config', 'Darwin': 'Library/Application Support'} # 3 OS dirs 33 | path = Path.home() / cfg.get(platform.system(), '') # OS-specific config dir 34 | path = (path if is_writeable(path) else Path('/tmp')) / dir # GCP and AWS lambda fix, only /tmp is writeable 35 | path.mkdir(exist_ok=True) # make if required 36 | return path 37 | 38 | # Settings 39 | CONFIG_DIR = user_config_dir() # Ultralytics settings dir 40 | RANK = int(os.getenv('RANK', -1)) 41 | matplotlib.rc('font', **{'size': 11}) 42 | matplotlib.use('Agg') # for writing to files only 43 | 44 | def check_font(font='Arial.ttf', size=10): 45 | # Return a PIL TrueType Font, downloading to CONFIG_DIR if necessary 46 | font = Path(font) 47 | font = font if font.exists() else (CONFIG_DIR / font.name) 48 | try: 49 | return ImageFont.truetype(str(font) if font.exists() else font.name, size) 50 | except Exception as e: # download if missing 51 | url = "https://ultralytics.com/assets/" + font.name 52 | print(f'Downloading {url} to {font}...') 53 | torch.hub.download_url_to_file(url, str(font), progress=False) 54 | return ImageFont.truetype(str(font), size) 55 | 56 | def is_ascii(s=''): 57 | # Is string composed of all ASCII (no UTF) characters? 58 | s = str(s) # convert list, tuple, None, etc. 
to str
59 |     return len(s.encode().decode('ascii', 'ignore')) == len(s)
60 | 
61 | 
62 | class Annotator:
63 |     # if RANK in (-1, 0):
64 |     #     check_font()  # download TTF if necessary
65 | 
66 |     # YOLOv5 Annotator for train/val mosaics and jpgs and detect/hub inference annotations
67 |     def __init__(self, im, line_width=None, font_size=None, font='Arial.ttf', pil=True):
68 |         assert im.data.contiguous, 'Image not contiguous. Apply np.ascontiguousarray(im) to Annotator() input images.'
69 |         self.pil = pil
70 |         self.offset = 0
71 |         self.lw = line_width or max(round(sum(im.shape) / 2 * 0.003), 2)  # line width, used by both PIL and cv2 branches
72 |         if self.pil:  # use PIL
73 |             self.im = im if isinstance(im, Image.Image) else Image.fromarray(im)
74 |             self.im = ImageOps.expand(self.im, border=self.offset, fill=(255, 255, 255))
75 |             self.draw = ImageDraw.Draw(self.im)
76 |             self.font = check_font(font, size=font_size or max(round(sum(self.im.size) / 2 * 0.035), 12))
77 |             self.fh = 5  # font height
78 |         else:  # use cv2
79 |             self.im = im
80 | 
81 |     def _offset_box(self, box):
82 |         return (np.array(box) + self.offset).tolist()
83 | 
84 |     def draw_arrow(self, ptA, ptB, width=1, color=(0, 255, 0)):
85 |         """Draw line from ptA to ptB with arrowhead at ptB"""
86 |         # Get drawing context
87 |         # Draw the line without arrows
88 |         self.draw.line((ptA, ptB), width=width, fill=color)
89 | 
90 |         # Now work out the arrowhead
91 |         # = it will be a triangle with one vertex at ptB
92 |         # - it will start at 95% of the length of the line
93 |         # - it will extend 8 pixels either side of the line
94 |         x0, y0 = ptA
95 |         x1, y1 = ptB
96 |         # Now we can work out the x,y coordinates of the bottom of the arrowhead triangle
97 |         xb = 0.95 * (x1 - x0) + x0
98 |         yb = 0.95 * (y1 - y0) + y0
99 | 
100 |         # Work out the other two vertices of the triangle
101 |         # Check if line is vertical
102 |         if x0 == x1:
103 |             vtx0 = (xb - 5, yb)
104 |             vtx1 = (xb + 5, yb)
105 |         # Check if line is horizontal
106 |         elif y0 == y1:
107 |             vtx0 = (xb, yb + 5)
108 |             vtx1 = (xb, yb - 5)
109 |         else:
110 |             alpha = math.atan2(y1 - y0, x1 - x0) - 90 * math.pi / 180
111 |             a = 8 * math.cos(alpha)
112 |             b = 8 * math.sin(alpha)
113 |             vtx0 = (xb + a, yb + b)
114 |             vtx1 = (xb - a, yb - b)
115 | 
116 |         # draw.point((xb,yb), fill=(255,0,0))  # DEBUG: draw point of base in red - comment out draw.polygon() below if using this line
117 |         # im.save('DEBUG-base.png')  # DEBUG: save
118 | 
119 |         # Now draw the arrowhead triangle
120 |         self.draw.polygon([vtx0, vtx1, ptB], fill=color)
121 | 
122 |     def box_label(self, box, label='', sub_label='', color=(128, 128, 128), txt_color=(255, 255, 255)):
123 |         # Add one xyxy box to image with label
124 |         box = self._offset_box(box)
125 |         if self.pil or not is_ascii(label):
126 |             self.draw.rectangle(box, width=self.lw, outline=color)  # box
127 |             if label:
128 |                 w, h = 2, 2  # text width
129 |                 self.draw.rectangle([box[0], box[1] - self.fh, box[0] + w + 1, box[1] + 1], fill=color)
130 |                 # self.draw.text((box[0], box[1]), label, fill=txt_color, font=self.font, anchor='ls')  # for PIL>8.0
131 |                 self.draw.text((box[0], box[1] - h), label + '\n' + sub_label, fill=txt_color, font=self.font)
132 |         else:  # cv2
133 |             c1, c2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3]))
134 |             cv2.rectangle(self.im, c1, c2, color, thickness=self.lw, lineType=cv2.LINE_AA)
135 |             if label:
136 |                 tf = max(self.lw - 1, 1)  # font thickness
137 |                 w, h = cv2.getTextSize(label, 0, fontScale=self.lw / 3, thickness=tf)[0]
138 |                 c2 = c1[0] + w, c1[1] - h - 3
139 |                 cv2.rectangle(self.im, c1, c2, color, -1, cv2.LINE_AA)  # filled
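                # NOTE: this cv2 branch is only reached for ASCII labels (see the is_ascii check above); non-ASCII text is rendered through the PIL branch with a TrueType font.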
140 |                 cv2.putText(self.im, label, (c1[0], c1[1] - 2), 0, self.lw / 3, txt_color,
141 |                             thickness=tf,
142 |                             lineType=cv2.LINE_AA)
143 | 
144 |     def tuple_label(self, src_box, dest_box, label='', src_color='red', dest_color='blue', txt_color=(255, 255, 255)):
145 |         # Add a pair of xyxy boxes with a connecting arrow and label
146 |         src_box = self._offset_box(src_box)
147 |         dest_box = self._offset_box(dest_box)
148 | 
149 |         if self.pil or not is_ascii(label):
150 |             self.draw.rectangle(src_box, width=self.lw, outline=src_color)  # box
151 |             self.draw.rectangle(dest_box, width=self.lw, outline=dest_color)  # box
152 |             src_c = (int((src_box[2] + src_box[0]) / 2), int((src_box[3] + src_box[1]) / 2))
153 |             dest_c = (int((dest_box[2] + dest_box[0]) / 2), int((dest_box[3] + dest_box[1]) / 2))
154 |             c_c = [(src_c[0] + dest_c[0]) / 2, (src_c[1] + dest_c[1]) / 2]
155 |             # self.draw.line(xy=[src_c, dest_c], fill='green')
156 |             self.draw_arrow(src_c, dest_c, color='green', width=2)
157 | 
158 |             if label:
159 |                 w, h = self.font.getsize(label)  # text width, height
160 |                 self.draw.rectangle([c_c[0], c_c[1] - self.fh, c_c[0] + w + 1, c_c[1] + 1], fill='green')
161 |                 self.draw.text((c_c[0], c_c[1] - h), label, fill=txt_color, font=self.font)
162 | 
163 |         else:  # cv2
164 |             raise Exception("CV2 is not supported yet")
165 | 
166 |     def rectangle(self, xy, fill=None, outline=None, width=1):
167 |         # Add rectangle to image (PIL-only)
168 |         self.draw.rectangle(xy, fill, outline, width)
169 | 
170 |     def text(self, xy, text, txt_color=(255, 255, 255)):
171 |         # Add text to image (PIL-only)
172 |         w, h = self.font.getsize(text)  # text width, height
173 |         self.draw.text((xy[0], xy[1] - h + 1), text, fill=txt_color, font=self.font)
174 | 
175 |     def result(self):
176 |         # Return annotated image as array
177 |         return np.asarray(self.im)
178 | 
179 | 
180 | 
181 | 
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | 
--------------------------------------------------------------------------------
/omdet/utils/registry.py:
--------------------------------------------------------------------------------
1 | def _register_generic(module_dict, module_name, module):
2 |     assert module_name not in module_dict
3 |     module_dict[module_name] = module
4 | 
5 | 
6 | class Registry(dict):
7 |     '''
8 |     A helper class for managing module registration; it extends a dictionary
9 |     and provides register functions.
10 |     Eg. creating a registry:
11 |         some_registry = Registry({"default": default_module})
12 |     There are two ways of registering new modules:
13 |     1): the normal way is just calling the register function:
14 |         def foo():
15 |             ...
16 |         some_registry.register("foo_module", foo)
17 |     2): used as a decorator when declaring the module:
18 |         @some_registry.register("foo_module")
19 |         @some_registry.register("foo_module_nickname")
20 |         def foo():
21 |             ...
22 | Access of module is just like using a dictionary, eg: 23 | f = some_registry["foo_modeul"] 24 | ''' 25 | def __init__(self, *args, **kwargs): 26 | super(Registry, self).__init__(*args, **kwargs) 27 | 28 | def register(self, module_name, module=None): 29 | # used as function call 30 | if module is not None: 31 | _register_generic(self, module_name, module) 32 | return 33 | 34 | # used as decorator 35 | def register_fn(fn): 36 | _register_generic(self, module_name, fn) 37 | return fn 38 | 39 | return register_fn -------------------------------------------------------------------------------- /omdet/utils/tools.py: -------------------------------------------------------------------------------- 1 | import io 2 | import base64 3 | import re 4 | from PIL import ImageDraw, Image 5 | import lmdb 6 | from detectron2.data import transforms as T 7 | import logging 8 | from tqdm import tqdm 9 | import os 10 | from detectron2.data import detection_utils as utils 11 | import pickle 12 | import numpy as np 13 | from detectron2.config import CfgNode 14 | from typing import Generator, Sequence 15 | from joblib import Parallel, delayed 16 | import torch 17 | import random 18 | 19 | def make_continuous_categories(cats, verbose=True): 20 | # return a continuous categord_id from 1 to num_classes 21 | diff_cnt = 0 22 | for c_id, c in enumerate(cats): 23 | if c['id'] != c_id+1: 24 | diff_cnt += 1 25 | c['id'] = c_id + 1 26 | 27 | if verbose: 28 | print("Changed {} category_id among {} cats".format(diff_cnt, len(cats))) 29 | 30 | return cats 31 | 32 | def is_overlap(a, b): 33 | if b[1] - b[0] == 0 or a[1] - a[0] == 0: 34 | return False 35 | 36 | return a[0] <= b[0] < a[1] or b[0] <= a[0] < b[1] 37 | 38 | 39 | def get_span_embedding(model, tokenizer, sent, spans, layers, device): 40 | assert len(sent) == len(spans) 41 | encoded = tokenizer.batch_encode_plus(sent, return_tensors="pt", padding=True) 42 | encoded = encoded.to(device) 43 | # token_ids_word = np.where(np.array(encoded.word_ids()) == idx) 44 | with torch.no_grad(): 45 | output = model(**encoded) 46 | 47 | # Get all hidden states 48 | states = output.hidden_states 49 | # Stack and sum all requested layers 50 | output = torch.stack([states[i] for i in layers]).sum(0).squeeze() 51 | 52 | # Only select the tokens that constitute the requested word 53 | results = [] 54 | for b_id, b_span in enumerate(spans): 55 | offsets = encoded.encodings[b_id].offsets 56 | feats = [] 57 | valid_offsets = [] 58 | for t_id, t_span in enumerate(offsets): 59 | valid = False 60 | for s in b_span: 61 | if is_overlap(t_span, s): 62 | valid = True 63 | break 64 | if valid: 65 | feats.append(output[b_id, t_id].view(1, -1)) 66 | valid_offsets.append(t_span) 67 | 68 | if len(feats) == 0: 69 | raise Exception(f"Sentence '{sent[b_id]}' ({len(sent[b_id])}) cannot find valid span for {b_span}.") 70 | 71 | res = torch.mean(torch.stack(feats, dim=0), dim=0).cpu().tolist() 72 | results.append(res[0]) 73 | return results 74 | 75 | 76 | def get_txt_embedding(model, sent): 77 | txt_embedding = model._text_encode(sent) 78 | return txt_embedding 79 | 80 | 81 | def clean_t(x, max_len, rm_sym=True, must_idx=None, return_offset=False): 82 | """ 83 | rm_sym: remove symbol _ 84 | """ 85 | s_id = 0 86 | x = x.lower() 87 | if rm_sym: 88 | x = x.replace('_', ' ').replace('-', ' ') 89 | x = ' '.join(x.split()) # remove duplicate space 90 | 91 | if must_idx is not None: 92 | min_id, max_id = must_idx 93 | if max_id >= max_len: 94 | s_id = max(0, min(min_id, int(max_id - (max_len / 2)))) 95 | e_id = 
min(len(x), int(max_id + (max_len / 2))) 96 | # print(f"Special cut ({must_idx}): from {s_id} to {e_id} for sent of len {len(x)}") 97 | x = x[s_id:e_id] 98 | else: 99 | x = x[0:max_len] 100 | if return_offset: 101 | return x, s_id 102 | else: 103 | return x 104 | 105 | def sample_true(prob): 106 | if prob <= 0: 107 | return False 108 | generated_neg_prob = random.random() 109 | valid = generated_neg_prob < prob 110 | return valid 111 | 112 | def rm_duplicates(input_list, keep_order=False): 113 | if not keep_order: 114 | return list(set(input_list)) 115 | 116 | # Create an empty set to store the items that have been seen 117 | seen = set() 118 | 119 | # Create an empty list to store the result 120 | result = [] 121 | 122 | # Iterate over the input list 123 | for item in input_list: 124 | # If the item is not already in the seen set, add it to the result list 125 | if item not in seen: 126 | result.append(item) 127 | 128 | # Add the item to the seen set 129 | seen.add(item) 130 | 131 | # Return the result list 132 | return result 133 | 134 | 135 | def chunks(l: Sequence, n: int = 5) -> Generator[Sequence, None, None]: 136 | """Yield successive n-sized chunks from l.""" 137 | for i in range(0, len(l), n): 138 | yield l[i:i + n] 139 | 140 | 141 | def encode_dump_text(model, feat_path, text_vocab, batch_size): 142 | text_keys = [] 143 | for block in tqdm(chunks(text_vocab, n=batch_size)): 144 | block_feats = [] 145 | block_keys = [] 146 | for batch in chunks(block, n=500): 147 | batch_fs = get_txt_embedding(model, batch) 148 | batch_keys = batch 149 | block_feats.extend(batch_fs) 150 | block_keys.extend(batch_keys) 151 | 152 | text_keys.extend(block_keys) 153 | write_lmdb_from_id_data_pairs( 154 | id_data_pairs=[(key, embed) for key, embed in zip(block_keys, block_feats)], 155 | lmdb_save_dir=feat_path 156 | ) 157 | return text_keys 158 | 159 | 160 | def cropbox(xmin, ymin, xmax, ymax, img_size, ratio=1.5, make_square=False): 161 | if xmin < 0 or ymin < 0 or xmax < 0 or ymax < 0: 162 | raise Exception 163 | w, h = img_size 164 | if xmin > w or ymin > h or xmax > w or ymax > h: 165 | raise Exception 166 | 167 | xc = xmin + (xmax - xmin) / 2 168 | yc = ymin + (ymax - ymin) / 2 169 | w = xmax - xmin 170 | h = ymax - ymin 171 | nw = w * ratio 172 | nh = h * ratio 173 | 174 | if make_square: 175 | if nw > nh: 176 | nh = nw 177 | else: 178 | nw = nh 179 | 180 | nxmin = max(xc - (nw / 2), 0) 181 | nymin = max(yc - (nh / 2), 0) 182 | 183 | nxmax = min(xc + (nw / 2), img_size[0]) 184 | nymax = min(yc + (nh / 2), img_size[1]) 185 | 186 | return nxmin, nymin, nxmax, nymax 187 | 188 | 189 | def image_to_base64(img): 190 | output_buffer = io.BytesIO() 191 | img.save(output_buffer, format='JPEG') 192 | byte_data = output_buffer.getvalue() 193 | base64_str = base64.b64encode(byte_data) 194 | return base64_str 195 | 196 | 197 | def base64_to_image(base64_str): 198 | return Image.open(io.BytesIO(base64.b64decode(base64_str))) 199 | 200 | 201 | def draw_bounding_box_on_image(image, xmin, ymin, xmax, ymax, 202 | color='red', 203 | text='', 204 | thickness=4): 205 | draw = ImageDraw.Draw(image) 206 | draw.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=thickness) 207 | draw.text((xmin, ymin), text) 208 | return image 209 | 210 | 211 | def build_transform_gen(cfg, is_train): 212 | """ 213 | Create a list of :class:`TransformGen` from config. 
214 | Returns: 215 | list[TransformGen] 216 | """ 217 | if is_train: 218 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 219 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 220 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 221 | else: 222 | min_size = cfg.INPUT.MIN_SIZE_TEST 223 | max_size = cfg.INPUT.MAX_SIZE_TEST 224 | sample_style = "choice" 225 | if sample_style == "range": 226 | assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size)) 227 | 228 | tfm_gens = [] 229 | if is_train: 230 | tfm_gens.append(T.RandomFlip()) 231 | tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 232 | # tfm_gens.append(T.Resize(min_size)) 233 | if is_train: 234 | logger = logging.getLogger(__name__) 235 | logger.info("TransformGens used in training: " + str(tfm_gens)) 236 | return tfm_gens 237 | 238 | 239 | def jp(a, b): 240 | return os.path.join(a, b) 241 | 242 | 243 | def check_img(i, img_root): 244 | # i['file_name'] = i['file_name'].split('/')[-1] 245 | try: 246 | iimage = utils.read_image(jp(img_root, i["file_name"]), format='RGB') 247 | utils.check_image_size(i, iimage) 248 | 249 | except Exception as e: 250 | print("BAD D2 IMG", i) 251 | if 'image_id' in i: 252 | return i['image_id'] 253 | else: 254 | return i['id'] 255 | 256 | return None 257 | 258 | 259 | def fix_img_size(i, img_root): 260 | try: 261 | if not "file_name" in i: 262 | i["file_name"] = i["coco_url"].split("/")[-1] 263 | img = Image.open(jp(img_root, i['file_name'])) 264 | w, h = img.size 265 | if i['width'] != w or i['height'] != h: 266 | print("Found image {} with wrong size.\n".format(i['id'])) 267 | i['width'] = w 268 | i['height'] = h 269 | 270 | return i 271 | except Exception as e: 272 | print("BAD IMG", i, e) 273 | return None 274 | 275 | 276 | def fix_data(img_root, data): 277 | if type(data) is dict: 278 | num_imgs = len(data['images']) 279 | data['images'] = Parallel(n_jobs=15, backend='threading')( 280 | delayed(fix_img_size)(i, img_root) for i in tqdm(data['images'])) 281 | data['images'] = [i for i in data['images'] if i is not None] 282 | print("First stage image fixing go from {} to {}".format(num_imgs, len(data['images']))) 283 | 284 | bad_ids = Parallel(n_jobs=15, backend='threading')(delayed(check_img)(i, img_root) for i in tqdm(data['images'])) 285 | bad_ids = [x for x in set(bad_ids) if x is not None] 286 | print("Found {} bad images with D2 checking".format(len(bad_ids))) 287 | data['images'] = [d for d in data['images'] if d['id'] not in bad_ids] 288 | print("Images go from {} to {}".format(num_imgs, len(data['images']))) 289 | 290 | prev_anno_size = len(data['annotations']) 291 | valid_imgs = {i['id'] for i in data['images']} 292 | data['annotations'] = [d for d in data['annotations'] if d['image_id'] in valid_imgs] 293 | print("Anno go from {} to {} after fixing.".format(prev_anno_size, len(data['annotations']))) 294 | else: 295 | num_imgs = len(data) 296 | data = Parallel(n_jobs=15, backend='threading')(delayed(fix_img_size)(i, img_root) for i in tqdm(data)) 297 | data = [i for i in data if i is not None] 298 | print("First stage image fixing go from {} to {}".format(num_imgs, len(data))) 299 | 300 | bad_ids = Parallel(n_jobs=15, backend='threading')(delayed(check_img)(i, img_root) for i in tqdm(data)) 301 | bad_ids = [x for x in set(bad_ids) if x is not None] 302 | print("Found {} bad images with D2 checking".format(len(bad_ids))) 303 | data = [d for d in data if d['id'] not in bad_ids] 304 | print("Images go from {} to {}".format(num_imgs, len(data))) 305 | 
return data 306 | 307 | 308 | def convert_cfg_to_dict(cfg_node, key_list): 309 | if not isinstance(cfg_node, CfgNode): 310 | return cfg_node 311 | else: 312 | cfg_dict = dict(cfg_node) 313 | for k, v in cfg_dict.items(): 314 | cfg_dict[k] = convert_cfg_to_dict(v, key_list + [k]) 315 | return cfg_dict 316 | 317 | 318 | def flatten_json(json_file): 319 | out = {} 320 | 321 | def flatten(x, name=''): 322 | if type(x) is dict: 323 | for a in x: 324 | flatten(x[a], name + a + '.') 325 | elif type(x) is list: 326 | i = 0 327 | for a in x: 328 | flatten(a, name + str(i) + '.') 329 | i += 1 330 | else: 331 | out[name[:-1]] = x 332 | 333 | flatten(json_file) 334 | return out 335 | 336 | 337 | def convert_to_float(value): 338 | if isinstance(value, float): 339 | return value 340 | try: # try pytorch 341 | return value.item() 342 | except: 343 | try: # try numpy 344 | print(value.dtype) 345 | return np.asscalar(value) 346 | except: 347 | raise ValueError('do not know how to convert this number {} to float'.format(value)) 348 | 349 | 350 | def remove_punctuation(text: str) -> str: 351 | punct = ['|', ':', ';', '@', '(', ')', '[', ']', '{', '}', '^', 352 | '\'', '\"', '’', '`', '?', '$', '%', '#', '!', '&', '*', '+', ',', '.' 353 | ] 354 | for p in punct: 355 | text = text.replace(p, '') 356 | return text.strip() -------------------------------------------------------------------------------- /outputs/000000574769.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/outputs/000000574769.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | timm==0.9.16 2 | transformers==4.21.0 3 | lmdb==1.4.1 4 | Pillow==8.4.0 5 | ftfy==6.2.0 6 | joblib==1.3.2 7 | opencv-python==4.7.0.72 8 | pydantic 9 | fastapi 10 | uvicorn -------------------------------------------------------------------------------- /run_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from omdet.inference.det_engine import DetEngine 4 | from omdet.utils.plots import Annotator 5 | from PIL import Image 6 | import numpy as np 7 | 8 | if __name__ == "__main__": 9 | engine = DetEngine(batch_size=1, device='cuda') 10 | img_paths = ['./sample_data/000000574769.jpg'] # path of images 11 | labels = ["person", "cat", "orange"] # labels to be predicted 12 | prompt = 'Detect {}.'.format(','.join(labels)) # prompt of detection task, use "Detect {}." 
as default 13 | 14 | res = engine.inf_predict('OmDet-Turbo_tiny_SWIN_T', # prefix name of the pretrained checkpoints 15 | task=prompt, 16 | data=img_paths, 17 | labels=labels, 18 | src_type='local', # type of the image_paths, "local"/"url" 19 | conf_threshold=0.30, 20 | nms_threshold=0.5 21 | ) 22 | print(res) 23 | 24 | out_folder = './outputs' 25 | for idx, img_path in enumerate(img_paths): 26 | im = Image.open(img_path) 27 | a = Annotator(np.ascontiguousarray(im), font_size=12, line_width=1, pil=True, font='sample_data/simsun.ttc') 28 | for R in res[idx]: 29 | a.box_label([R['xmin'], R['ymin'], R['xmax'], R['ymax']], 30 | label=f"{R['label']} {str(int(R['conf'] * 100))}%", 31 | color='red') 32 | 33 | if not os.path.exists(out_folder): 34 | os.mkdir(out_folder) 35 | 36 | image = a.result() 37 | img = Image.fromarray(image) 38 | img.save('outputs/'+img_path.split('/')[-1]) -------------------------------------------------------------------------------- /run_wsgi.py: -------------------------------------------------------------------------------- 1 | import time 2 | import uvicorn 3 | from fastapi import FastAPI 4 | from omdet.inference.det_engine import DetEngine 5 | from pydantic import BaseModel 6 | from typing import List, Dict, Union 7 | 8 | 9 | class InfDetectBody(BaseModel): 10 | model_id: str 11 | data: List[str] 12 | src_type: str = "url" 13 | task: str 14 | labels: List[str] 15 | threshold: float = 0.1 16 | nms_threshold: float = 0.5 17 | 18 | 19 | class Object(BaseModel): 20 | xmin: float 21 | ymin: float 22 | xmax: float 23 | ymax: float 24 | conf: float 25 | label: str 26 | 27 | 28 | class DetectionRes(BaseModel): 29 | took: int 30 | objects: List[List[Object]] = [] 31 | 32 | 33 | app = FastAPI() 34 | 35 | 36 | @app.on_event("startup") 37 | async def startup_event(): 38 | app.state.detector = DetEngine(model_dir="resources/", device="cuda", batch_size=10) 39 | 40 | 41 | @app.post( 42 | "/inf_predict", 43 | response_model=DetectionRes, 44 | name="Detect objects with Inf Possibilities", 45 | ) 46 | async def detect_urls( 47 | body: InfDetectBody = None, 48 | ) -> DetectionRes: 49 | s_time = time.time() 50 | out = app.state.detector.inf_predict( 51 | body.model_id, 52 | task=body.task, 53 | labels=body.labels, 54 | data=body.data, 55 | src_type=body.src_type, 56 | conf_threshold=body.threshold, 57 | nms_threshold=body.nms_threshold, 58 | ) 59 | 60 | resp = DetectionRes(took=int((time.time() - s_time) * 1000), objects=out) 61 | return resp 62 | 63 | 64 | if __name__ == "__main__": 65 | uvicorn.run("run_wsgi:app", host="0.0.0.0", port=8000) 66 | -------------------------------------------------------------------------------- /sample_data/000000574769.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/sample_data/000000574769.jpg -------------------------------------------------------------------------------- /sample_data/simsun.ttc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/sample_data/simsun.ttc --------------------------------------------------------------------------------
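
For reference, a minimal client sketch for the /inf_predict endpoint exposed by run_wsgi.py. It assumes the service has been started with `python run_wsgi.py` (listening on port 8000 with the OmDet-Turbo_tiny_SWIN_T checkpoint available under resources/), that the `requests` package is installed (it is not listed in requirements.txt), and that the image URL below is a reachable placeholder; adjust these to your setup.

import requests

# Request body mirrors InfDetectBody in run_wsgi.py; the prompt follows the
# "Detect {}." convention used in run_demo.py.
payload = {
    "model_id": "OmDet-Turbo_tiny_SWIN_T",
    "data": ["https://example.com/000000574769.jpg"],  # placeholder image URL
    "src_type": "url",
    "task": "Detect person,cat,orange.",
    "labels": ["person", "cat", "orange"],
    "threshold": 0.3,
    "nms_threshold": 0.5,
}

resp = requests.post("http://localhost:8000/inf_predict", json=payload, timeout=300)
resp.raise_for_status()
res = resp.json()  # DetectionRes: {"took": <ms>, "objects": [[{xmin, ymin, xmax, ymax, conf, label}, ...]]}
print(f"took {res['took']} ms")
for obj in res["objects"][0]:
    print(obj["label"], round(obj["conf"], 3), obj["xmin"], obj["ymin"], obj["xmax"], obj["ymax"])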