├── .gitignore
├── LICENSE
├── README.md
├── add_hf.py
├── configs
│   └── OmDet-Turbo_tiny_SWIN_T.yaml
├── docs
│   ├── cvt_grounding_dino-en.md
│   ├── cvt_grounding_dino-zh.md
│   ├── main_results.png
│   ├── speed_compare.jpeg
│   └── turbo_model.jpeg
├── export.py
├── install.md
├── omdet
│   ├── __init__.py
│   ├── inference
│   │   ├── __init__.py
│   │   ├── base_engine.py
│   │   └── det_engine.py
│   ├── modeling
│   │   ├── __init__.py
│   │   ├── backbone
│   │   │   ├── __init__.py
│   │   │   ├── bifpn.py
│   │   │   ├── config.py
│   │   │   ├── convnext.py
│   │   │   ├── dlafpn.py
│   │   │   └── swint.py
│   │   ├── common.py
│   │   ├── language_backbone
│   │   │   ├── __init__.py
│   │   │   ├── backbone.py
│   │   │   ├── clip
│   │   │   │   ├── __init__.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── bpe_simple_vocab_16e6.txt.gz
│   │   │   │   │   ├── clip.py
│   │   │   │   │   └── model.py
│   │   │   │   └── simple_tokenizer.py
│   │   │   └── word_utils.py
│   │   └── registry.py
│   ├── omdet_v2_turbo
│   │   ├── __init__.py
│   │   ├── block.py
│   │   ├── build_components.py
│   │   ├── config.py
│   │   ├── conv.py
│   │   ├── detector.py
│   │   ├── detr_torch.py
│   │   ├── dn_ops.py
│   │   ├── ela_decoder.py
│   │   ├── ela_encoder.py
│   │   ├── head.py
│   │   ├── infer_model.py
│   │   └── torch_utils.py
│   └── utils
│       ├── __init__.py
│       ├── analyze_model.py
│       ├── box_ops.py
│       ├── cache.py
│       ├── plots.py
│       ├── registry.py
│       └── tools.py
├── outputs
│   └── 000000574769.jpg
├── requirements.txt
├── run_demo.py
├── run_wsgi.py
└── sample_data
    ├── 000000574769.jpg
    └── simsun.ttc
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OmDet-Turbo
2 |
3 |
4 | [Paper 📄] [Model 🗂️]
5 |
6 |
7 | Fast and accurate open-vocabulary end-to-end object detection
8 |
9 |
10 | ***
11 | ## 🗓️ Updates
12 | * 09/26/2024: OmDet-Turbo has been integrated into Transformers version 4.45.0. The code is available [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models/omdet_turbo), and the Hugging Face model is available [here](https://huggingface.co/omlab/omdet-turbo-swin-tiny-hf).
13 | * 07/05/2024: Our new open-source project, [OmAgent: a multimodal agent framework for solving complex tasks](https://github.com/om-ai-lab/OmAgent), is available! OmDet has been seamlessly integrated into it as an OVD tool. Feel free to explore our multimodal agent framework.
14 | * 06/24/2024: Guidance for [converting OmDet-Turbo to ONNX](https://github.com/om-ai-lab/OmDet#:~:text=How%20To%20Export%20ONNX%20Model) added.
15 | * 03/25/2024: Inference code and a pretrained OmDet-Turbo-Tiny model released.
16 | * 03/12/2024: GitHub open-source project created.
17 |
18 | ***
19 | ## 🔗 Related Works
20 | If you are interested in our research, we welcome you to explore our other wonderful projects.
21 |
22 | 🔆 [How to Evaluate the Generalization of Detection? A Benchmark for Comprehensive Open-Vocabulary Detection](https://arxiv.org/abs/2308.13177) (AAAI 2024) 🏠 [GitHub Repository](https://github.com/om-ai-lab/OVDEval/tree/main)
23 | 
24 | 🔆 [OmDet: Large-scale vision-language multi-dataset pre-training with multimodal detection network](https://ietresearch.onlinelibrary.wiley.com/doi/full/10.1049/cvi2.12268) (IET Computer Vision)
25 |
26 | ***
27 | ## 📖 Introduction
28 | This repository is the official PyTorch implementation for **OmDet-Turbo**, a fast transformer-based open-vocabulary object detection model.
29 |
30 | **⭐️Highlights**
31 | 1. **OmDet-Turbo** is a transformer-based real-time open-vocabulary
32 | detector that combines strong OVD capabilities with fast inference speed.
33 | This model addresses the challenges of efficient detection in open-vocabulary
34 | scenarios while maintaining high detection performance.
35 | 2. We introduce the **Efficient Fusion Head**, a swift multimodal fusion module
36 |    designed to alleviate the computational burden on the encoder and reduce
37 |    the time cost of the ROI-based head.
38 | 3. The OmDet-Turbo-Base model achieves state-of-the-art zero-shot performance on the ODinW and OVDEval benchmarks, with AP scores
39 |    of **30.1** and **26.86**, respectively.
40 | 4. The inference speed of OmDet-Turbo-Base on the COCO val2017 dataset reaches **100.2** FPS on an A100 GPU.
41 |
42 | For more details, check out our paper **[Real-time Transformer-based Open-Vocabulary Detection with Efficient Fusion Head](https://arxiv.org/abs/2403.06892)**
43 |
44 |
45 |
46 | ***
47 | ## ⚡️ Inference Speed
48 | Comparison of inference speeds for each component in tiny-size model.
49 |
50 |
51 | ***
52 | ## 🛠️ How To Install
53 | Follow the [Installation Instructions](install.md) to set up the environment for OmDet-Turbo.
54 |
55 | ***
56 | ## 🚀 How To Run
57 | ### Local Inference
58 | 1. Download our pretrained model and the [CLIP](https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt?download=true) checkpoints.
59 | 2. Create a folder named **resources** and put the downloaded models into it.
60 | 3. Run **run_demo.py**; the images with predicted results will be saved in the **./outputs** folder. A minimal inference sketch is shown below.
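For reference, here is a minimal sketch of programmatic inference with `DetEngine` (mirroring what **run_demo.py** does; the argument values below are illustrative):

```python
from omdet.inference.det_engine import DetEngine

# Minimal sketch, assuming the pretrained model and CLIP checkpoint are in ./resources
engine = DetEngine(model_dir='resources/', batch_size=1, device='cuda')

img_paths = ['./sample_data/000000574769.jpg']       # images to run on
labels = ["person", "cat", "orange"]                 # open-vocabulary labels
prompt = 'Detect {}.'.format(','.join(labels))       # detection task prompt

res = engine.inf_predict('OmDet-Turbo_tiny_SWIN_T',  # model id = config/checkpoint name
                         task=prompt,
                         data=img_paths,
                         labels=labels,
                         src_type='local',           # 'local', 'url' or 'base64'
                         conf_threshold=0.30,
                         nms_threshold=0.50)
print(res)  # per-image lists of {'xmin', 'ymin', 'xmax', 'ymax', 'conf', 'label'}
```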
61 | ### Run as an API Server
62 | 1. Download our pretrained model and the [CLIP](https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt?download=true) checkpoints.
63 | 2. Create a folder named **resources** and put the downloaded models into it.
64 | 3. Run **run_wsgi.py**; the API server will start at **http://host_ip:8000/inf_predict**. Open **http://host_ip:8000/docs** to try it out, or see the request sketch below.
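As a rough illustration, the request below mirrors the arguments of `DetEngine.inf_predict`; the actual request schema is defined in **run_wsgi.py** (not shown here), so treat the field names as assumptions and check **http://host_ip:8000/docs** for the authoritative schema:

```python
# Hypothetical request sketch; the field names are assumed from DetEngine.inf_predict.
# Verify the real schema at http://host_ip:8000/docs before use.
import requests

payload = {
    "model_id": "OmDet-Turbo_tiny_SWIN_T",
    "data": ["<base64-encoded image>"],   # image payload
    "src_type": "base64",                 # 'local', 'url' or 'base64'
    "task": "Detect person,cat,orange.",
    "labels": ["person", "cat", "orange"],
    "conf_threshold": 0.3,
    "nms_threshold": 0.5,
}
resp = requests.post("http://host_ip:8000/inf_predict", json=payload, timeout=60)
print(resp.json())
```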
65 |
66 | Language caching is already enabled when inferring with **run_demo.py**. For more details, please check the **run_demo.py** script.
67 |
68 |
69 | ***
70 | ## ⚙️ How To Export ONNX Model
71 | 1. Replace **OmDetV2Turbo** in **OmDet-Turbo_tiny_SWIN_T.yaml** with **OmDetV2TurboInfer**
72 | 2. Run **export.py**, and **omdet.onnx** will be exported.
73 |
74 | In the example above, post-processing is not included in the ONNX model, and all input sizes are fixed. You can add post-processing and change the input sizes according to your needs.
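As a quick sanity check, here is a minimal sketch that loads the exported model with ONNX Runtime, using the input names and shapes from **export.py** (the inputs below are random dummies; output decoding/post-processing is up to you):

```python
# Minimal sketch: run the exported omdet.onnx with ONNX Runtime.
# Input names and shapes follow export.py; outputs are raw model outputs (no post-processing).
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("omdet.onnx", providers=["CPUExecutionProvider"])
inputs = {
    "img_tensor": np.random.rand(1, 3, 640, 640).astype(np.float32),
    "label_feats": np.random.rand(80, 1, 512).astype(np.float32),  # 80 classes, CLIP dim 512
    "task_feats": np.random.rand(77, 1, 512).astype(np.float32),   # 77 = prompt token length
    "task_mask": np.random.rand(1, 77).astype(np.float32),
}
outputs = sess.run(None, inputs)
print([o.shape for o in outputs])
```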
75 |
76 |
77 | ***
78 | ## 📦 Model Zoo
79 | Performance on COCO and LVIS is evaluated under the zero-shot setting.
80 |
81 | | Model | Backbone | Pre-Train Data | COCO | LVIS | FPS (PyTorch/TRT) | Weight |
82 | | -- | -- | -- | -- | -- | -- | -- |
83 | | OmDet-Turbo-Tiny | Swin-T | O365, GoldG | 42.5 | 30.3 | 21.5 / 140.0 | [weight](https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/tree/main) |
84 |
85 | ***
86 | ## 📝 Main Results
87 |
88 |
89 | ***
90 | ## Citation
91 | Please consider citing our papers if you use our projects:
92 |
93 | ```
94 | @article{zhao2024real,
95 | title={Real-time Transformer-based Open-Vocabulary Detection with Efficient Fusion Head},
96 | author={Zhao, Tiancheng and Liu, Peng and He, Xuan and Zhang, Lu and Lee, Kyusong},
97 | journal={arXiv preprint arXiv:2403.06892},
98 | year={2024}
99 | }
100 | ```
101 |
102 | ```
103 | @article{zhao2024omdet,
104 | title={OmDet: Large-scale vision-language multi-dataset pre-training with multimodal detection network},
105 | author={Zhao, Tiancheng and Liu, Peng and Lee, Kyusong},
106 | journal={IET Computer Vision},
107 | year={2024},
108 | publisher={Wiley Online Library}
109 | }
110 | ```
111 |
--------------------------------------------------------------------------------
/add_hf.py:
--------------------------------------------------------------------------------
1 | from omdet.inference.det_engine import DetEngine
2 | from omdet.omdet_v2_turbo.detector import OmDetV2Turbo
3 |
4 |
5 | if __name__ == "__main__":
6 | engine = DetEngine(batch_size=1, device='cuda')
7 | img_paths = ['./sample_data/000000574769.jpg'] # path of images
8 | labels = ["person", "cat", "orange"] # labels to be predicted
9 | prompt = 'Detect {}.'.format(','.join(labels)) # prompt of detection task, use "Detect {}." as default
10 |
11 | model_id = 'OmDet-Turbo_tiny_SWIN_T'
12 | model, cfg = engine._load_model(model_id)
13 |
14 | # push to hub
15 | model.push_to_hub("nielsr/omde-v2-turbo-tiny-swin-tiny")
16 |
17 | # reload
18 | model = OmDetV2Turbo.from_pretrained("nielsr/omde-v2-turbo-tiny-swin-tiny")
--------------------------------------------------------------------------------
/configs/OmDet-Turbo_tiny_SWIN_T.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | META_ARCHITECTURE: OmDetV2Turbo
3 | DEPLOY_MODE: true
4 | SWIN:
5 | OUT_FEATURES:
6 | - 1
7 | - 2
8 | - 3
9 | SIZE: T
10 | USE_CHECKPOINT: false
11 | BACKBONE:
12 | NAME: build_swintransformer_backbone
13 | LANGUAGE_BACKBONE:
14 | MODEL_TYPE: "clip"
15 | LANG_DIM: 512
16 | DEVICE: cuda
17 | FUSE_TYPE: merged_attn
18 | TRANSFORMER_DECODER: ELADecoder
19 | TRANSFORMER_ENCODER: ELAEncoder
20 | HEAD: DINOHead
21 | ELAEncoder:
22 | act: gelu
23 | depth_mult: 1.0
24 | dim_feedforward: 2048
25 | encoder_layer: TransformerLayer
26 | eval_size: null
27 | expansion: 1.0
28 | feat_strides:
29 | - 8
30 | - 16
31 | - 32
32 | hidden_dim: 256
33 | in_channels:
34 | - 192
35 | - 384
36 | - 768
37 | num_encoder_layers: 1
38 | pe_temperature: 10000
39 | use_encoder_idx:
40 | - 2
41 | PIXEL_MEAN:
42 | - 123.675
43 | - 116.28
44 | - 103.53
45 | PIXEL_STD:
46 | - 58.395
47 | - 57.12
48 | - 57.375
49 | ELADecoder:
50 | activation: relu
51 | backbone_feat_channels:
52 | - 256
53 | - 256
54 | - 256
55 | box_noise_scale: 1.0
56 | cls_type: cosine
57 | dim_feedforward: 2048
58 | dropout: 0.0
59 | eps: 0.01
60 | eval_idx: -1
61 | eval_size: null
62 | feat_strides:
63 | - 8
64 | - 16
65 | - 32
66 | hidden_dim: 256
67 | label_noise_ratio: 0.5
68 | learnt_init_query: false
69 | nhead: 8
70 | num_decoder_layers: 6
71 | num_decoder_points: 4
72 | num_denoising: 100
73 | num_levels: 3
74 | num_queries: 900
75 | position_embed_type: sine
76 | WEIGHTS: resources/swin_tiny_patch4_window7_224.pkl
77 | INPUT:
78 | FORMAT: RGB
79 | MAX_SIZE_TEST: 640
80 | MIN_SIZE_TEST: 640
81 |
--------------------------------------------------------------------------------
/docs/cvt_grounding_dino-en.md:
--------------------------------------------------------------------------------
1 | # Grounding DINO to TensorRT Conversion
2 |
3 | Given that many people are interested in how the Grounding DINO model mentioned in our paper was converted to TensorRT, here is a brief introduction to our previous conversion approach. Additionally, while organizing the TRT conversion, we discovered a minor issue with the earlier Grounding-DINO-T conversion; after a correct conversion, the FP16 speed should be approximately 27 FPS.
4 |
5 | ## Converting PyTorch Model to ONNX Model
6 | The original Grounding DINO code requires slight modifications to be converted to an ONNX model. However, when converting the ONNX model to a TensorRT model, various errors may occur. To avoid errors during ONNX to TensorRT conversion, some additional changes must be made when converting to the ONNX model.
7 |
8 | - Comment out the statements using checkpoints in the backbone.
9 | - Rewrite the NestedTensor in the code; avoid using the NestedTensor data structure. NestedTensor is mainly concentrated in the visual part. Use Tensor directly instead.
10 | - Rewrite the Joiner class in `backbone.py` as shown in the example below. The rewritten class should inherit from `nn.Module` instead of `nn.Sequential`. This might be the key to avoiding issues when converting the ONNX model to a TensorRT model. Some content in the `build_backbone` function can be moved to the rewritten Joiner class.
11 | - Treat the tokenizer as data preprocessing and place it outside the model; the output should be directly passed as input to the model's forward function.
12 | - The special handling in the `nested_tensor_from_tensor_list` function for ONNX conversion needs to be retained.
13 | - Make other necessary changes due to the above modifications.
14 |
15 | ```python
16 | class Joiner(nn.Module):
17 |     def __init__(self):
18 |         super().__init__()  # required when subclassing nn.Module
19 |         self.backbone = xxxx
20 |         self.position_embedding = xxx
21 |     def forward(self, x):
22 |         pass  # return the backbone features (and their position embeddings)
23 | ```
24 |
25 | ## Converting ONNX Model to TensorRT Model
26 | The ONNX model converted according to the above suggestions can be smoothly converted to a TensorRT model.
27 |
28 | - It is recommended to use the latest version of TensorRT; it is indeed very fast.
29 | - Fixing the input dimensions can provide certain advantages. The speed tests for Grounding DINO in OmDet are based on fixed input dimensions.
30 | - FP32 is almost lossless. When converting to FP16 there is a significant loss of precision, and some layers with substantial losses need extra handling (e.g. pinning them back to FP32 via TensorRT's layer precision controls). The speed tests for Grounding DINO in OmDet are based on FP16 models; FP32 is about 25-30% slower than FP16. See the build sketch below.
31 |
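For reference, below is a minimal build sketch using the TensorRT Python API (file names are placeholders; `trtexec --onnx=... --saveEngine=... --fp16` achieves the same from the command line):

```python
# Sketch: build a fixed-shape FP16 TensorRT engine from an exported ONNX model.
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open("grounding_dino.onnx", "rb") as f:
    if not parser.parse(f.read()):
        raise RuntimeError(parser.get_error(0))

config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)   # drop this flag for a (slower) lossless FP32 engine

engine_bytes = builder.build_serialized_network(network, config)
with open("grounding_dino_fp16.engine", "wb") as f:
    f.write(engine_bytes)
```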
--------------------------------------------------------------------------------
/docs/cvt_grounding_dino-zh.md:
--------------------------------------------------------------------------------
1 | # Grounding DINO 转TensorRT
2 | 鉴于不少同学提问想知道我们Paper提到的Grounding DINO的TRT是如何转换,所以在这里简单介绍一下我们之前的转换思路。此外,我们在整理TRT转换时也发现之前的Grounding-DINO-T转换得有点小问题,实际正确转换之后的FP16速度应该为~27FPS。
3 |
4 | ## pytorch模型 转换成 onnx模型
5 | 原始的Grounding DINO代码稍作修改就能转换成onnx模型, 但是转换成onnx模型后再转换成TensorRT模型时,会有各式各样的花式报错。为了避免onnx 转TensorRT时的报错,必须在转onnx模型时做一些额外的改动。
6 |
7 | - 注释掉backbone中使用checkpoint的语句
8 | - 将代码中的 NestedTensor 进行改写,不要使用NestedTensor数据结构。NestedTensor主要集中在视觉部分。直接使用Tensor即可
9 | - 将backbone.py 中的Joiner类改写成下面示例。改写后的类要继承nn.Module, 而不是nn.Sequential类。这可能是避免onnx转TensorRT模型出现问题的关键。build_backbone函数里面的部分内容可以移动到改写后的Joiner类中
10 | - 将tokenizer 当成数据预处理放在模型的外面,输出直接作为forward函数的输入传入模型
11 | - nested_tensor_from_tensor_list 函数中针对转onnx做的特殊处理需要保留
12 | - 其他一些因为上述改动导致的必要改动
13 |
14 | ```python
15 | class Joiner(nn.Module):
16 |     def __init__(self):
17 |         super().__init__()  # 继承 nn.Module 时必须调用
18 |         self.backbone = xxxx
19 |         self.position_embedding = xxx
20 | 
21 |     def forward(self, x):
22 |         pass
23 | ```
24 |
25 |
26 | ## onnx模型转TensorRT模型
27 | 按照上述建议转出的onnx模型可以流畅的转成TensorRT模型
28 |
29 | - 建议使用最新版本TensorRT, 真的很快
30 | - 固定输入维度,会有一定的优势。Omdet中关于Grounding DINO 的速度测试都是基于固定的输入维度
31 | - F32 几乎无损, 转换FP16的时候精度损失较大,需要对一些损失较大的层进行额外的处理。Omdet中关于Grounding DINO 的速度测试都是基于FP16模型。FP32 比 FP16 慢 25~30%左右
32 |
--------------------------------------------------------------------------------
/docs/main_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/docs/main_results.png
--------------------------------------------------------------------------------
/docs/speed_compare.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/docs/speed_compare.jpeg
--------------------------------------------------------------------------------
/docs/turbo_model.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/docs/turbo_model.jpeg
--------------------------------------------------------------------------------
/export.py:
--------------------------------------------------------------------------------
1 | from omdet.inference.det_engine import DetEngine
2 | import torch
3 |
4 | if __name__ == "__main__":
5 |
6 | model_dir = "./resources"
7 |     img_tensor = torch.rand(1, 3, 640, 640)  # dummy image input: (batch, channels, height, width)
8 | label_feats = torch.rand(80, 1, 512) # 80 is cls num, 512 is clip dim
9 | task_feats = torch.rand(77, 1, 512) # 77 is task dim
10 | task_mask = torch.rand(1, 77)
11 |
12 | engine = DetEngine(model_dir=model_dir, batch_size=1, device='cpu')
13 | onnx_model_path = "./omdet.onnx"
14 | engine.export_onnx('OmDet-Turbo_tiny_SWIN_T', img_tensor, label_feats, task_feats, task_mask, onnx_model_path)
15 |
16 |
--------------------------------------------------------------------------------
/install.md:
--------------------------------------------------------------------------------
1 | # Install
2 | ## Requirements
3 |
4 | * CUDA>=11.8
5 |
6 | * Python>=3.9
7 |
8 | Create a Python environment:
9 | ```bash
10 | conda create -n omdet python=3.9
11 | ```
12 | Activate the environment:
13 | ```bash
14 | conda activate omdet
15 | ```
16 |
17 | * Pytorch>=2.1.0, Torchvision>=0.16.0
18 |
19 | If your CUDA version is 11.8, you can install PyTorch as follows:
20 | ```bash
21 | conda install pytorch==2.1.0 torchvision==0.16.0 pytorch-cuda=11.8 -c pytorch -c nvidia
22 | ```
23 |
24 | * detectron2>=0.6.0:
25 |
26 | Install detectron2:
27 | ```bash
28 | python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
29 | ```
30 |
31 | * Other requirements
32 | ```bash
33 | pip install -r requirements.txt
34 | ```
35 |
--------------------------------------------------------------------------------
/omdet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/__init__.py
--------------------------------------------------------------------------------
/omdet/inference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/inference/__init__.py
--------------------------------------------------------------------------------
/omdet/inference/base_engine.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from PIL import Image
3 | import requests
4 | import io
5 | import base64
6 | from detectron2.data.detection_utils import _apply_exif_orientation, convert_PIL_to_numpy
7 | import numpy as np
8 |
9 |
10 | def get_output_shape(oldh: int, oldw: int, short_edge_length: int, max_size: int):
11 | """
12 | Compute the output size given input size and target short edge length.
13 | """
14 | h, w = oldh, oldw
15 | size = short_edge_length * 1.0
16 | scale = size / min(h, w)
17 | if h < w:
18 | newh, neww = size, scale * w
19 | else:
20 | newh, neww = scale * h, size
21 | if max(newh, neww) > max_size:
22 | scale = max_size * 1.0 / max(newh, neww)
23 | newh = newh * scale
24 | neww = neww * scale
25 | neww = int(neww + 0.5)
26 | newh = int(newh + 0.5)
27 | return (newh, neww)
28 |
29 |
30 | class BaseEngine(object):
31 | def _load_data(self, src_type, cfg, data, return_transform=False):
32 | if src_type == 'local':
33 | image_data = [Image.open(x) for x in data]
34 |
35 | elif src_type == 'url':
36 | image_data = []
37 | for x in data:
38 | temp = Image.open(io.BytesIO(requests.get(x).content))
39 | image_data.append(temp)
40 |
41 | elif src_type == "base64":
42 | image_data = []
43 | for x in data:
44 | temp = Image.open(io.BytesIO(base64.b64decode(x))).convert("RGB")
45 | image_data.append(temp)
46 |
47 | else:
48 | raise Exception("Unknown mode {}.".format(src_type))
49 |
50 | input_data = []
51 | transforms = []
52 | for x in image_data:
53 | width, height = x.size
54 | pil_image = x.resize((cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST), Image.BILINEAR)
55 | image = convert_PIL_to_numpy(pil_image, cfg.INPUT.FORMAT)
56 |
57 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
58 | input_data.append({"image": image, "height": height, "width": width})
59 |
60 | if return_transform:
61 | return input_data, transforms
62 | else:
63 | return input_data
--------------------------------------------------------------------------------
/omdet/inference/det_engine.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from typing import List, Union, Dict
4 | from omdet.utils.tools import chunks
5 | from detectron2.checkpoint import DetectionCheckpointer
6 | from detectron2.config import get_cfg
7 | from detectron2.engine import DefaultTrainer as Trainer
8 | from omdet.utils.cache import LRUCache
9 | from omdet.inference.base_engine import BaseEngine
10 | from detectron2.utils.logger import setup_logger
11 | from omdet.omdet_v2_turbo.config import add_omdet_v2_turbo_config
12 |
13 |
14 | class DetEngine(BaseEngine):
15 | def __init__(self, model_dir='resources/', device='cpu', batch_size=10):
16 | self.model_dir = model_dir
17 | self._models = LRUCache(10)
18 | self.device = device
19 | self.batch_size = batch_size
20 | self.logger = setup_logger(name=__name__)
21 |
22 | def _init_cfg(self, cfg, model_id):
23 | cfg.MODEL.WEIGHTS = os.path.join(self.model_dir, model_id+'.pth')
24 | cfg.MODEL.DEVICE = self.device
25 | cfg.INPUT.MAX_SIZE_TEST = 640
26 | cfg.INPUT.MIN_SIZE_TEST = 640
27 | cfg.MODEL.DEPLOY_MODE = True
28 | cfg.freeze()
29 | return cfg
30 |
31 | def count_parameters(self, model):
32 | return sum(p.numel() for p in model.parameters())
33 |
34 | def _load_model(self, model_id):
35 | if not self._models.has(model_id):
36 | cfg = get_cfg()
37 | add_omdet_v2_turbo_config(cfg)
38 | cfg.merge_from_file(os.path.join('configs', model_id+'.yaml'))
39 | cfg = self._init_cfg(cfg, model_id)
40 | model = Trainer.build_model(cfg)
41 | self.logger.info("Model:\n{}".format(model))
42 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
43 |             print("Loading an OmDet model {}".format(cfg.MODEL.WEIGHTS))
44 | model.eval()
45 | model.to(cfg.MODEL.DEVICE)
46 | print("Total parameters: {}".format(self.count_parameters(model)))
47 | self._models.put(model_id, (model, cfg))
48 |
49 | return self._models.get(model_id)
50 |
51 | def inf_predict(self, model_id,
52 | data: List,
53 | task: Union[str, List],
54 | labels: List[str],
55 | src_type: str = 'local',
56 | conf_threshold: float = 0.5,
57 | nms_threshold: float = 0.5
58 | ):
59 |
60 | if len(task) == 0:
61 | raise Exception("Task cannot be empty.")
62 |
63 | model, cfg = self._load_model(model_id)
64 |
65 | resp = []
66 | flat_labels = labels
67 |
68 | with torch.no_grad():
69 | for batch in chunks(data, self.batch_size):
70 | batch_image = self._load_data(src_type, cfg, batch)
71 | for img in batch_image:
72 | img['label_set'] = labels
73 | img['tasks'] = task
74 |
75 | batch_y = model(batch_image, score_thresh=conf_threshold, nms_thresh=nms_threshold)
76 |
77 | for z in batch_y:
78 | temp = []
79 | instances = z['instances'].to('cpu')
80 | instances = instances[instances.scores > conf_threshold]
81 |
82 | for idx, pred in enumerate(zip(instances.pred_boxes, instances.scores, instances.pred_classes)):
83 | (x, y, xx, yy), conf, cls = pred
84 | conf = float(conf)
85 | cls = flat_labels[int(cls)]
86 |
87 | temp.append({'xmin': int(x),
88 | 'ymin': int(y),
89 | 'xmax': int(xx),
90 | 'ymax': int(yy),
91 | 'conf': conf,
92 | 'label': cls})
93 | resp.append(temp)
94 |
95 | return resp
96 |
97 | def export_onnx(self, model_id, img_tensor, label_feats, task_feats, task_mask, onnx_model_path):
98 |
99 | model, _ = self._load_model(model_id)
100 | model.to("cpu")
101 | model.eval()
102 | inputs = (img_tensor, label_feats, task_feats, task_mask)
103 |
104 | print("start cvt onnx...")
105 | torch.onnx.export(model, # model being run
106 | inputs, # model input (or a tuple for multiple inputs)
107 | onnx_model_path, # where to save the model (can be a file or file-like object)
108 | export_params=True, # store the trained parameter weights inside the model file
109 | opset_version=17, # the ONNX version to export the model to
110 | do_constant_folding=True, # whether to execute constant folding for optimization
111 | input_names=['img_tensor', "label_feats", "task_feats", "task_mask"],
112 | )
--------------------------------------------------------------------------------
/omdet/modeling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/__init__.py
--------------------------------------------------------------------------------
/omdet/modeling/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | from omdet.modeling.backbone import (convnext, swint)
--------------------------------------------------------------------------------
/omdet/modeling/backbone/config.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import CfgNode as CN
2 |
3 |
4 | def add_backbone_config(cfg):
5 | add_convnext_config(cfg)
6 | add_swint_config(cfg)
7 |
8 |
9 | def add_convnext_config(cfg):
10 | # extra configs for convnext
11 | cfg.MODEL.CONVNEXT = CN()
12 | cfg.MODEL.CONVNEXT.SIZE = "T"
13 | cfg.MODEL.CONVNEXT.DEPTHS= [3, 3, 9, 3]
14 | cfg.MODEL.CONVNEXT.DIMS= [96, 192, 384, 768]
15 | cfg.MODEL.CONVNEXT.DROP_PATH_RATE= 0.2
16 | cfg.MODEL.CONVNEXT.LAYER_SCALE_INIT_VALUE= 1e-6
17 | cfg.MODEL.CONVNEXT.OUT_FEATURES= [0, 1, 2, 3]
18 | cfg.SOLVER.WEIGHT_DECAY_RATE= 0.95
19 |
20 |
21 | def add_swint_config(cfg):
22 | cfg.MODEL.SWIN = CN()
23 | cfg.MODEL.SWIN.SIZE = 'T' # 'T', 'S', 'B'
24 | cfg.MODEL.SWIN.USE_CHECKPOINT = False
25 | cfg.MODEL.SWIN.OUT_FEATURES = (0, 1, 2, 3) # FPN stride 8 - 32
26 |
27 |
28 |
--------------------------------------------------------------------------------
/omdet/modeling/backbone/convnext.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from timm.models.layers import trunc_normal_, DropPath
6 | from detectron2.modeling.backbone import Backbone
7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
8 | from detectron2.modeling.backbone.fpn import FPN
9 | from detectron2.layers import ShapeSpec
10 |
11 |
12 | class Block(nn.Module):
13 | r""" ConvNeXt Block. There are two equivalent implementations:
14 | (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
15 | (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
16 | We use (2) as we find it slightly faster in PyTorch
17 |
18 | Args:
19 | dim (int): Number of input channels.
20 | drop_path (float): Stochastic depth rate. Default: 0.0
21 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
22 | """
23 |
24 | def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
25 | super().__init__()
26 | self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
27 | self.norm = LayerNorm(dim, eps=1e-6)
28 | self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
29 | self.act = nn.GELU()
30 | self.pwconv2 = nn.Linear(4 * dim, dim)
31 | self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
32 | requires_grad=True) if layer_scale_init_value > 0 else None
33 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
34 |
35 | def forward(self, x):
36 | input = x
37 | x = self.dwconv(x)
38 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
39 | x = self.norm(x)
40 | x = self.pwconv1(x)
41 | x = self.act(x)
42 | x = self.pwconv2(x)
43 | if self.gamma is not None:
44 | x = self.gamma * x
45 | x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
46 |
47 | x = input + self.drop_path(x)
48 | return x
49 |
50 |
51 | class LayerNorm(nn.Module):
52 | r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
53 | The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
54 | shape (batch_size, height, width, channels) while channels_first corresponds to inputs
55 | with shape (batch_size, channels, height, width).
56 | """
57 |
58 | def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
59 | super().__init__()
60 | self.weight = nn.Parameter(torch.ones(normalized_shape))
61 | self.bias = nn.Parameter(torch.zeros(normalized_shape))
62 | self.eps = eps
63 | self.data_format = data_format
64 | if self.data_format not in ["channels_last", "channels_first"]:
65 | raise NotImplementedError
66 | self.normalized_shape = (normalized_shape,)
67 |
68 | def forward(self, x):
69 | if self.data_format == "channels_last":
70 | return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
71 | elif self.data_format == "channels_first":
72 | u = x.mean(1, keepdim=True)
73 | s = (x - u).pow(2).mean(1, keepdim=True)
74 | x = (x - u) / torch.sqrt(s + self.eps)
75 | x = self.weight[:, None, None] * x + self.bias[:, None, None]
76 | return x
77 |
78 |
79 | class ConvNeXt(Backbone):
80 | r""" ConvNeXt
81 | A PyTorch impl of : `A ConvNet for the 2020s` -
82 | https://arxiv.org/pdf/2201.03545.pdf
83 | Args:
84 | in_chans (int): Number of input image channels. Default: 3
85 | num_classes (int): Number of classes for classification head. Default: 1000
86 | depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
87 | dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
88 | drop_path_rate (float): Stochastic depth rate. Default: 0.
89 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
90 | head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
91 | out_features (tuple(int)): Stage numbers of the outputs given to the Neck.
92 | """
93 |
94 | def __init__(self, in_chans=3, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768],
95 | drop_path_rate=0., layer_scale_init_value=1e-6, out_features=None):
96 | super().__init__()
97 |
98 | self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
99 | stem = nn.Sequential(
100 | nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
101 | LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
102 | )
103 |
104 | self.downsample_layers.append(stem)
105 | for i in range(3):
106 | downsample_layer = nn.Sequential(
107 | LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
108 | nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
109 | )
110 | self.downsample_layers.append(downsample_layer)
111 |
112 | self.num_layers = len(depths)
113 | num_features = [int(dims[i] * 2 ** i) for i in range(self.num_layers)]
114 | self.num_features = num_features
115 | self._out_features = out_features
116 |
117 | self._out_feature_strides = {}
118 | self._out_feature_channels = {}
119 |
120 | self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
121 | dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
122 | cur = 0
123 | strides = [4, 4, 4, 4]
124 | for i in range(4):
125 | stage = nn.Sequential(
126 | *[Block(dim=dims[i], drop_path=dp_rates[cur + j],
127 | layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])]
128 | )
129 | self.stages.append(stage)
130 | cur += depths[i]
131 |
132 | self._out_feature_channels[i] = dims[i]
133 | self._out_feature_strides[i] = strides[i] * 2 ** i
134 |
135 | norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first")
136 | for i_layer in range(4):
137 | layer = norm_layer(dims[i_layer])
138 | layer_name = f'norm{i_layer}'
139 | self.add_module(layer_name, layer)
140 |
141 | self.apply(self._init_weights)
142 |
143 | def _init_weights(self, m):
144 | if isinstance(m, (nn.Conv2d, nn.Linear)):
145 | trunc_normal_(m.weight, std=.02)
146 | nn.init.constant_(m.bias, 0)
147 |
148 | def init_weights(self, pretrained=None):
149 | """Initialize the weights in backbone.
150 | Args:
151 | pretrained (str, optional): Path to pre-trained weights.
152 | Defaults to None.
153 | """
154 |
155 | def _init_weights(m):
156 | if isinstance(m, nn.Linear):
157 | trunc_normal_(m.weight, std=.02)
158 | if isinstance(m, nn.Linear) and m.bias is not None:
159 | nn.init.constant_(m.bias, 0)
160 | elif isinstance(m, nn.LayerNorm):
161 | nn.init.constant_(m.bias, 0)
162 | nn.init.constant_(m.weight, 1.0)
163 |
164 | self.apply(_init_weights)
165 |
166 | def forward_features(self, x):
167 | outs = {}
168 | for i in range(4):
169 | x = self.downsample_layers[i](x)
170 | x = self.stages[i](x)
171 | if i in self._out_features:
172 | norm_layer = getattr(self, f'norm{i}')
173 | x_out = norm_layer(x)
174 | out = x_out.contiguous()
175 | stage_name = i
176 | outs[stage_name] = out
177 |
178 | return outs # {"stage%d" % (i+2,): out for i, out in enumerate(outs)} #tuple(outs)
179 |
180 | def forward(self, x):
181 | x = self.forward_features(x)
182 | return x
183 |
184 |
185 | model_urls = {
186 | "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
187 | "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
188 | "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
189 | "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
190 | "convnext_tiny_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth",
191 | "convnext_small_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth",
192 | "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth",
193 | "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth",
194 | "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth",
195 | }
196 |
197 | size2config = {
198 | "N": {
199 | "DEPTHS": [2, 2, 8, 2],
200 | "DIMS": [80, 160, 320, 640]
201 | },
202 | 'T': {
203 | "DEPTHS": [3, 3, 9, 3],
204 | "DIMS": [96, 192, 384, 768]
205 | },
206 | 'S': {
207 | "DEPTHS": [3, 3, 27, 3],
208 | "DIMS": [96, 192, 384, 768]
209 | },
210 | 'B': {
211 | "DEPTHS": [3, 3, 27, 3],
212 | "DIMS": [128, 256, 512, 1024]
213 | },
214 | 'L': {
215 | "DEPTHS": [3, 3, 27, 3],
216 | "DIMS": [192, 384, 768, 1536]
217 | },
218 | 'XL': {
219 | "DEPTHS": [3, 3, 27, 3],
220 | "DIMS": [256, 512, 1024, 2048]
221 | }
222 | }
223 |
224 |
225 | @BACKBONE_REGISTRY.register()
226 | def build_convnext_backbone(cfg, input_shape):
227 | """
228 | Create a ConvNeXt instance from config.
229 |
230 | Returns:
231 | VoVNet: a :class:`VoVNet` instance.
232 | """
233 | size = cfg.MODEL.CONVNEXT.SIZE
234 | if size in size2config:
235 | depth = size2config[size]['DEPTHS']
236 | dims = size2config[size]['DIMS']
237 | else:
238 | depth = cfg.MODEL.CONVNEXT.DEPTHS
239 | dims = cfg.MODEL.CONVNEXT.DIMS
240 |
241 | return ConvNeXt(
242 | in_chans=input_shape.channels,
243 | depths=depth,
244 | dims=dims,
245 | drop_path_rate=cfg.MODEL.CONVNEXT.DROP_PATH_RATE,
246 | layer_scale_init_value=cfg.MODEL.CONVNEXT.LAYER_SCALE_INIT_VALUE,
247 | out_features=cfg.MODEL.CONVNEXT.OUT_FEATURES
248 | )
249 |
250 |
251 | @BACKBONE_REGISTRY.register()
252 | def build_convnext_fpn_backbone(cfg, input_shape: ShapeSpec):
253 | """
254 | Args:
255 | cfg: a detectron2 CfgNode
256 |
257 | Returns:
258 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
259 | """
260 | bottom_up = build_convnext_backbone(cfg, input_shape)
261 | in_features = cfg.MODEL.FPN.IN_FEATURES
262 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS
263 | backbone = FPN(
264 | bottom_up=bottom_up,
265 | in_features=in_features,
266 | out_channels=out_channels,
267 | norm=cfg.MODEL.FPN.NORM,
268 | top_block=None,
269 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
270 | )
271 | return backbone
272 |
--------------------------------------------------------------------------------
/omdet/modeling/common.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch import nn, Tensor
4 | import copy
5 | import torch.nn.functional as F
6 |
7 |
8 | class PositionalEncoding(nn.Module):
9 |
10 | def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
11 | super().__init__()
12 | self.dropout = nn.Dropout(p=dropout)
13 |
14 | position = torch.arange(max_len).unsqueeze(1)
15 | div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
16 | pe = torch.zeros(max_len, 1, d_model)
17 | pe[:, 0, 0::2] = torch.sin(position * div_term)
18 | pe[:, 0, 1::2] = torch.cos(position * div_term)
19 | self.register_buffer('pe', pe)
20 |
21 | def forward(self, x: Tensor) -> Tensor:
22 | """
23 | Args:
24 | x: Tensor, shape [seq_len, batch_size, embedding_dim]
25 | """
26 | x = x + self.pe[:x.size(0)]
27 | return self.dropout(x)
28 |
29 |
30 | class AbsPositionalEncoding(nn.Module):
31 |
32 | def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
33 | super().__init__()
34 | self.dropout = nn.Dropout(p=dropout)
35 | self.pe = nn.Embedding(max_len, d_model)
36 |
37 | def forward(self, x: Tensor) -> Tensor:
38 | """
39 | Args:
40 | x: Tensor, shape [seq_len, batch_size, embedding_dim]
41 | """
42 | seq_len = x.size(0)
43 | position = torch.arange(seq_len, device=x.device).unsqueeze(1)
44 | pos_emb = self.pe(position)
45 | x = x + pos_emb
46 | return self.dropout(x)
47 |
48 |
49 | class ResMultiHeadAttention(nn.Module):
50 | def __init__(self, d_q, d_k, d_v, nhead, dropout):
51 | super().__init__()
52 | self.self_attn = nn.MultiheadAttention(d_q, nhead, dropout=dropout, kdim=d_k, vdim=d_v)
53 | self.norm1 = nn.LayerNorm(d_q)
54 | self.dropout = nn.Dropout(dropout)
55 |
56 | def forward(self, q, k=None, v=None, attn_mask=None):
57 | """
58 | """
59 | if k is None:
60 | k = q
61 |
62 | if v is None:
63 | v = q
64 |
65 | q1 = self.self_attn(query=q, key=k, value=v, attn_mask=attn_mask)[0]
66 | q = q + self.dropout(q1)
67 | q = self.norm1(q)
68 | return q
69 |
70 |
71 | class DistilMLP(nn.Module):
72 | def __init__(self, input_size, output_size, dropout=0.1):
73 | super(DistilMLP, self).__init__()
74 | self.squash = nn.GELU()
75 | self.LayerNorm = nn.LayerNorm(input_size, eps=1e-12)
76 | self.intermediate = nn.Linear(input_size, input_size)
77 | self.dropout = nn.Dropout(dropout)
78 | self.dense = nn.Linear(input_size, output_size)
79 |
80 | def forward(self, word_emb):
81 | word_emb = self.squash(word_emb)
82 | word_emb = self.LayerNorm(word_emb)
83 | word_emb = self.dropout(word_emb)
84 | word_emb = self.dense(word_emb)
85 | return word_emb
86 |
87 |
88 | class ResidualLayer(nn.Module):
89 | """
90 | A residual connection followed by a layer norm.
91 | """
92 | def __init__(self, size, dropout):
93 | super(ResidualLayer, self).__init__()
94 | self.norm1 = nn.LayerNorm(size)
95 | self.dropout = nn.Dropout(dropout)
96 |
97 | def forward(self, x, y):
98 | "Apply residual connection to any sublayer with the same size."
99 | return self.norm1(x + self.dropout(y))
100 |
101 |
102 | class ResidualMLP(nn.Module):
103 | def __init__(self, d_m, dropout, d_hidden=1024, activation='relu'):
104 | super(ResidualMLP, self).__init__()
105 | self.mlp = MLP(d_m, d_m, d_hidden, dropout, activation)
106 | self.res1 = ResidualLayer(d_m, dropout)
107 |
108 | def forward(self, x):
109 | mlp_out = self.mlp(x)
110 | x = self.res1(x, mlp_out)
111 | return x
112 |
113 |
114 | class MLP(nn.Module):
115 | def __init__(self, d_input, d_output, d_hidden=1024, dropout=0.1, activation='relu'):
116 | super(MLP, self).__init__()
117 | self.linear1 = nn.Linear(d_input, d_hidden)
118 | self.activation = _get_activation_fn(activation)
119 | self.dropout = nn.Dropout(dropout)
120 | self.linear2 = nn.Linear(d_hidden, d_output)
121 |
122 | def forward(self, x):
123 | return self.linear2(self.dropout(self.activation(self.linear1(x))))
124 |
125 |
126 | def apply_deltas(deltas, boxes, bbox_weights, scale_clamp):
127 | """
128 | Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.
129 |
130 | Args:
131 | deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
132 | deltas[i] represents k potentially different class-specific
133 | box transformations for the single box boxes[i].
134 | boxes (Tensor): boxes to transform, of shape (N, 4)
135 | """
136 | boxes = boxes.to(deltas.dtype)
137 |
138 | widths = boxes[:, 2] - boxes[:, 0]
139 | heights = boxes[:, 3] - boxes[:, 1]
140 | ctr_x = boxes[:, 0] + 0.5 * widths
141 | ctr_y = boxes[:, 1] + 0.5 * heights
142 |
143 | wx, wy, ww, wh = bbox_weights
144 | dx = deltas[:, 0::4] / wx
145 | dy = deltas[:, 1::4] / wy
146 | dw = deltas[:, 2::4] / ww
147 | dh = deltas[:, 3::4] / wh
148 |
149 | # Prevent sending too large values into torch.exp()
150 | dw = torch.clamp(dw, max=scale_clamp)
151 | dh = torch.clamp(dh, max=scale_clamp)
152 |
153 | pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
154 | pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
155 | pred_w = torch.exp(dw) * widths[:, None]
156 | pred_h = torch.exp(dh) * heights[:, None]
157 |
158 | pred_boxes = torch.zeros_like(deltas)
159 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1
160 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1
161 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2
162 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2
163 |
164 | return pred_boxes
165 |
166 |
167 | def _get_clones(module, N):
168 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
169 |
170 |
171 | def _get_activation_fn(activation):
172 | """Return an activation function given a string"""
173 | if activation == "relu":
174 | return F.relu
175 | if activation == "gelu":
176 | return F.gelu
177 | if activation == "glu":
178 | return F.glu
179 |     raise RuntimeError(F"activation should be relu/gelu/glu, not {activation}.")
180 |
181 |
182 | def _norm(f, dim=-1):
183 | return f / f.norm(dim=dim, keepdim=True).clamp_min(1e-12)
184 |
185 |
186 | def _b_cosine(a, b, logit_scale):
187 | """
188 | a: B x K x H
189 | b: B x H x K
190 | """
191 | a = _norm(a, dim=2)
192 | b = _norm(b, dim=1)
193 | # cosine similarity as logits
194 | logit_scale = logit_scale.exp()
195 | logits_per_image = logit_scale * torch.bmm(a, b)
196 | return logits_per_image
197 |
198 | def _cosine(a, b, logit_scale):
199 | """
200 | a: ?/1 x K x H
201 | b: ?/1 x H x 1
202 | """
203 | a = _norm(a, dim=2)
204 | b = _norm(b, dim=1)
205 | # cosine similarity as logits
206 | logit_scale = logit_scale.exp()
207 | logits_per_image = logit_scale * torch.matmul(a, b)
208 | return logits_per_image
--------------------------------------------------------------------------------
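A minimal usage sketch (not part of the repository) for the box-decoding helper above; the weights and clamp value are illustrative assumptions, and apply_deltas is assumed importable from this module.

import math
import torch

# one anchor box in (x1, y1, x2, y2) and one set of (dx, dy, dw, dh) deltas
boxes = torch.tensor([[10.0, 10.0, 50.0, 30.0]])
deltas = torch.tensor([[0.1, -0.2, 0.05, 0.0]])
bbox_weights = (1.0, 1.0, 1.0, 1.0)        # assumed per-coordinate weights
scale_clamp = math.log(1000.0 / 16)        # a common cap before torch.exp()

pred_boxes = apply_deltas(deltas, boxes, bbox_weights, scale_clamp)
print(pred_boxes.shape)                    # torch.Size([1, 4]), still xyxy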
/omdet/modeling/language_backbone/__init__.py:
--------------------------------------------------------------------------------
1 | from .backbone import build_language_backbone
2 | #from .build import build_tokenizer
3 |
4 | # from .hfpt_tokenizer import HFPTTokenizer
5 | # from .simple_tokenizer import SimpleTokenizer
6 |
--------------------------------------------------------------------------------
/omdet/modeling/language_backbone/backbone.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from omdet.modeling import registry
3 | from omdet.modeling.language_backbone.clip.models import clip as clip
4 |
5 |
6 | @registry.LANGUAGE_BACKBONES.register("clip")
7 | def build_clip_backbone(cfg):
8 | model, _ = clip.load("resources/ViT-B-16.pt", device=torch.device(cfg.MODEL.DEVICE), jit=False)
9 | model.visual = None # delete the vision part
10 | model.logit_scale = None
11 | return model
12 |
13 |
14 | def build_language_backbone(cfg):
15 | print ("cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE", cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE)
16 | assert cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE in registry.LANGUAGE_BACKBONES, \
17 | "cfg.MODEL.LANGUAGE_BACKBONE.TYPE: {} is not registered in registry".format(
18 | cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE
19 | )
20 | return registry.LANGUAGE_BACKBONES[cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE](cfg)
21 |
22 |
23 | if __name__ == "__main__":
24 | a = build_clip_backbone('')
25 | print(a)
--------------------------------------------------------------------------------
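A hedged sketch of how this registry is consumed: it assumes a detectron2-style config (see omdet/omdet_v2_turbo/config.py) and a local ViT-B-16 checkpoint at resources/ViT-B-16.pt, as hard-coded in build_clip_backbone above.

from detectron2.config import get_cfg
from omdet.omdet_v2_turbo.config import add_omdet_v2_turbo_config
from omdet.modeling.language_backbone import build_language_backbone

cfg = get_cfg()
add_omdet_v2_turbo_config(cfg)   # sets MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "clip"
cfg.MODEL.DEVICE = "cpu"

# Looks up the "clip" builder in registry.LANGUAGE_BACKBONES and returns the
# text-only CLIP model (the visual tower is stripped in build_clip_backbone).
text_encoder = build_language_backbone(cfg)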
/omdet/modeling/language_backbone/clip/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/language_backbone/clip/__init__.py
--------------------------------------------------------------------------------
/omdet/modeling/language_backbone/clip/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/language_backbone/clip/models/__init__.py
--------------------------------------------------------------------------------
/omdet/modeling/language_backbone/clip/models/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/omdet/modeling/language_backbone/clip/models/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/omdet/modeling/language_backbone/clip/models/clip.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import os
3 | import urllib
4 | import warnings
5 | from typing import Union, List
6 |
7 | import torch
8 | from PIL import Image
9 | from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
10 | from tqdm import tqdm
11 |
12 | from omdet.modeling.language_backbone.clip.models.model import build_model
13 | from omdet.modeling.language_backbone.clip.simple_tokenizer import SimpleTokenizer as _Tokenizer
14 |
15 | try:
16 | from torchvision.transforms import InterpolationMode
17 | BICUBIC = InterpolationMode.BICUBIC
18 | except ImportError:
19 | BICUBIC = Image.BICUBIC
20 |
21 |
22 | __all__ = ["available_models", "load", "tokenize"]
23 | _tokenizer = _Tokenizer()
24 |
25 |
26 | _MODELS = {
27 | "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
28 | "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
29 | "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
30 | "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
31 | "ViT-B-32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
32 | "ViT-B-16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
33 | }
34 |
35 |
36 | def _download(url: str, root: str):
37 | os.makedirs(root, exist_ok=True)
38 | filename = os.path.basename(url)
39 |
40 | expected_sha256 = url.split("/")[-2]
41 | download_target = os.path.join(root, filename)
42 |
43 | if os.path.exists(download_target) and not os.path.isfile(download_target):
44 | raise RuntimeError(f"{download_target} exists and is not a regular file")
45 |
46 | if os.path.isfile(download_target):
47 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
48 | return download_target
49 | else:
50 | warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
51 |
52 | with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
53 | with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
54 | while True:
55 | buffer = source.read(8192)
56 | if not buffer:
57 | break
58 |
59 | output.write(buffer)
60 | loop.update(len(buffer))
61 |
62 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
63 | raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match")
64 |
65 | return download_target
66 |
67 |
68 | def _transform(n_px):
69 | return Compose([
70 | Resize(n_px, interpolation=BICUBIC),
71 | CenterCrop(n_px),
72 | lambda image: image.convert("RGB"),
73 | ToTensor(),
74 | Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
75 | ])
76 |
77 |
78 | def available_models() -> List[str]:
79 | """Returns the names of available CLIP rclip"""
80 | return list(_MODELS.keys())
81 |
82 |
83 | def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
84 | jit: bool = False, download_root: str = None):
85 | """Load a CLIP model
86 |
87 | Parameters
88 | ----------
89 | name : str
90 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
91 |
92 | device : Union[str, torch.device]
93 | The device to put the loaded model
94 |
95 | jit : bool
96 | Whether to load the optimized JIT model or more hackable non-JIT model (default).
97 |
98 | download_root: str
99 | path to download the model files; by default, it uses "~/.cache/clip"
100 |
101 | Returns
102 | -------
103 | model : torch.nn.Module
104 | The CLIP model
105 |
106 | preprocess : Callable[[PIL.Image], torch.Tensor]
107 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
108 | """
109 | if name in _MODELS:
110 | model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
111 | elif os.path.isfile(name):
112 | model_path = name
113 | else:
114 | raise RuntimeError(f"Model {name} not found; available rclip = {available_models()}")
115 |
116 | try:
117 | # loading JIT archive
118 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
119 | state_dict = None
120 | except RuntimeError:
121 | # loading saved state dict
122 | if jit:
123 | warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
124 | jit = False
125 | state_dict = torch.load(model_path, map_location="cpu")
126 |
127 | if not jit:
128 | model = build_model(state_dict or model.state_dict()).to(device)
129 | if str(device) == "cpu":
130 | model.float()
131 | return model, _transform(model.visual.input_resolution)
132 |
133 | # patch the device names
134 | device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
135 | device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
136 |
137 | def patch_device(module):
138 | try:
139 | graphs = [module.graph] if hasattr(module, "graph") else []
140 | except RuntimeError:
141 | graphs = []
142 |
143 | if hasattr(module, "forward1"):
144 | graphs.append(module.forward1.graph)
145 |
146 | for graph in graphs:
147 | for node in graph.findAllNodes("prim::Constant"):
148 | if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
149 | node.copyAttributes(device_node)
150 |
151 | model.apply(patch_device)
152 | patch_device(model.encode_image)
153 | patch_device(model.encode_text)
154 |
155 | # patch dtype to float32 on CPU
156 | if str(device) == "cpu":
157 | float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
158 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
159 | float_node = float_input.node()
160 |
161 | def patch_float(module):
162 | try:
163 | graphs = [module.graph] if hasattr(module, "graph") else []
164 | except RuntimeError:
165 | graphs = []
166 |
167 | if hasattr(module, "forward1"):
168 | graphs.append(module.forward1.graph)
169 |
170 | for graph in graphs:
171 | for node in graph.findAllNodes("aten::to"):
172 | inputs = list(node.inputs())
173 | for i in [1, 2]: # dtype can be the second or third argument to aten::to()
174 | if inputs[i].node()["value"] == 5:
175 | inputs[i].node().copyAttributes(float_node)
176 |
177 | model.apply(patch_float)
178 | patch_float(model.encode_image)
179 | patch_float(model.encode_text)
180 |
181 | model.float()
182 |
183 | return model, _transform(model.input_resolution.item())
184 |
185 |
186 | def tokenize(texts: Union[str, List[str]], context_length: int = 77,
187 | truncate: bool = False) -> torch.LongTensor:
188 | """
189 | Returns the tokenized representation of given input string(s)
190 |
191 | Parameters
192 | ----------
193 | texts : Union[str, List[str]]
194 | An input string or a list of input strings to tokenize
195 |
196 | context_length : int
197 | The context length to use; all CLIP models use 77 as the context length
198 |
199 | truncate: bool
200 | Whether to truncate the text in case its encoding is longer than the context length
201 |
202 | Returns
203 | -------
204 | A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
205 | """
206 | if isinstance(texts, str):
207 | texts = [texts]
208 |
209 | sot_token = _tokenizer.encoder["<|startoftext|>"]
210 | eot_token = _tokenizer.encoder["<|endoftext|>"]
211 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
212 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
213 |
214 | for i, tokens in enumerate(all_tokens):
215 | if len(tokens) > context_length:
216 | if truncate:
217 | tokens = tokens[:context_length]
218 | tokens[-1] = eot_token
219 | else:
220 | raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
221 | result[i, :len(tokens)] = torch.tensor(tokens)
222 |
223 | return result
224 |
--------------------------------------------------------------------------------
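A short sketch of the load()/tokenize() API defined above. It assumes either network access for the first checkpoint download or a local checkpoint path passed as the name; the output shapes noted are for ViT-B-16.

import torch
from omdet.modeling.language_backbone.clip.models import clip

model, preprocess = clip.load("ViT-B-16", device="cpu", jit=False)
tokens = clip.tokenize(["a photo of a cat", "a photo of a dog"])  # LongTensor (2, 77)
with torch.no_grad():
    text_features = model.encode_text(tokens)                     # (2, 512) for ViT-B-16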
/omdet/modeling/language_backbone/clip/simple_tokenizer.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import html
3 | import os
4 | from functools import lru_cache
5 |
6 | import ftfy
7 | import regex as re
8 |
9 |
10 | @lru_cache()
11 | def default_bpe():
12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "models/bpe_simple_vocab_16e6.txt.gz")
13 |
14 |
15 | @lru_cache()
16 | def bytes_to_unicode():
17 | """
18 | Returns list of utf-8 byte and a corresponding list of unicode strings.
19 | The reversible bpe codes work on unicode strings.
20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22 | This is a significant percentage of your normal, say, 32K bpe vocab.
23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24 | And avoids mapping to whitespace/control characters the bpe code barfs on.
25 | """
26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27 | cs = bs[:]
28 | n = 0
29 | for b in range(2**8):
30 | if b not in bs:
31 | bs.append(b)
32 | cs.append(2**8+n)
33 | n += 1
34 | cs = [chr(n) for n in cs]
35 | return dict(zip(bs, cs))
36 |
37 |
38 | def get_pairs(word):
39 | """Return set of symbol pairs in a word.
40 | Word is represented as tuple of symbols (symbols being variable-length strings).
41 | """
42 | pairs = set()
43 | prev_char = word[0]
44 | for char in word[1:]:
45 | pairs.add((prev_char, char))
46 | prev_char = char
47 | return pairs
48 |
49 |
50 | def basic_clean(text):
51 | text = ftfy.fix_text(text)
52 | text = html.unescape(html.unescape(text))
53 | return text.strip()
54 |
55 |
56 | def whitespace_clean(text):
57 | text = re.sub(r'\s+', ' ', text)
58 | text = text.strip()
59 | return text
60 |
61 |
62 | class SimpleTokenizer(object):
63 | def __init__(self, bpe_path: str = default_bpe()):
64 | self.byte_encoder = bytes_to_unicode()
65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
67 | merges = merges[1:49152-256-2+1]
68 | merges = [tuple(merge.split()) for merge in merges]
69 | vocab = list(bytes_to_unicode().values())
70 | vocab = vocab + [v+'</w>' for v in vocab]
71 | for merge in merges:
72 | vocab.append(''.join(merge))
73 | vocab.extend(['<|startoftext|>', '<|endoftext|>'])
74 | self.encoder = dict(zip(vocab, range(len(vocab))))
75 | self.decoder = {v: k for k, v in self.encoder.items()}
76 | self.bpe_ranks = dict(zip(merges, range(len(merges))))
77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
79 |
80 | def bpe(self, token):
81 | if token in self.cache:
82 | return self.cache[token]
83 | word = tuple(token[:-1]) + ( token[-1] + '</w>',)
84 | pairs = get_pairs(word)
85 |
86 | if not pairs:
87 | return token+'</w>'
88 |
89 | while True:
90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
91 | if bigram not in self.bpe_ranks:
92 | break
93 | first, second = bigram
94 | new_word = []
95 | i = 0
96 | while i < len(word):
97 | try:
98 | j = word.index(first, i)
99 | new_word.extend(word[i:j])
100 | i = j
101 | except:
102 | new_word.extend(word[i:])
103 | break
104 |
105 | if word[i] == first and i < len(word)-1 and word[i+1] == second:
106 | new_word.append(first+second)
107 | i += 2
108 | else:
109 | new_word.append(word[i])
110 | i += 1
111 | new_word = tuple(new_word)
112 | word = new_word
113 | if len(word) == 1:
114 | break
115 | else:
116 | pairs = get_pairs(word)
117 | word = ' '.join(word)
118 | self.cache[token] = word
119 | return word
120 |
121 | def encode(self, text):
122 | bpe_tokens = []
123 | text = whitespace_clean(basic_clean(text)).lower()
124 | for token in re.findall(self.pat, text):
125 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
127 | return bpe_tokens
128 |
129 | def decode(self, tokens):
130 | text = ''.join([self.decoder[token] for token in tokens])
131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
132 | return text
133 |
--------------------------------------------------------------------------------
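A quick round-trip example for SimpleTokenizer; it relies only on the bundled BPE vocabulary at models/bpe_simple_vocab_16e6.txt.gz.

from omdet.modeling.language_backbone.clip.simple_tokenizer import SimpleTokenizer

tok = SimpleTokenizer()
ids = tok.encode("A person riding a red bicycle.")
print(ids)              # BPE ids; note that encode() does not add start/end tokens
print(tok.decode(ids))  # roughly "a person riding a red bicycle ." (cleaned, lower-cased)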
/omdet/modeling/language_backbone/word_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Language-related data loading helper functions and class wrappers.
3 | """
4 |
5 | import re
6 | import torch
7 | import codecs
8 |
9 | UNK_TOKEN = '<unk>'
10 | PAD_TOKEN = '<pad>'
11 | END_TOKEN = '<eos>'
12 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
13 |
14 |
15 | class Dictionary(object):
16 | def __init__(self):
17 | self.word2idx = {}
18 | self.idx2word = []
19 |
20 | def add_word(self, word):
21 | if word not in self.word2idx:
22 | self.idx2word.append(word)
23 | self.word2idx[word] = len(self.idx2word) - 1
24 | return self.word2idx[word]
25 |
26 | def __len__(self):
27 | return len(self.idx2word)
28 |
29 | def __getitem__(self, a):
30 | if isinstance(a, int):
31 | return self.idx2word[a]
32 | elif isinstance(a, list):
33 | return [self.idx2word[x] for x in a]
34 | elif isinstance(a, str):
35 | return self.word2idx[a]
36 | else:
37 | raise TypeError("Query word/index argument must be int or str")
38 |
39 | def __contains__(self, word):
40 | return word in self.word2idx
41 |
42 |
43 | class Corpus(object):
44 | def __init__(self):
45 | self.dictionary = Dictionary()
46 |
47 | def set_max_len(self, value):
48 | self.max_len = value
49 |
50 | def load_file(self, filename):
51 | with codecs.open(filename, 'r', 'utf-8') as f:
52 | for line in f:
53 | line = line.strip()
54 | self.add_to_corpus(line)
55 | self.dictionary.add_word(UNK_TOKEN)
56 | self.dictionary.add_word(PAD_TOKEN)
57 |
58 | def add_to_corpus(self, line):
59 | """Tokenizes a text line."""
60 | # Add words to the dictionary
61 | words = line.split()
62 | # tokens = len(words)
63 | for word in words:
64 | word = word.lower()
65 | self.dictionary.add_word(word)
66 |
67 | def tokenize(self, line, max_len=20):
68 | # Tokenize line contents
69 | words = SENTENCE_SPLIT_REGEX.split(line.strip())
70 | # words = [w.lower() for w in words if len(w) > 0]
71 | words = [w.lower() for w in words if (len(w) > 0 and w != ' ')] ## do not include space as a token
72 |
73 | if words[-1] == '.':
74 | words = words[:-1]
75 |
76 | if max_len > 0:
77 | if len(words) > max_len:
78 | words = words[:max_len]
79 | elif len(words) < max_len:
80 | # words = [PAD_TOKEN] * (max_len - len(words)) + words
81 | words = words + [END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1)
82 |
83 | tokens = len(words) ## for end token
84 | ids = torch.LongTensor(tokens)
85 | token = 0
86 | for word in words:
87 | if word not in self.dictionary:
88 | word = UNK_TOKEN
89 | if type(word) != type('a'):
90 | print(word, type(word), word.encode('ascii', 'ignore').decode('ascii'),
91 | type(word.encode('ascii', 'ignore').decode('ascii')))
92 | word = word.encode('ascii', 'ignore').decode('ascii')
93 | ids[token] = self.dictionary[word]
94 | token += 1
95 | # ids[token] = self.dictionary[END_TOKEN]
96 | return ids
97 |
98 | def __len__(self):
99 | return len(self.dictionary)
100 |
--------------------------------------------------------------------------------
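An illustrative sketch (not from the repo) of the Corpus/Dictionary helpers above; the sample phrase and max_len are arbitrary, and UNK_TOKEN is added manually here because load_file() is what normally registers it.

from omdet.modeling.language_backbone.word_utils import Corpus, UNK_TOKEN

corpus = Corpus()
corpus.add_to_corpus("a man in a red shirt")
corpus.dictionary.add_word(UNK_TOKEN)    # load_file() normally adds UNK/PAD

ids = corpus.tokenize("a man in a red shirt", max_len=8)
print(ids)   # LongTensor of 8 word indices; unseen words map to UNK_TOKEN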
/omdet/modeling/registry.py:
--------------------------------------------------------------------------------
1 | from omdet.utils.registry import Registry
2 |
3 | LANGUAGE_BACKBONES = Registry()
4 |
--------------------------------------------------------------------------------
/omdet/omdet_v2_turbo/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import add_omdet_v2_turbo_config
2 | from .detector import OmDetV2Turbo
3 | from .ela_encoder import ELAEncoder
4 | from .ela_decoder import ELADecoder
5 | from .head import DINOHead
6 | from .infer_model import OmDetV2TurboInfer
7 |
8 |
9 |
--------------------------------------------------------------------------------
/omdet/omdet_v2_turbo/block.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from .conv import Conv, DWConv, GhostConv, LightConv, RepConv
6 |
7 | __all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost',
8 | 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3')
9 |
10 |
11 | class DFL(nn.Module):
12 | """
13 | Integral module of Distribution Focal Loss (DFL).
14 | Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
15 | """
16 |
17 | def __init__(self, c1=16):
18 | """Initialize a convolutional layer with a given number of input channels."""
19 | super().__init__()
20 | self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
21 | x = torch.arange(c1, dtype=torch.float)
22 | self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
23 | self.c1 = c1
24 |
25 | def forward(self, x):
26 | """Applies a transformer layer on input tensor 'x' and returns a tensor."""
27 | b, c, a = x.shape # batch, channels, anchors
28 | return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
29 | # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
30 |
31 |
32 | class Proto(nn.Module):
33 | """YOLOv8 mask Proto module for segmentation models."""
34 |
35 | def __init__(self, c1, c_=256, c2=32): # ch_in, number of protos, number of masks
36 | super().__init__()
37 | self.cv1 = Conv(c1, c_, k=3)
38 | self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True) # nn.Upsample(scale_factor=2, mode='nearest')
39 | self.cv2 = Conv(c_, c_, k=3)
40 | self.cv3 = Conv(c_, c2)
41 |
42 | def forward(self, x):
43 | """Performs a forward pass through layers using an upsampled input image."""
44 | return self.cv3(self.cv2(self.upsample(self.cv1(x))))
45 |
46 |
47 | class HGStem(nn.Module):
48 | """StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.
49 | https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
50 | """
51 |
52 | def __init__(self, c1, cm, c2):
53 | super().__init__()
54 | self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU())
55 | self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU())
56 | self.stem2b = Conv(cm // 2, cm, 2, 1, 0, act=nn.ReLU())
57 | self.stem3 = Conv(cm * 2, cm, 3, 2, act=nn.ReLU())
58 | self.stem4 = Conv(cm, c2, 1, 1, act=nn.ReLU())
59 | self.pool = nn.MaxPool2d(kernel_size=2, stride=1, padding=0, ceil_mode=True)
60 |
61 | def forward(self, x):
62 | """Forward pass of a PPHGNetV2 backbone layer."""
63 | x = self.stem1(x)
64 | x = F.pad(x, [0, 1, 0, 1])
65 | x2 = self.stem2a(x)
66 | x2 = F.pad(x2, [0, 1, 0, 1])
67 | x2 = self.stem2b(x2)
68 | x1 = self.pool(x)
69 | x = torch.cat([x1, x2], dim=1)
70 | x = self.stem3(x)
71 | x = self.stem4(x)
72 | return x
73 |
74 |
75 | class HGBlock(nn.Module):
76 | """HG_Block of PPHGNetV2 with 2 convolutions and LightConv.
77 | https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
78 | """
79 |
80 | def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()):
81 | super().__init__()
82 | block = LightConv if lightconv else Conv
83 | self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n))
84 | self.sc = Conv(c1 + n * cm, c2 // 2, 1, 1, act=act) # squeeze conv
85 | self.ec = Conv(c2 // 2, c2, 1, 1, act=act) # excitation conv
86 | self.add = shortcut and c1 == c2
87 |
88 | def forward(self, x):
89 | """Forward pass of a PPHGNetV2 backbone layer."""
90 | y = [x]
91 | y.extend(m(y[-1]) for m in self.m)
92 | y = self.ec(self.sc(torch.cat(y, 1)))
93 | return y + x if self.add else y
94 |
95 |
96 | class SPP(nn.Module):
97 | """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""
98 |
99 | def __init__(self, c1, c2, k=(5, 9, 13)):
100 | """Initialize the SPP layer with input/output channels and pooling kernel sizes."""
101 | super().__init__()
102 | c_ = c1 // 2 # hidden channels
103 | self.cv1 = Conv(c1, c_, 1, 1)
104 | self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
105 | self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
106 |
107 | def forward(self, x):
108 | """Forward pass of the SPP layer, performing spatial pyramid pooling."""
109 | x = self.cv1(x)
110 | return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
111 |
112 |
113 | class SPPF(nn.Module):
114 | """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""
115 |
116 | def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13))
117 | super().__init__()
118 | c_ = c1 // 2 # hidden channels
119 | self.cv1 = Conv(c1, c_, 1, 1)
120 | self.cv2 = Conv(c_ * 4, c2, 1, 1)
121 | self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
122 |
123 | def forward(self, x):
124 | """Forward pass through Ghost Convolution block."""
125 | x = self.cv1(x)
126 | y1 = self.m(x)
127 | y2 = self.m(y1)
128 | return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))
129 |
130 |
131 | class C1(nn.Module):
132 | """CSP Bottleneck with 1 convolution."""
133 |
134 | def __init__(self, c1, c2, n=1): # ch_in, ch_out, number
135 | super().__init__()
136 | self.cv1 = Conv(c1, c2, 1, 1)
137 | self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))
138 |
139 | def forward(self, x):
140 | """Applies cross-convolutions to input in the C3 module."""
141 | y = self.cv1(x)
142 | return self.m(y) + y
143 |
144 |
145 | class C2(nn.Module):
146 | """CSP Bottleneck with 2 convolutions."""
147 |
148 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
149 | super().__init__()
150 | self.c = int(c2 * e) # hidden channels
151 | self.cv1 = Conv(c1, 2 * self.c, 1, 1)
152 | self.cv2 = Conv(2 * self.c, c2, 1) # optional act=FReLU(c2)
153 | # self.attention = ChannelAttention(2 * self.c) # or SpatialAttention()
154 | self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)))
155 |
156 | def forward(self, x):
157 | """Forward pass through the CSP bottleneck with 2 convolutions."""
158 | a, b = self.cv1(x).chunk(2, 1)
159 | return self.cv2(torch.cat((self.m(a), b), 1))
160 |
161 |
162 | class C2f(nn.Module):
163 | """CSP Bottleneck with 2 convolutions."""
164 |
165 | def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
166 | super().__init__()
167 | self.c = int(c2 * e) # hidden channels
168 | self.cv1 = Conv(c1, 2 * self.c, 1, 1)
169 | self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2)
170 | self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
171 |
172 | def forward(self, x):
173 | """Forward pass through C2f layer."""
174 | y = list(self.cv1(x).chunk(2, 1))
175 | y.extend(m(y[-1]) for m in self.m)
176 | return self.cv2(torch.cat(y, 1))
177 |
178 | def forward_split(self, x):
179 | """Forward pass using split() instead of chunk()."""
180 | y = list(self.cv1(x).split((self.c, self.c), 1))
181 | y.extend(m(y[-1]) for m in self.m)
182 | return self.cv2(torch.cat(y, 1))
183 |
184 |
185 | class C3(nn.Module):
186 | """CSP Bottleneck with 3 convolutions."""
187 |
188 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
189 | super().__init__()
190 | c_ = int(c2 * e) # hidden channels
191 | self.cv1 = Conv(c1, c_, 1, 1)
192 | self.cv2 = Conv(c1, c_, 1, 1)
193 | self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2)
194 | self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n)))
195 |
196 | def forward(self, x):
197 | """Forward pass through the CSP bottleneck with 2 convolutions."""
198 | return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
199 |
200 |
201 | class C3x(C3):
202 | """C3 module with cross-convolutions."""
203 |
204 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
205 | """Initialize C3TR instance and set default parameters."""
206 | super().__init__(c1, c2, n, shortcut, g, e)
207 | self.c_ = int(c2 * e)
208 | self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n)))
209 |
210 |
211 | class RepC3(nn.Module):
212 | """Rep C3."""
213 |
214 | def __init__(self, c1, c2, n=3, e=1.0):
215 | super().__init__()
216 | c_ = int(c2 * e) # hidden channels
217 | self.cv1 = Conv(c1, c2, 1, 1)
218 | self.cv2 = Conv(c1, c2, 1, 1)
219 | self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)])
220 | self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity()
221 |
222 | def forward(self, x):
223 | """Forward pass of RT-DETR neck layer."""
224 | return self.cv3(self.m(self.cv1(x)) + self.cv2(x))
225 |
226 | #
227 | # class C3TR(C3):
228 | # """C3 module with TransformerBlock()."""
229 | #
230 | # def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
231 | # """Initialize C3Ghost module with GhostBottleneck()."""
232 | # super().__init__(c1, c2, n, shortcut, g, e)
233 | # c_ = int(c2 * e)
234 | # self.m = TransformerBlock(c_, c_, 4, n)
235 |
236 |
237 | class C3Ghost(C3):
238 | """C3 module with GhostBottleneck()."""
239 |
240 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
241 | """Initialize 'SPP' module with various pooling sizes for spatial pyramid pooling."""
242 | super().__init__(c1, c2, n, shortcut, g, e)
243 | c_ = int(c2 * e) # hidden channels
244 | self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n)))
245 |
246 |
247 | class GhostBottleneck(nn.Module):
248 | """Ghost Bottleneck https://github.com/huawei-noah/ghostnet."""
249 |
250 | def __init__(self, c1, c2, k=3, s=1): # ch_in, ch_out, kernel, stride
251 | super().__init__()
252 | c_ = c2 // 2
253 | self.conv = nn.Sequential(
254 | GhostConv(c1, c_, 1, 1), # pw
255 | DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw
256 | GhostConv(c_, c2, 1, 1, act=False)) # pw-linear
257 | self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1,
258 | act=False)) if s == 2 else nn.Identity()
259 |
260 | def forward(self, x):
261 | """Applies skip connection and concatenation to input tensor."""
262 | return self.conv(x) + self.shortcut(x)
263 |
264 |
265 | class Bottleneck(nn.Module):
266 | """Standard bottleneck."""
267 |
268 | def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): # ch_in, ch_out, shortcut, groups, kernels, expand
269 | super().__init__()
270 | c_ = int(c2 * e) # hidden channels
271 | self.cv1 = Conv(c1, c_, k[0], 1)
272 | self.cv2 = Conv(c_, c2, k[1], 1, g=g)
273 | self.add = shortcut and c1 == c2
274 |
275 | def forward(self, x):
276 | """'forward()' applies the YOLOv5 FPN to input data."""
277 | return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
278 |
279 |
280 | class BottleneckCSP(nn.Module):
281 | """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
282 |
283 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
284 | super().__init__()
285 | c_ = int(c2 * e) # hidden channels
286 | self.cv1 = Conv(c1, c_, 1, 1)
287 | self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
288 | self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
289 | self.cv4 = Conv(2 * c_, c2, 1, 1)
290 | self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3)
291 | self.act = nn.SiLU()
292 | self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
293 |
294 | def forward(self, x):
295 | """Applies a CSP bottleneck with 3 convolutions."""
296 | y1 = self.cv3(self.m(self.cv1(x)))
297 | y2 = self.cv2(x)
298 | return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))
299 |
--------------------------------------------------------------------------------
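A shape-check sketch (illustrative only) for two of the blocks defined above.

import torch
from omdet.omdet_v2_turbo.block import SPPF, C2f

x = torch.randn(1, 64, 32, 32)
print(SPPF(64, 128)(x).shape)     # torch.Size([1, 128, 32, 32]) - spatial size preserved
print(C2f(64, 64, n=2)(x).shape)  # torch.Size([1, 64, 32, 32])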
/omdet/omdet_v2_turbo/build_components.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from detectron2.utils.logger import _log_api_usage
3 | from detectron2.utils.registry import Registry
4 |
5 | TRANSFORMER_ENCODER_REGISTRY = Registry("TRANSFORMER_ENCODER") # noqa F401 isort:skip
6 | TRANSFORMER_ENCODER_REGISTRY.__doc__ = """
7 | """
8 |
9 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_DECODER") # noqa F401 isort:skip
10 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """Registry for transformer decoder modules, keyed by cfg.MODEL.TRANSFORMER_DECODER."""
11 |
12 | DETR_HEAD_REGISTRY = Registry("DETR_HEAD") # noqa F401 isort:skip
13 | DETR_HEAD_REGISTRY.__doc__ = """Registry for detection head modules, keyed by cfg.MODEL.HEAD."""
14 |
15 |
16 | def build_encoder_model(cfg):
17 | """
18 | Build the transformer encoder (neck), defined by ``cfg.MODEL.TRANSFORMER_ENCODER``.
19 | Note that it does not load any weights from ``cfg``.
20 | """
21 | encoder = cfg.MODEL.TRANSFORMER_ENCODER
22 | mode_class = TRANSFORMER_ENCODER_REGISTRY.get(encoder)
23 | model = mode_class(**mode_class.from_config(cfg))
24 | # model = TRANSFORMER_ENCODER_REGISTRY.get(encoder)(cfg)
25 | model.to(torch.device(cfg.MODEL.DEVICE))
26 | _log_api_usage("modeling.transfor_encoder." + encoder)
27 | return model
28 |
29 |
30 | def build_decoder_model(cfg):
31 | """
32 | Build the transformer decoder, defined by ``cfg.MODEL.TRANSFORMER_DECODER``.
33 | Note that it does not load any weights from ``cfg``.
34 | """
35 | decoder = cfg.MODEL.TRANSFORMER_DECODER
36 | mode_class = TRANSFORMER_DECODER_REGISTRY.get(decoder)
37 | model = mode_class(**mode_class.from_config(cfg))
38 | # model = TRANSFORMER_DECODER_REGISTRY.get(decoder)(cfg)
39 | model.to(torch.device(cfg.MODEL.DEVICE))
40 | _log_api_usage("modeling.transfor_encoder." + decoder)
41 | return model
42 |
43 |
44 | def build_detr_head(cfg):
45 | """
46 | Build the detection head, defined by ``cfg.MODEL.HEAD``.
47 | Note that it does not load any weights from ``cfg``.
48 | """
49 | head = cfg.MODEL.HEAD
50 | # model = DETR_HEAD_REGISTRY.get(head)(cfg)
51 | mode_class = DETR_HEAD_REGISTRY.get(head)
52 | model = mode_class(**mode_class.from_config(cfg))
53 | model.to(torch.device(cfg.MODEL.DEVICE))
54 | _log_api_usage("modeling.transfor_encoder." + head)
55 | return model
--------------------------------------------------------------------------------
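A hedged sketch of the registration pattern these builders expect: a component registers under a name, exposes from_config(cfg), and is instantiated from the corresponding cfg key. MyEncoder is a made-up example, not a class from the repo.

import torch.nn as nn
from omdet.omdet_v2_turbo.build_components import TRANSFORMER_ENCODER_REGISTRY

@TRANSFORMER_ENCODER_REGISTRY.register()
class MyEncoder(nn.Module):
    def __init__(self, hidden_dim=256):
        super().__init__()
        self.proj = nn.Linear(hidden_dim, hidden_dim)

    @classmethod
    def from_config(cls, cfg):
        return {"hidden_dim": cfg.MODEL.ELAEncoder.hidden_dim}

# With cfg.MODEL.TRANSFORMER_ENCODER = "MyEncoder", build_encoder_model(cfg)
# instantiates MyEncoder(**MyEncoder.from_config(cfg)) and moves it to cfg.MODEL.DEVICE.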
/omdet/omdet_v2_turbo/config.py:
--------------------------------------------------------------------------------
1 | from detectron2.config import CfgNode as CN
2 | from omdet.modeling.backbone.config import add_backbone_config
3 |
4 |
5 | def add_omdet_v2_turbo_config(cfg):
6 | """
7 | Add config options for OmDet V2 Turbo.
8 | """
9 | cfg.MODEL.HEAD = "DINOHead"
10 | cfg.MODEL.LOSS = "DINOLoss"
11 | cfg.MODEL.TRANSFORMER_ENCODER = "ELAEncoder"
12 | cfg.MODEL.TRANSFORMER_DECODER = "ELADecoder"
13 |
14 | cfg.MODEL.LANGUAGE_BACKBONE = CN()
15 | cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "clip"
16 | cfg.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 512
17 |
18 | # Task Head
19 | cfg.MODEL.ELAEncoder = CN()
20 | cfg.MODEL.ELAEncoder.in_channels = [192, 384, 768]
21 | cfg.MODEL.ELAEncoder.feat_strides = [8, 16, 32]
22 | cfg.MODEL.ELAEncoder.hidden_dim = 256
23 | cfg.MODEL.ELAEncoder.use_encoder_idx = [2]
24 | cfg.MODEL.ELAEncoder.num_encoder_layers = 1
25 | cfg.MODEL.ELAEncoder.encoder_layer = 'TransformerLayer'
26 | cfg.MODEL.ELAEncoder.pe_temperature = 10000
27 | cfg.MODEL.ELAEncoder.expansion = 1.0
28 | cfg.MODEL.ELAEncoder.depth_mult = 1.0
29 | cfg.MODEL.ELAEncoder.act = 'silu'
30 | cfg.MODEL.ELAEncoder.eval_size = None
31 | cfg.MODEL.ELAEncoder.dim_feedforward=1024
32 |
33 | cfg.MODEL.ELADecoder = CN()
34 | cfg.MODEL.ELADecoder.hidden_dim = 256
35 | cfg.MODEL.ELADecoder.num_queries = 300
36 | cfg.MODEL.ELADecoder.position_embed_type = 'sine'
37 | cfg.MODEL.ELADecoder.backbone_feat_channels = [256, 256, 256]
38 | cfg.MODEL.ELADecoder.feat_strides = [8, 16, 32]
39 | cfg.MODEL.ELADecoder.num_levels = 3
40 | cfg.MODEL.ELADecoder.num_decoder_points = 4
41 | cfg.MODEL.ELADecoder.nhead = 8
42 | cfg.MODEL.ELADecoder.num_decoder_layers = 3
43 | cfg.MODEL.ELADecoder.dim_feedforward = 1024
44 | cfg.MODEL.ELADecoder.dropout = 0.0
45 | cfg.MODEL.ELADecoder.activation = 'relu'
46 | cfg.MODEL.ELADecoder.num_denoising = 100
47 | cfg.MODEL.ELADecoder.label_noise_ratio = 0.5
48 | cfg.MODEL.ELADecoder.box_noise_scale = 1.0
49 | cfg.MODEL.ELADecoder.learnt_init_query = True
50 | cfg.MODEL.ELADecoder.eval_size = None
51 | cfg.MODEL.ELADecoder.eval_idx = -1
52 | cfg.MODEL.ELADecoder.eps = 1e-2
53 | cfg.MODEL.ELADecoder.cls_type = 'cosine'
54 |
55 | cfg.MODEL.FUSE_TYPE = None
56 |
57 | cfg.INPUT.RANDOM_CROP = None
58 | cfg.INPUT.RANDOM_CONTRAST = None
59 | cfg.INPUT.RANDOM_BRIGHTNESS = None
60 | cfg.INPUT.RANDOM_SATURATION = None
61 |
62 | cfg.MODEL.DEPLOY_MODE = False
63 |
64 | add_backbone_config(cfg)
--------------------------------------------------------------------------------
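A minimal sketch of creating the config these defaults live in and overriding a couple of them; the override values are arbitrary examples.

from detectron2.config import get_cfg
from omdet.omdet_v2_turbo.config import add_omdet_v2_turbo_config

cfg = get_cfg()
add_omdet_v2_turbo_config(cfg)
cfg.merge_from_list([
    "MODEL.ELADecoder.num_queries", "100",
    "MODEL.ELADecoder.num_decoder_layers", "6",
])
print(cfg.MODEL.ELAEncoder.hidden_dim)   # 256 (default set above)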
/omdet/omdet_v2_turbo/conv.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 |
7 | __all__ = ('Conv', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv',
8 | 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'RepConv')
9 |
10 |
11 | def autopad(k, p=None, d=1): # kernel, padding, dilation
12 | """Pad to 'same' shape outputs."""
13 | if d > 1:
14 | k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size
15 | if p is None:
16 | p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
17 | return p
18 |
19 |
20 | class Conv(nn.Module):
21 | """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
22 | default_act = nn.SiLU() # default activation
23 |
24 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
25 | """Initialize Conv layer with given arguments including activation."""
26 | super().__init__()
27 | self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
28 | self.bn = nn.BatchNorm2d(c2)
29 | self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
30 |
31 | def forward(self, x):
32 | """Apply convolution, batch normalization and activation to input tensor."""
33 | return self.act(self.bn(self.conv(x)))
34 |
35 | def forward_fuse(self, x):
36 | """Perform transposed convolution of 2D data."""
37 | return self.act(self.conv(x))
38 |
39 |
40 | class Conv2(Conv):
41 | """Simplified RepConv module with Conv fusing."""
42 |
43 | def __init__(self, c1, c2, k=3, s=1, p=None, g=1, d=1, act=True):
44 | """Initialize Conv layer with given arguments including activation."""
45 | super().__init__(c1, c2, k, s, p, g=g, d=d, act=act)
46 | self.cv2 = nn.Conv2d(c1, c2, 1, s, autopad(1, p, d), groups=g, dilation=d, bias=False) # add 1x1 conv
47 |
48 | def forward(self, x):
49 | """Apply convolution, batch normalization and activation to input tensor."""
50 | return self.act(self.bn(self.conv(x) + self.cv2(x)))
51 |
52 | def fuse_convs(self):
53 | """Fuse parallel convolutions."""
54 | w = torch.zeros_like(self.conv.weight.data)
55 | i = [x // 2 for x in w.shape[2:]]
56 | w[:, :, i[0]:i[0] + 1, i[1]:i[1] + 1] = self.cv2.weight.data.clone()
57 | self.conv.weight.data += w
58 | self.__delattr__('cv2')
59 |
60 |
61 | class LightConv(nn.Module):
62 | """Light convolution with args(ch_in, ch_out, kernel).
63 | https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
64 | """
65 |
66 | def __init__(self, c1, c2, k=1, act=nn.ReLU()):
67 | """Initialize Conv layer with given arguments including activation."""
68 | super().__init__()
69 | self.conv1 = Conv(c1, c2, 1, act=False)
70 | self.conv2 = DWConv(c2, c2, k, act=act)
71 |
72 | def forward(self, x):
73 | """Apply 2 convolutions to input tensor."""
74 | return self.conv2(self.conv1(x))
75 |
76 |
77 | class DWConv(Conv):
78 | """Depth-wise convolution."""
79 |
80 | def __init__(self, c1, c2, k=1, s=1, d=1, act=True): # ch_in, ch_out, kernel, stride, dilation, activation
81 | super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act)
82 |
83 |
84 | class DWConvTranspose2d(nn.ConvTranspose2d):
85 | """Depth-wise transpose convolution."""
86 |
87 | def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): # ch_in, ch_out, kernel, stride, padding, padding_out
88 | super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2))
89 |
90 |
91 | class ConvTranspose(nn.Module):
92 | """Convolution transpose 2d layer."""
93 | default_act = nn.SiLU() # default activation
94 |
95 | def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
96 | """Initialize ConvTranspose2d layer with batch normalization and activation function."""
97 | super().__init__()
98 | self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn)
99 | self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity()
100 | self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
101 |
102 | def forward(self, x):
103 | """Applies transposed convolutions, batch normalization and activation to input."""
104 | return self.act(self.bn(self.conv_transpose(x)))
105 |
106 | def forward_fuse(self, x):
107 | """Applies activation and convolution transpose operation to input."""
108 | return self.act(self.conv_transpose(x))
109 |
110 |
111 | class Focus(nn.Module):
112 | """Focus wh information into c-space."""
113 |
114 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
115 | super().__init__()
116 | self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act)
117 | # self.contract = Contract(gain=2)
118 |
119 | def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2)
120 | return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1))
121 | # return self.conv(self.contract(x))
122 |
123 |
124 | class GhostConv(nn.Module):
125 | """Ghost Convolution https://github.com/huawei-noah/ghostnet."""
126 |
127 | def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups
128 | super().__init__()
129 | c_ = c2 // 2 # hidden channels
130 | self.cv1 = Conv(c1, c_, k, s, None, g, act=act)
131 | self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act)
132 |
133 | def forward(self, x):
134 | """Forward propagation through a Ghost Bottleneck layer with skip connection."""
135 | y = self.cv1(x)
136 | return torch.cat((y, self.cv2(y)), 1)
137 |
138 |
139 | class RepConv(nn.Module):
140 | """RepConv is a basic rep-style block, including training and deploy status
141 | This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
142 | """
143 | default_act = nn.SiLU() # default activation
144 |
145 | def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
146 | super().__init__()
147 | assert k == 3 and p == 1
148 | self.g = g
149 | self.c1 = c1
150 | self.c2 = c2
151 | self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
152 |
153 | self.bn = nn.BatchNorm2d(num_features=c1) if bn and c2 == c1 and s == 1 else None
154 | self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False)
155 | self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False)
156 |
157 | def forward_fuse(self, x):
158 | """Forward process"""
159 | return self.act(self.conv(x))
160 |
161 | def forward(self, x):
162 | """Forward process"""
163 | id_out = 0 if self.bn is None else self.bn(x)
164 | return self.act(self.conv1(x) + self.conv2(x) + id_out)
165 |
166 | def get_equivalent_kernel_bias(self):
167 | kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
168 | kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
169 | kernelid, biasid = self._fuse_bn_tensor(self.bn)
170 | return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
171 |
172 | def _avg_to_3x3_tensor(self, avgp):
173 | channels = self.c1
174 | groups = self.g
175 | kernel_size = avgp.kernel_size
176 | input_dim = channels // groups
177 | k = torch.zeros((channels, input_dim, kernel_size, kernel_size))
178 | k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2
179 | return k
180 |
181 | def _pad_1x1_to_3x3_tensor(self, kernel1x1):
182 | if kernel1x1 is None:
183 | return 0
184 | else:
185 | return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
186 |
187 | def _fuse_bn_tensor(self, branch):
188 | if branch is None:
189 | return 0, 0
190 | if isinstance(branch, Conv):
191 | kernel = branch.conv.weight
192 | running_mean = branch.bn.running_mean
193 | running_var = branch.bn.running_var
194 | gamma = branch.bn.weight
195 | beta = branch.bn.bias
196 | eps = branch.bn.eps
197 | elif isinstance(branch, nn.BatchNorm2d):
198 | if not hasattr(self, 'id_tensor'):
199 | input_dim = self.c1 // self.g
200 | kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
201 | for i in range(self.c1):
202 | kernel_value[i, i % input_dim, 1, 1] = 1
203 | self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
204 | kernel = self.id_tensor
205 | running_mean = branch.running_mean
206 | running_var = branch.running_var
207 | gamma = branch.weight
208 | beta = branch.bias
209 | eps = branch.eps
210 | std = (running_var + eps).sqrt()
211 | t = (gamma / std).reshape(-1, 1, 1, 1)
212 | return kernel * t, beta - running_mean * gamma / std
213 |
214 | def fuse_convs(self):
215 | if hasattr(self, 'conv'):
216 | return
217 | kernel, bias = self.get_equivalent_kernel_bias()
218 | self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels,
219 | out_channels=self.conv1.conv.out_channels,
220 | kernel_size=self.conv1.conv.kernel_size,
221 | stride=self.conv1.conv.stride,
222 | padding=self.conv1.conv.padding,
223 | dilation=self.conv1.conv.dilation,
224 | groups=self.conv1.conv.groups,
225 | bias=True).requires_grad_(False)
226 | self.conv.weight.data = kernel
227 | self.conv.bias.data = bias
228 | for para in self.parameters():
229 | para.detach_()
230 | self.__delattr__('conv1')
231 | self.__delattr__('conv2')
232 | if hasattr(self, 'nm'):
233 | self.__delattr__('nm')
234 | if hasattr(self, 'bn'):
235 | self.__delattr__('bn')
236 | if hasattr(self, 'id_tensor'):
237 | self.__delattr__('id_tensor')
238 |
239 |
240 | class ChannelAttention(nn.Module):
241 | """Channel-attention module https://github.com/open-mmlab/mmdetection/tree/v3.0.0rc1/configs/rtmdet."""
242 |
243 | def __init__(self, channels: int) -> None:
244 | super().__init__()
245 | self.pool = nn.AdaptiveAvgPool2d(1)
246 | self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
247 | self.act = nn.Sigmoid()
248 |
249 | def forward(self, x: torch.Tensor) -> torch.Tensor:
250 | return x * self.act(self.fc(self.pool(x)))
251 |
252 |
253 | class SpatialAttention(nn.Module):
254 | """Spatial-attention module."""
255 |
256 | def __init__(self, kernel_size=7):
257 | """Initialize Spatial-attention module with kernel size argument."""
258 | super().__init__()
259 | assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
260 | padding = 3 if kernel_size == 7 else 1
261 | self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
262 | self.act = nn.Sigmoid()
263 |
264 | def forward(self, x):
265 | """Apply channel and spatial attention on input for feature recalibration."""
266 | return x * self.act(self.cv1(torch.cat([torch.mean(x, 1, keepdim=True), torch.max(x, 1, keepdim=True)[0]], 1)))
267 |
268 |
269 | class CBAM(nn.Module):
270 | """Convolutional Block Attention Module."""
271 |
272 | def __init__(self, c1, kernel_size=7): # ch_in, kernels
273 | super().__init__()
274 | self.channel_attention = ChannelAttention(c1)
275 | self.spatial_attention = SpatialAttention(kernel_size)
276 |
277 | def forward(self, x):
278 | """Applies the forward pass through C1 module."""
279 | return self.spatial_attention(self.channel_attention(x))
280 |
281 |
282 | class Concat(nn.Module):
283 | """Concatenate a list of tensors along dimension."""
284 |
285 | def __init__(self, dimension=1):
286 | """Concatenates a list of tensors along a specified dimension."""
287 | super().__init__()
288 | self.d = dimension
289 |
290 | def forward(self, x):
291 | """Forward pass for the YOLOv8 mask Proto module."""
292 | return torch.cat(x, self.d)
293 |
--------------------------------------------------------------------------------
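An illustrative equivalence check (not from the repo) that RepConv.fuse_convs() folds the 3x3 and 1x1 branches into a single conv; eval() matters so the BatchNorm layers use their running statistics, which is what the fusion math assumes.

import torch
from omdet.omdet_v2_turbo.conv import RepConv

m = RepConv(32, 32, k=3, s=1, p=1).eval()
x = torch.randn(1, 32, 16, 16)
with torch.no_grad():
    y_multi = m(x)            # 3x3 branch + 1x1 branch (+ identity BN when enabled)
    m.fuse_convs()            # folds everything into m.conv
    y_fused = m.forward_fuse(x)
print(torch.allclose(y_multi, y_fused, atol=1e-5))   # expected: True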
/omdet/omdet_v2_turbo/detector.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import copy
6 | from typing import Tuple
7 |
8 | import numpy as np
9 | # import open_clip
10 | from detectron2.structures import Boxes, ImageList, Instances
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | from detectron2.modeling import detector_postprocess
15 | from detectron2.layers import batched_nms
16 | from detectron2.modeling import build_backbone
17 | from omdet.omdet_v2_turbo.build_components import build_encoder_model, build_decoder_model, build_detr_head
18 | from detectron2.config import configurable
19 | from omdet.modeling.language_backbone import build_language_backbone
20 | from detectron2.utils.logger import setup_logger
21 | from ..modeling.language_backbone.clip.models import clip as clip
22 | from .torch_utils import bbox_cxcywh_to_xyxy
23 | __all__ = ['OmDetV2Turbo']
24 |
25 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
26 |
27 | from ..utils.cache import LRUCache
28 |
29 | from huggingface_hub import PyTorchModelHubMixin
30 |
31 |
32 | @META_ARCH_REGISTRY.register()
33 | class OmDetV2Turbo(nn.Module, PyTorchModelHubMixin):
34 |
35 | @configurable
36 | def __init__(self, cfg):
37 | super(OmDetV2Turbo, self).__init__()
38 | self.cfg = cfg
39 | self.logger = setup_logger(name=__name__)
40 |
41 | self.backbone = build_backbone(cfg)
42 | self.decoder = build_decoder_model(cfg)
43 | self.neck = build_encoder_model(cfg)
44 | self.loss_head = build_detr_head(cfg)
45 | self.device = cfg.MODEL.DEVICE
46 |
47 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
48 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
49 | normalizer = lambda x: (x - pixel_mean) / pixel_std
50 | self.normalizer = normalizer
51 |
52 | self.size_divisibility = self.backbone.size_divisibility
53 | self.nms_test_th = 0.0
54 | self.conf_test_th = 0.0
55 | self.loss_type = 'FOCAL'
56 | self.use_language_cache = True
57 | self.language_encoder_type = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE
58 | self.num_proposals = cfg.MODEL.ELADecoder.num_queries
59 |
60 | # Build language Encoder
61 | self.language_backbone = build_language_backbone(cfg)
62 | self.language_cache_label = LRUCache(100)
63 | self.language_cache_prompt = LRUCache(100)
64 |
65 |
66 | @classmethod
67 | def from_config(cls, cfg, *args, **kwargs):
68 | return {
69 | 'cfg': cfg
70 | }
71 |
72 | def preprocess_image(self, batched_inputs):
73 | """
74 | Normalize, pad and batch the input images.
75 | """
76 | images = [self.normalizer(x["image"].to(self.device)) for x in batched_inputs]
77 | images = ImageList.from_tensors(images, self.size_divisibility)
78 |
79 | images_whwh = list()
80 | for bi in batched_inputs:
81 | h, w = bi["image"].shape[-2:]
82 | images_whwh.append(torch.tensor([w, h, w, h], dtype=torch.float32, device=self.device))
83 | images_whwh = torch.stack(images_whwh)
84 | ann_types = [x["ann_type"] if "ann_type" in x else "box" for x in batched_inputs]
85 | return images, images_whwh, ann_types
86 |
87 | def gen_output(self, box_cls, box_pred, batched_inputs, images, score_thresh, nms_thresh, do_postprocess,
88 | max_num_det=None):
89 | results = self.inference(box_cls, box_pred, images.image_sizes, score_thresh, nms_thresh, max_num_det)
90 |
91 | if do_postprocess:
92 | processed_results = []
93 | for results_per_image, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes):
94 | height = input_per_image.get("height", image_size[0])
95 | width = input_per_image.get("width", image_size[1])
96 | r = detector_postprocess(results_per_image, height, width)
97 | processed_results.append({"instances": r})
98 | results = processed_results
99 | return results
100 |
101 | def inference(self, box_cls, box_pred, image_sizes, score_thresh=None, nms_thresh=None, max_num_det=None):
102 | assert len(box_cls) == len(image_sizes)
103 | if score_thresh is None:
104 | score_thresh = self.conf_test_th
105 |
106 | if nms_thresh is None:
107 | nms_thresh = self.nms_test_th
108 |
109 | num_classes = box_cls.shape[2]
110 | scores, labels = self.compute_score(box_cls)
111 | results = []
112 | if self.loss_type in {"FOCAL", "BCE"}:
113 | for i, (scores_img, box_per_img, image_size) in enumerate(zip(scores, box_pred, image_sizes
114 | )):
115 | results.append(self.inference_single_image(box_per_img, scores_img, labels, image_size, num_classes,
116 | score_thresh=score_thresh,
117 | nms_thresh=nms_thresh,
118 | max_num_det=max_num_det))
119 | else:
120 | for i, (scores_img, label_img, box_per_img, image_size) in enumerate(zip(
121 | scores, labels, box_pred, image_sizes
122 | )):
123 | results.append(
124 | self.inference_single_image(box_per_img, scores_img, label_img, image_size, num_classes,
125 | score_thresh=score_thresh,
126 | nms_thresh=nms_thresh,
127 | max_num_det=max_num_det))
128 |
129 | return results
130 |
131 | def inference_single_image(self, boxes, scores, labels,
132 | image_size: Tuple[int, int],
133 | num_classes: int,
134 | score_thresh: float,
135 | nms_thresh: float,
136 | max_num_det: int = None):
137 | """
138 | Run inference for a single image (adapted from `fast_rcnn_inference_single_image`).
139 | Args:
140 | boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic
141 | boxes for each image. Element i has shape (Ri, K * 4) if doing
142 | class-specific regression, or (Ri, 4) if doing class-agnostic
143 | regression, where Ri is the number of predicted objects for image i.
144 | This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`.
145 | scores (list[Tensor]): A list of Tensors of predicted class scores for each image.
146 | Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
147 | for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`.
148 | image_size (list[tuple]): A list of (width, height) tuples for each image in the batch.
149 | score_thresh (float): Only return detections with a confidence score exceeding this
150 | threshold.
151 | nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1].
152 | Returns:
153 | instances: (list[Instances]): A list of N instances, one for each image in the batch,
154 | that stores the topk most confidence detections.
155 | kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates
156 | the corresponding boxes/scores index in [0, Ri) from the input, for image i.
157 | """
158 | # scores_per_image: num_proposal
159 | # labels_per_image: num_proposal
160 | # boxes_per_image: num_proposal x 4
161 | if self.loss_type in {"FOCAL", "BCE"}:
162 | proposal_num = len(boxes) if max_num_det is None else max_num_det
163 | scores_per_image, topk_indices = scores.flatten(0, 1).topk(proposal_num, sorted=False)
164 | labels_per_image = labels[topk_indices]
165 | box_pred_per_image = boxes.view(-1, 1, 4).repeat(1, num_classes, 1).view(-1, 4)
166 | box_pred_per_image = box_pred_per_image[topk_indices]
167 | else:
168 | box_pred_per_image = boxes
169 | scores_per_image = scores
170 | labels_per_image = labels
171 |
172 | # Score filtering
173 | box_pred_per_image = bbox_cxcywh_to_xyxy(box_pred_per_image) * torch.tensor(image_size).repeat(2).to(self.device)
174 | filter_mask = scores_per_image > score_thresh # R x K
175 | score_keep = filter_mask.nonzero(as_tuple=False).view(-1)
176 | box_pred_per_image = box_pred_per_image[score_keep]
177 | scores_per_image = scores_per_image[score_keep]
178 | labels_per_image = labels_per_image[score_keep]
179 |
180 | # NMS
181 |         scores_per_image = scores_per_image.to(self.device)  # .to() is not in-place
182 | keep = batched_nms(box_pred_per_image, scores_per_image, labels_per_image, nms_thresh)
183 | box_pred_per_image = box_pred_per_image[keep]
184 | scores_per_image = scores_per_image[keep]
185 | labels_per_image = labels_per_image[keep]
186 |
187 | # create an instance
188 | result = Instances(image_size)
189 | result.pred_boxes = Boxes(box_pred_per_image)
190 | result.pred_boxes.clip(image_size)
191 | result.scores = scores_per_image
192 | result.pred_classes = labels_per_image
193 |
194 | return result
195 |
196 | def compute_score(self, box_cls):
197 | """
198 | Args:
199 | box_cls: tensor of shape (batch_size, num_proposals, K).
200 | The tensor predicts the classification probability for each proposal.
201 |         Returns:
202 |             (scores, labels): sigmoid scores with a flattened class-label grid for FOCAL/BCE losses; otherwise the softmax argmax scores and labels.
203 |         """
204 | if self.loss_type in {"FOCAL", "BCE"}:
205 | num_classes = box_cls.shape[2]
206 | proposal_num = box_cls.shape[1]
207 | scores = torch.sigmoid(box_cls)
208 | labels = torch.arange(num_classes, device=self.device). \
209 | unsqueeze(0).repeat(proposal_num, 1).flatten(0, 1)
210 | else:
211 | scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)
212 | # scores: batch_size x num_proposal
213 |
214 | return scores, labels
215 |
216 | def language_encode(self, batched_inputs, encode_type="task"):
217 | texts = batched_inputs
218 |
219 | if self.language_encoder_type == "clip":
220 | text_input = clip.tokenize(texts, truncate=True).to(self.device)
221 |
222 | return self.language_backbone(text_input, encode_type == "task")
223 |
224 | def get_cached_label_emb(self, labels):
225 | self.logger.info('processing labels embeddings for {}'.format(labels))
226 | not_cached_index = []
227 | not_cached_labels = []
228 | total_embs = []
229 | for idx, l in enumerate(labels):
230 | if self.language_cache_label.has(l):
231 | total_embs.append(self.language_cache_label.get(l))
232 | else:
233 | total_embs.append(None)
234 | not_cached_index.append(idx)
235 | not_cached_labels.append(l)
236 |
237 | self.logger.info('cached label emb num: {}, not cached num: {}'.format(len(total_embs) - len(not_cached_labels),
238 | len(not_cached_labels)))
239 |
240 | if not_cached_labels:
241 | embeddings = self.language_encode(not_cached_labels, encode_type="label")
242 | for idx, emb in enumerate(embeddings):
243 | idx_to_put = not_cached_index[idx]
244 | total_embs[idx_to_put] = emb
245 | self.language_cache_label.put(not_cached_labels[idx], emb)
246 |
247 | total_label_embs = torch.stack(total_embs).to(self.device)
248 | return total_label_embs
249 |
250 | def get_cached_prompt_emb(self, batched_tasks):
251 | self.logger.info('processing prompt embeddings for {}'.format(batched_tasks))
252 | not_cached_index = []
253 | not_cached_tasks = []
254 | total_task_features = []
255 | total_task_masks = []
256 | for idx, t in enumerate(batched_tasks):
257 | if self.language_cache_prompt.has(t):
258 | task_feature, task_mask = self.language_cache_prompt.get(t)
259 | total_task_features.append(task_feature)
260 | total_task_masks.append(task_mask)
261 | else:
262 | total_task_features.append(None)
263 | total_task_masks.append(None)
264 | not_cached_index.append(idx)
265 | not_cached_tasks.append(t)
266 |
267 | self.logger.info(
268 | 'cached prompt emb num: {}, not cached num: {}'.format(len(total_task_features) - len(not_cached_tasks),
269 | len(not_cached_tasks)))
270 |
271 | if not_cached_tasks:
272 | embeddings, task_masks = self.language_encode(not_cached_tasks, encode_type="task")
273 |
274 | for idx in range(embeddings.shape[1]):
275 | emb = embeddings[:, [idx], :]
276 | idx_to_put = not_cached_index[idx]
277 | cur_mask = torch.unsqueeze(task_masks[idx], dim=0).to(self.device)
278 | total_task_features[idx_to_put] = emb
279 | total_task_masks[idx_to_put] = cur_mask
280 | self.language_cache_prompt.put(not_cached_tasks[idx], (emb, cur_mask))
281 |
282 | total_prompt_features = torch.cat(total_task_features, dim=1)
283 | total_prompt_masks = torch.cat(total_task_masks, dim=0).to(self.device)
284 |
285 | return total_prompt_features, total_prompt_masks
286 |
287 | def get_language_embedding(self, batched_inputs):
288 | batched_labels = [a["label_set"] for a in batched_inputs]
289 | batched_tasks = [a['tasks'] for a in batched_inputs]
290 |
291 | max_label_size = max([len(a) for a in batched_labels])
292 | label_features = []
293 | for i, s_labels in enumerate(batched_labels):
294 | pad_size = max_label_size - len(s_labels)
295 |
296 | label_emb = self.get_cached_label_emb(s_labels)
297 | label_features.append(F.pad(label_emb, (0, 0, 0, pad_size)).unsqueeze(1).to(self.device))
298 |
299 | label_features = torch.cat(label_features, dim=1) # num_label x batch_size x dim_size
300 |
301 | # Task Features
302 | # prompt_features: max_task_len x batch_size x dim_size
303 | # prompt_mask: batch_size x max_task_len
304 | # batched_tasks = ['detect a person', 'detect dog and cat']
305 | prompt_features, prompt_mask = self.get_cached_prompt_emb(batched_tasks)
306 |
307 | return label_features, prompt_features, prompt_mask
308 |
309 | def forward(self, batched_inputs, do_postprocess=True, score_thresh=0.0, nms_thresh=1.0, debug=False):
310 | images, images_whwh, ann_types = self.preprocess_image(batched_inputs)
311 |
312 | # Backbone
313 | body_feats = self.backbone(images.tensor)
314 |
315 | if type(body_feats) is dict:
316 | body_feats = [body_feats[i] for i in body_feats.keys()]
317 |
318 | encoder_feats = self.neck(body_feats)
319 |
320 | if not self.training:
321 | # create label and prompt embeddings
322 | label_feats, prompt_feats, prompt_mask = self.get_language_embedding(batched_inputs)
323 | decoder_feats = self.decoder(encoder_feats, label_feats, prompt_feats, prompt_mask)
324 | box_pred, box_cls, _ = self.loss_head(decoder_feats)
325 |
326 | results = self.gen_output(box_cls, box_pred, batched_inputs, images,
327 | score_thresh, nms_thresh, do_postprocess,
328 | max_num_det=self.num_proposals)
329 |
330 | return results
331 |
332 | def print_trainable_parameters(self):
333 | """
334 | Prints the number of trainable parameters in the model.
335 | """
336 | trainable_params = 0
337 | all_param = 0
338 | for _, param in self.named_parameters():
339 | num_params = param.numel()
340 | # if using DS Zero 3 and the weights are initialized empty
341 | if num_params == 0 and hasattr(param, "ds_numel"):
342 | num_params = param.ds_numel
343 |
344 | all_param += num_params
345 | if param.requires_grad:
346 | trainable_params += num_params
347 | print(
348 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
349 | )
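350 | 
351 | 
352 | # ---------------------------------------------------------------------------
353 | # Editor's sketch (not part of the model): a minimal, self-contained walk
354 | # through the FOCAL/BCE post-processing used in `inference_single_image` --
355 | # sigmoid scores, top-k over the flattened (proposal, class) grid, score
356 | # thresholding and class-aware NMS. Shapes and thresholds are made up for the
357 | # demo, and torchvision's `batched_nms` stands in for the detectron2 version.
358 | # ---------------------------------------------------------------------------
359 | if __name__ == "__main__":
360 |     from torchvision.ops import batched_nms as tv_batched_nms
361 | 
362 |     num_props, num_cls = 300, 5
363 |     cls_logits = torch.randn(num_props, num_cls)             # raw class logits
364 |     boxes_cxcywh = torch.rand(num_props, 4)                  # normalized cxcywh boxes
365 | 
366 |     scores = torch.sigmoid(cls_logits)                       # (N, K)
367 |     labels = torch.arange(num_cls).repeat(num_props)         # class id for each flat index
368 |     top_scores, top_idx = scores.flatten(0, 1).topk(num_props, sorted=False)
369 |     top_labels = labels[top_idx]
370 |     top_boxes = boxes_cxcywh.view(-1, 1, 4).repeat(1, num_cls, 1).view(-1, 4)[top_idx]
371 | 
372 |     # scale normalized cxcywh boxes to absolute xyxy for a square 640x640 image
373 |     xyxy = torch.cat([top_boxes[:, :2] - 0.5 * top_boxes[:, 2:],
374 |                       top_boxes[:, :2] + 0.5 * top_boxes[:, 2:]], dim=-1) * 640.0
375 | 
376 |     keep = top_scores > 0.3                                  # score filtering
377 |     xyxy, top_scores, top_labels = xyxy[keep], top_scores[keep], top_labels[keep]
378 |     keep = tv_batched_nms(xyxy, top_scores, top_labels, iou_threshold=0.5)
379 |     print("kept detections:", keep.numel())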
--------------------------------------------------------------------------------
/omdet/omdet_v2_turbo/detr_torch.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from typing import Optional, List
3 |
4 | import torch
5 | import torch.nn.functional as F
6 | from torch import nn, Tensor
7 |
8 |
9 | class Transformer(nn.Module):
10 |
11 | def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
12 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
13 | activation="relu", normalize_before=False,
14 | return_intermediate_dec=False):
15 | super().__init__()
16 |
17 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
18 | dropout, activation, normalize_before)
19 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
20 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
21 |
22 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
23 | dropout, activation, normalize_before)
24 | decoder_norm = nn.LayerNorm(d_model)
25 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
26 | return_intermediate=return_intermediate_dec)
27 |
28 | self._reset_parameters()
29 |
30 | self.d_model = d_model
31 | self.nhead = nhead
32 |
33 | def _reset_parameters(self):
34 | for p in self.parameters():
35 | if p.dim() > 1:
36 | nn.init.xavier_uniform_(p)
37 |
38 | def forward(self, src, mask, query_embed, pos_embed):
39 | # flatten NxCxHxW to HWxNxC
40 | bs, c, h, w = src.shape
41 | src = src.flatten(2).permute(2, 0, 1)
42 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
43 | query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
44 | mask = mask.flatten(1)
45 |
46 | tgt = torch.zeros_like(query_embed)
47 | memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
48 | hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
49 | pos=pos_embed, query_pos=query_embed)
50 | return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
51 |
52 |
53 | class TransformerEncoder(nn.Module):
54 | def __init__(self, encoder_layer, num_layers, norm=None):
55 | super(TransformerEncoder, self).__init__()
56 |         # NOTE: a single shared layer is used (not `num_layers` clones); it is driven by ELAEncoder via the `pos_embed` keyword.
57 |         self.layers = [encoder_layer]
58 | self.num_layers = num_layers
59 | self.norm = norm
60 |
61 | def forward(self, src, src_mask=None, pos_embed=None):
62 | output = src
63 | pos_embed = pos_embed.clone().detach() if pos_embed is not None else pos_embed
64 | for layer in self.layers:
65 | output = layer(output, src_mask=src_mask, pos_embed=pos_embed)
66 |
67 | if self.norm is not None:
68 | output = self.norm(output)
69 |
70 | return output
71 |
72 |
73 | class TransformerDecoder(nn.Module):
74 |
75 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
76 | super().__init__()
77 | self.layers = _get_clones(decoder_layer, num_layers)
78 | self.num_layers = num_layers
79 | self.norm = norm
80 | self.return_intermediate = return_intermediate
81 |
82 | def forward(self, tgt, memory,
83 | tgt_mask: Optional[Tensor] = None,
84 | memory_mask: Optional[Tensor] = None,
85 | tgt_key_padding_mask: Optional[Tensor] = None,
86 | memory_key_padding_mask: Optional[Tensor] = None,
87 | pos: Optional[Tensor] = None,
88 | query_pos: Optional[Tensor] = None):
89 | output = tgt
90 |
91 | intermediate = []
92 |
93 | for layer in self.layers:
94 | output = layer(output, memory, tgt_mask=tgt_mask,
95 | memory_mask=memory_mask,
96 | tgt_key_padding_mask=tgt_key_padding_mask,
97 | memory_key_padding_mask=memory_key_padding_mask,
98 | pos=pos, query_pos=query_pos)
99 | if self.return_intermediate:
100 | intermediate.append(self.norm(output))
101 |
102 | if self.norm is not None:
103 | output = self.norm(output)
104 | if self.return_intermediate:
105 | intermediate.pop()
106 | intermediate.append(output)
107 |
108 | if self.return_intermediate:
109 | return torch.stack(intermediate)
110 |
111 | return output.unsqueeze(0)
112 |
113 |
114 | class TransformerEncoderLayer(nn.Module):
115 |
116 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
117 | activation="relu", normalize_before=False):
118 | super().__init__()
119 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
120 | # Implementation of Feedforward model
121 | self.linear1 = nn.Linear(d_model, dim_feedforward)
122 | self.dropout = nn.Dropout(dropout)
123 | self.linear2 = nn.Linear(dim_feedforward, d_model)
124 |
125 | self.norm1 = nn.LayerNorm(d_model)
126 | self.norm2 = nn.LayerNorm(d_model)
127 | self.dropout1 = nn.Dropout(dropout)
128 | self.dropout2 = nn.Dropout(dropout)
129 |
130 | self.activation = _get_activation_fn(activation)
131 | self.normalize_before = normalize_before
132 |
133 | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
134 | return tensor if pos is None else tensor + pos
135 |
136 | def forward_post(self,
137 | src,
138 | src_mask: Optional[Tensor] = None,
139 | src_key_padding_mask: Optional[Tensor] = None,
140 | pos: Optional[Tensor] = None):
141 | q = k = self.with_pos_embed(src, pos)
142 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
143 | key_padding_mask=src_key_padding_mask)[0]
144 | src = src + self.dropout1(src2)
145 | src = self.norm1(src)
146 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
147 | src = src + self.dropout2(src2)
148 | src = self.norm2(src)
149 | return src
150 |
151 | def forward_pre(self, src,
152 | src_mask: Optional[Tensor] = None,
153 | src_key_padding_mask: Optional[Tensor] = None,
154 | pos: Optional[Tensor] = None):
155 | src2 = self.norm1(src)
156 | q = k = self.with_pos_embed(src2, pos)
157 | src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
158 | key_padding_mask=src_key_padding_mask)[0]
159 | src = src + self.dropout1(src2)
160 | src2 = self.norm2(src)
161 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
162 | src = src + self.dropout2(src2)
163 | return src
164 |
165 | def forward(self, src,
166 | src_mask: Optional[Tensor] = None,
167 | src_key_padding_mask: Optional[Tensor] = None,
168 | pos: Optional[Tensor] = None):
169 | if self.normalize_before:
170 | return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
171 | return self.forward_post(src, src_mask, src_key_padding_mask, pos)
172 |
173 |
174 | class TransformerDecoderLayer(nn.Module):
175 |
176 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
177 | activation="relu", normalize_before=False):
178 | super().__init__()
179 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
180 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
181 | # Implementation of Feedforward model
182 | self.linear1 = nn.Linear(d_model, dim_feedforward)
183 | self.dropout = nn.Dropout(dropout)
184 | self.linear2 = nn.Linear(dim_feedforward, d_model)
185 |
186 | self.norm1 = nn.LayerNorm(d_model)
187 | self.norm2 = nn.LayerNorm(d_model)
188 | self.norm3 = nn.LayerNorm(d_model)
189 | self.dropout1 = nn.Dropout(dropout)
190 | self.dropout2 = nn.Dropout(dropout)
191 | self.dropout3 = nn.Dropout(dropout)
192 |
193 | self.activation = _get_activation_fn(activation)
194 | self.normalize_before = normalize_before
195 |
196 | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
197 | return tensor if pos is None else tensor + pos
198 |
199 | def forward_post(self, tgt, memory,
200 | tgt_mask: Optional[Tensor] = None,
201 | memory_mask: Optional[Tensor] = None,
202 | tgt_key_padding_mask: Optional[Tensor] = None,
203 | memory_key_padding_mask: Optional[Tensor] = None,
204 | pos: Optional[Tensor] = None,
205 | query_pos: Optional[Tensor] = None):
206 | q = k = self.with_pos_embed(tgt, query_pos)
207 | tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
208 | key_padding_mask=tgt_key_padding_mask)[0]
209 | tgt = tgt + self.dropout1(tgt2)
210 | tgt = self.norm1(tgt)
211 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
212 | key=self.with_pos_embed(memory, pos),
213 | value=memory, attn_mask=memory_mask,
214 | key_padding_mask=memory_key_padding_mask)[0]
215 | tgt = tgt + self.dropout2(tgt2)
216 | tgt = self.norm2(tgt)
217 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
218 | tgt = tgt + self.dropout3(tgt2)
219 | tgt = self.norm3(tgt)
220 | return tgt
221 |
222 | def forward_pre(self, tgt, memory,
223 | tgt_mask: Optional[Tensor] = None,
224 | memory_mask: Optional[Tensor] = None,
225 | tgt_key_padding_mask: Optional[Tensor] = None,
226 | memory_key_padding_mask: Optional[Tensor] = None,
227 | pos: Optional[Tensor] = None,
228 | query_pos: Optional[Tensor] = None):
229 | tgt2 = self.norm1(tgt)
230 | q = k = self.with_pos_embed(tgt2, query_pos)
231 | tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
232 | key_padding_mask=tgt_key_padding_mask)[0]
233 | tgt = tgt + self.dropout1(tgt2)
234 | tgt2 = self.norm2(tgt)
235 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
236 | key=self.with_pos_embed(memory, pos),
237 | value=memory, attn_mask=memory_mask,
238 | key_padding_mask=memory_key_padding_mask)[0]
239 | tgt = tgt + self.dropout2(tgt2)
240 | tgt2 = self.norm3(tgt)
241 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
242 | tgt = tgt + self.dropout3(tgt2)
243 | return tgt
244 |
245 | def forward(self, tgt, memory,
246 | tgt_mask: Optional[Tensor] = None,
247 | memory_mask: Optional[Tensor] = None,
248 | tgt_key_padding_mask: Optional[Tensor] = None,
249 | memory_key_padding_mask: Optional[Tensor] = None,
250 | pos: Optional[Tensor] = None,
251 | query_pos: Optional[Tensor] = None):
252 | if self.normalize_before:
253 | return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
254 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
255 | return self.forward_post(tgt, memory, tgt_mask, memory_mask,
256 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
257 |
258 |
259 | def _get_clones(module, N):
260 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
261 |
262 |
263 | def build_transformer(args):
264 | return Transformer(
265 | d_model=args.hidden_dim,
266 | dropout=args.dropout,
267 | nhead=args.nheads,
268 | dim_feedforward=args.dim_feedforward,
269 | num_encoder_layers=args.enc_layers,
270 | num_decoder_layers=args.dec_layers,
271 | normalize_before=args.pre_norm,
272 | return_intermediate_dec=True,
273 | )
274 |
275 |
276 | def _get_activation_fn(activation):
277 | """Return an activation function given a string"""
278 | if activation == "relu":
279 | return F.relu
280 | if activation == "gelu":
281 | return F.gelu
282 | if activation == "glu":
283 | return F.glu
284 |     raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
285 |
286 |
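287 | # ---------------------------------------------------------------------------
288 | # Editor's sketch (illustrative only): the decoder stack above uses the
289 | # sequence-first (length, batch, d_model) layout of nn.MultiheadAttention, and
290 | # with `return_intermediate=True` it returns one output per decoder layer,
291 | # which is how DETR-style models attach auxiliary losses. All sizes below are
292 | # arbitrary demo values.
293 | # ---------------------------------------------------------------------------
294 | if __name__ == "__main__":
295 |     layer = TransformerDecoderLayer(d_model=256, nhead=8)
296 |     decoder = TransformerDecoder(layer, num_layers=2, norm=nn.LayerNorm(256),
297 |                                  return_intermediate=True)
298 |     tgt = torch.zeros(100, 2, 256)        # (num_queries, batch, d_model)
299 |     memory = torch.randn(400, 2, 256)     # (H*W, batch, d_model) encoder memory
300 |     out = decoder(tgt, memory)
301 |     print(out.shape)                      # torch.Size([2, 100, 2, 256])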
--------------------------------------------------------------------------------
/omdet/omdet_v2_turbo/dn_ops.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from omdet.utils.box_ops import xywh2xyxy, xyxy2xywh
3 |
4 |
5 | def get_cdn_group(batch,
6 | num_classes,
7 | num_queries,
8 | class_embed,
9 | num_dn=100,
10 | cls_noise_ratio=0.5,
11 | box_noise_scale=1.0,
12 | training=False,
13 | amp=False):
14 | """
15 | Get contrastive denoising training group. This function creates a contrastive denoising training group with
16 | positive and negative samples from the ground truths (gt). It applies noise to the class labels and bounding
17 | box coordinates, and returns the modified labels, bounding boxes, attention mask and meta information.
18 |
19 | Args:
20 | batch (dict): A dict that includes 'gt_cls' (torch.Tensor with shape [num_gts, ]), 'gt_bboxes'
21 | (torch.Tensor with shape [num_gts, 4]), 'gt_groups' (List(int)) which is a list of batch size length
22 | indicating the number of gts of each image.
23 | num_classes (int): Number of classes.
24 | num_queries (int): Number of queries.
25 | class_embed (torch.Tensor): Embedding weights to map class labels to embedding space.
26 | num_dn (int, optional): Number of denoising. Defaults to 100.
27 | cls_noise_ratio (float, optional): Noise ratio for class labels. Defaults to 0.5.
28 | box_noise_scale (float, optional): Noise scale for bounding box coordinates. Defaults to 1.0.
29 | training (bool, optional): If it's in training mode. Defaults to False.
30 |
31 | Returns:
32 | (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Dict]]): The modified class embeddings,
33 | bounding boxes, attention mask and meta information for denoising. If not in training mode or 'num_dn'
34 | is less than or equal to 0, the function returns None for all elements in the tuple.
35 | """
36 |
37 | if (not training) or num_dn <= 0:
38 | return None, None, None, None
39 | gt_groups = batch['gt_groups']
40 | total_num = sum(gt_groups)
41 | max_nums = max(gt_groups)
42 | if max_nums == 0:
43 | return None, None, None, None
44 |
45 | num_group = num_dn // max_nums
46 | num_group = 1 if num_group == 0 else num_group
47 | # pad gt to max_num of a batch
48 | bs = len(gt_groups)
49 | gt_cls = batch['cls'] # (bs*num, )
50 | gt_bbox = batch['bboxes'] # bs*num, 4
51 | b_idx = batch['batch_idx']
52 |
53 | # each group has positive and negative queries.
54 | dn_cls = gt_cls.repeat(2 * num_group) # (2*num_group*bs*num, )
55 | dn_bbox = gt_bbox.repeat(2 * num_group, 1) # 2*num_group*bs*num, 4
56 | dn_b_idx = b_idx.repeat(2 * num_group).view(-1).to(dn_cls.device) # (2*num_group*bs*num, )
57 |
58 | # positive and negative mask
59 | # (bs*num*num_group, ), the second total_num*num_group part as negative samples
60 | neg_idx = torch.arange(total_num * num_group, dtype=torch.long, device=gt_bbox.device) + num_group * total_num
61 |
62 | if cls_noise_ratio > 0:
63 | # half of bbox prob
64 | mask = torch.rand(dn_cls.shape) < (cls_noise_ratio * 0.5)
65 | idx = torch.nonzero(mask).squeeze(-1)
66 | # randomly put a new one here
67 | new_label = torch.randint_like(idx, 0, num_classes, dtype=dn_cls.dtype, device=dn_cls.device)
68 | dn_cls[idx] = new_label
69 |
70 | if box_noise_scale > 0:
71 | known_bbox = xywh2xyxy(dn_bbox)
72 |
73 | diff = (dn_bbox[..., 2:] * 0.5).repeat(1, 2) * box_noise_scale # 2*num_group*bs*num, 4
74 |
75 | rand_sign = torch.randint_like(dn_bbox, 0, 2) * 2.0 - 1.0
76 | rand_part = torch.rand_like(dn_bbox)
77 | rand_part[neg_idx] += 1.0
78 | rand_part *= rand_sign
79 | known_bbox += rand_part * diff
80 | known_bbox.clip_(min=0.0, max=1.0)
81 | dn_bbox = xyxy2xywh(known_bbox)
82 | dn_bbox = inverse_sigmoid(dn_bbox)
83 |
84 | # total denoising queries
85 | num_dn = int(max_nums * 2 * num_group)
86 | # class_embed = torch.cat([class_embed, torch.zeros([1, class_embed.shape[-1]], device=class_embed.device)])
87 | dn_cls_embed = class_embed[dn_cls] # bs*num * 2 * num_group, 256
88 | if amp:
89 | data_type = torch.bfloat16
90 | else:
91 | data_type = torch.float32
92 | padding_cls = torch.zeros(bs, num_dn, dn_cls_embed.shape[-1], device=gt_cls.device, dtype=data_type)
93 | padding_bbox = torch.zeros(bs, num_dn, 4, device=gt_bbox.device)
94 |
95 | map_indices = torch.cat([torch.tensor(range(num), dtype=torch.long, device=gt_cls.device) for num in gt_groups])
96 | pos_idx = torch.stack([map_indices + max_nums * i for i in range(num_group)], dim=0)
97 |
98 | map_indices = torch.cat([map_indices + max_nums * i for i in range(2 * num_group)])
99 | fix_class = dn_cls.dim() == 2
100 | if fix_class:
101 | padding_cls[(dn_b_idx, map_indices)] = dn_cls_embed
102 | else:
103 | padding_cls[(dn_b_idx.long(), map_indices)] = dn_cls_embed.transpose(1,0)[(dn_b_idx.long(), map_indices)]
104 | padding_bbox[(dn_b_idx.long(), map_indices)] = dn_bbox
105 |
106 | tgt_size = num_dn + num_queries
107 | attn_mask = torch.zeros([tgt_size, tgt_size], dtype=torch.bool)
108 |     # matching queries cannot see the reconstruction (denoising) queries
109 |     attn_mask[num_dn:, :num_dn] = True
110 |     # reconstruction queries in different groups cannot see each other
111 | for i in range(num_group):
112 | if i == 0:
113 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), max_nums * 2 * (i + 1):num_dn] = True
114 | if i == num_group - 1:
115 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), :max_nums * i * 2] = True
116 | else:
117 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), max_nums * 2 * (i + 1):num_dn] = True
118 | attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), :max_nums * 2 * i] = True
119 | dn_meta = {
120 | 'dn_pos_idx': [p.reshape(-1) for p in pos_idx.cpu().split(list(gt_groups), dim=1)],
121 | 'dn_num_group': num_group,
122 | 'dn_num_split': [num_dn, num_queries]}
123 |
124 | return padding_cls.to(class_embed.device), padding_bbox.to(class_embed.device), attn_mask.to(
125 | class_embed.device), dn_meta
126 |
127 |
128 | def inverse_sigmoid(x, eps=1e-6):
129 | """Inverse sigmoid function."""
130 | x = x.clip(min=0., max=1.)
131 | return torch.log(x / (1 - x + eps) + eps)
132 |
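133 | 
134 | # ---------------------------------------------------------------------------
135 | # Editor's sketch (illustrative only): the box-noising step of `get_cdn_group`
136 | # in isolation. Positive copies are jittered within half the box size, the
137 | # negative copy is pushed further out, and the result is mapped through
138 | # `inverse_sigmoid` so the decoder can treat it as an unactivated box target.
139 | # ---------------------------------------------------------------------------
140 | if __name__ == "__main__":
141 |     gt = torch.tensor([[0.5, 0.5, 0.2, 0.2]])        # one gt box, normalized cxcywh
142 |     dn_bbox = gt.repeat(2, 1)                         # one positive + one negative copy
143 |     neg_idx = torch.tensor([1])
144 |     diff = (dn_bbox[..., 2:] * 0.5).repeat(1, 2)      # per-coordinate jitter budget
145 |     rand_sign = torch.randint_like(dn_bbox, 0, 2) * 2.0 - 1.0
146 |     rand_part = torch.rand_like(dn_bbox)
147 |     rand_part[neg_idx] += 1.0                         # negatives land outside the box
148 |     noised = (xywh2xyxy(dn_bbox) + rand_sign * rand_part * diff).clip_(0.0, 1.0)
149 |     print(inverse_sigmoid(xyxy2xywh(noised)))         # what the decoder receives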
--------------------------------------------------------------------------------
/omdet/omdet_v2_turbo/ela_encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from .torch_utils import BaseConv, linear_init_
5 | from .block import RepC3
6 | from .detr_torch import TransformerEncoder
7 | from .build_components import TRANSFORMER_ENCODER_REGISTRY
8 |
9 | __all__ = ['ELAEncoder']
10 |
11 |
12 | class TransformerLayer(nn.Module):
13 | def __init__(self,
14 | d_model=256,
15 | nhead=8,
16 | dim_feedforward=1024,
17 | dropout=0.,
18 | activation="relu",
19 | attn_dropout=None,
20 | act_dropout=None,
21 | normalize_before=False):
22 | super(TransformerLayer, self).__init__()
23 | attn_dropout = dropout if attn_dropout is None else attn_dropout
24 | act_dropout = dropout if act_dropout is None else act_dropout
25 | self.normalize_before = normalize_before
26 |
27 | self.self_attn = torch.nn.MultiheadAttention(d_model, nhead, attn_dropout, batch_first=True)
28 | # Implementation of Feedforward model
29 | self.linear1 = nn.Linear(d_model, dim_feedforward)
30 | self.dropout = nn.Dropout(act_dropout)
31 | self.linear2 = nn.Linear(dim_feedforward, d_model)
32 |
33 | self.norm1 = nn.LayerNorm(d_model)
34 | self.norm2 = nn.LayerNorm(d_model)
35 | self.dropout1 = nn.Dropout(dropout)
36 | self.dropout2 = nn.Dropout(dropout)
37 | self.activation = getattr(F, activation)
38 | self._reset_parameters()
39 |
40 | def _reset_parameters(self):
41 | linear_init_(self.linear1)
42 | linear_init_(self.linear2)
43 |
44 | @staticmethod
45 | def with_pos_embed(tensor, pos_embed):
46 | return tensor if pos_embed is None else tensor + pos_embed
47 |
48 | def forward(self, src, src_mask=None, pos_embed=None):
49 | residual = src
50 | if self.normalize_before:
51 | src = self.norm1(src)
52 | q = k = self.with_pos_embed(src, pos_embed)
53 | src = self.self_attn(q, k, value=src, attn_mask=src_mask)
54 |         # nn.MultiheadAttention returns (attn_output, attn_weights); keep only the output
55 |         src = src[0]
56 | src = residual + self.dropout1(src)
57 | if not self.normalize_before:
58 | src = self.norm1(src)
59 |
60 | residual = src
61 | if self.normalize_before:
62 | src = self.norm2(src)
63 | src = self.linear2(self.dropout(self.activation(self.linear1(src))))
64 | src = residual + self.dropout2(src)
65 | if not self.normalize_before:
66 | src = self.norm2(src)
67 | return src
68 |
69 |
70 | @TRANSFORMER_ENCODER_REGISTRY.register()
71 | class ELAEncoder(nn.Module):
72 | # __shared__ = ['depth_mult', 'act', 'trt', 'eval_size']
73 | # __inject__ = ['encoder_layer']
74 |
75 | def __init__(self,
76 | in_channels=[128, 256, 512],
77 | feat_strides=[8, 16, 32],
78 | hidden_dim=256,
79 | use_encoder_idx=[2],
80 | num_encoder_layers=1,
81 | encoder_layer='TransformerLayer',
82 | pe_temperature=10000,
83 | expansion=1.0,
84 | depth_mult=1.0,
85 | act='silu',
86 | trt=False,
87 | dim_feedforward=1024,
88 | eval_size=None):
89 | super(ELAEncoder, self).__init__()
90 | self.in_channels = in_channels
91 | self.feat_strides = feat_strides
92 | self.hidden_dim = hidden_dim
93 | self.use_encoder_idx = use_encoder_idx
94 | self.num_encoder_layers = num_encoder_layers
95 | self.pe_temperature = pe_temperature
96 | self.eval_size = eval_size
97 |
98 | self.encoder_layer = TransformerLayer(dim_feedforward=dim_feedforward)
99 |
100 | # channel projection
101 | self.input_proj = nn.ModuleList()
102 | for in_channel in self.in_channels:
103 | self.input_proj.append(
104 | nn.Sequential(
105 | nn.Conv2d(
106 | in_channel, hidden_dim, kernel_size=(1, 1), bias=False),
107 | nn.BatchNorm2d(
108 | hidden_dim)))
109 | # encoder transformer
110 | self.encoder = nn.ModuleList([
111 | TransformerEncoder(self.encoder_layer, num_encoder_layers)
112 | for _ in range(len(use_encoder_idx))
113 | ])
114 |
115 | # act = get_act_fn(
116 | # act, trt=trt) if act is None or isinstance(act,
117 | # (str, dict)) else act
118 | # top-down fpn
119 | self.lateral_convs = nn.ModuleList()
120 | self.fpn_blocks = nn.ModuleList()
121 | for idx in range(len(self.in_channels) - 1, 0, -1):
122 | self.lateral_convs.append(
123 | BaseConv(
124 | hidden_dim, hidden_dim, 1, 1, act=act))
125 | self.fpn_blocks.append(
126 | RepC3(
127 | hidden_dim * 2,
128 | hidden_dim,
129 | round(3 * depth_mult),
130 | e=1.0))
131 |
132 | # bottom-up pan
133 | self.downsample_convs = nn.ModuleList()
134 | self.pan_blocks = nn.ModuleList()
135 | for idx in range(len(self.in_channels) - 1):
136 | self.downsample_convs.append(
137 | BaseConv(
138 | hidden_dim, hidden_dim, 3, stride=2, act=act))
139 | self.pan_blocks.append(
140 | RepC3(
141 | hidden_dim * 2,
142 | hidden_dim,
143 | round(3 * depth_mult),
144 | e=1.0))
145 |
146 | # self._reset_parameters()
147 | #
148 | # def _reset_parameters(self):
149 | # if self.eval_size:
150 | # for idx in self.use_encoder_idx:
151 | # stride = self.feat_strides[idx]
152 | # pos_embed = self.build_2d_sincos_position_embedding(
153 | # self.eval_size[1] // stride, self.eval_size[0] // stride,
154 | # self.hidden_dim, self.pe_temperature)
155 | # setattr(self, f'pos_embed{idx}', pos_embed)
156 |
157 | @staticmethod
158 | def build_2d_sincos_position_embedding(w,
159 | h,
160 | embed_dim=256,
161 | temperature=10000.):
162 | grid_w = torch.arange(int(w), dtype=torch.float32)
163 | grid_h = torch.arange(int(h), dtype=torch.float32)
164 | grid_w, grid_h = torch.meshgrid(grid_w, grid_h)
165 | assert embed_dim % 4 == 0, \
166 | 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
167 | pos_dim = embed_dim // 4
168 | omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
169 | omega = 1. / (temperature**omega)
170 |
171 |         out_w = grid_w.flatten()[..., None] @ omega[None]
172 |         out_h = grid_h.flatten()[..., None] @ omega[None]
173 |
174 | return torch.cat(
175 | [
176 | torch.sin(out_w), torch.cos(out_w), torch.sin(out_h),
177 | torch.cos(out_h)
178 | ],
179 | dim=1)[None, :, :]
180 | @classmethod
181 | def from_config(cls, cfg):
182 | enc_cfg = cfg.MODEL.ELAEncoder
183 | return {
184 | 'in_channels': enc_cfg.in_channels,
185 | 'feat_strides': enc_cfg.feat_strides,
186 | 'hidden_dim': enc_cfg.hidden_dim,
187 | 'use_encoder_idx': enc_cfg.use_encoder_idx,
188 | 'num_encoder_layers': enc_cfg.num_encoder_layers,
189 | 'encoder_layer': enc_cfg.encoder_layer,
190 | 'pe_temperature': enc_cfg.pe_temperature,
191 | 'expansion': enc_cfg.expansion,
192 | 'depth_mult': enc_cfg.depth_mult,
193 | 'act': enc_cfg.act,
194 | 'eval_size': enc_cfg.eval_size,
195 | 'dim_feedforward': enc_cfg.dim_feedforward
196 | }
197 |
198 | def forward(self, feats, for_mot=False):
199 | assert len(feats) == len(self.in_channels)
200 | # get projection features
201 | proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
202 | # encoder
203 | if self.num_encoder_layers > 0:
204 | for i, enc_ind in enumerate(self.use_encoder_idx):
205 | h, w = proj_feats[enc_ind].shape[2:]
206 | # flatten [B, C, H, W] to [B, HxW, C]
207 | src_flatten = proj_feats[enc_ind].flatten(start_dim=2).transpose(1, 2)
208 | if self.training or self.eval_size is None:
209 | pos_embed = self.build_2d_sincos_position_embedding(
210 | w, h, self.hidden_dim, self.pe_temperature).to(src_flatten.device)
211 | else:
212 | pos_embed = getattr(self, f'pos_embed{enc_ind}', None)
213 | memory = self.encoder[i](src_flatten, pos_embed=pos_embed)
214 | proj_feats[enc_ind] = memory.transpose(1, 2).reshape((-1, self.hidden_dim, h, w))
215 |
216 | # top-down fpn
217 | inner_outs = [proj_feats[-1]]
218 | for idx in range(len(self.in_channels) - 1, 0, -1):
219 | feat_heigh = inner_outs[0]
220 | feat_low = proj_feats[idx - 1]
221 | feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
222 | feat_heigh)
223 | inner_outs[0] = feat_heigh
224 |
225 | upsample_feat = F.interpolate(
226 | feat_heigh, scale_factor=2., mode="nearest")
227 | inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
228 | torch.cat(
229 | [upsample_feat, feat_low], dim=1))
230 | inner_outs.insert(0, inner_out)
231 |
232 | # bottom-up pan
233 | outs = [inner_outs[0]]
234 | for idx in range(len(self.in_channels) - 1):
235 | feat_low = outs[-1]
236 | feat_height = inner_outs[idx + 1]
237 | downsample_feat = self.downsample_convs[idx](feat_low)
238 | out = self.pan_blocks[idx](torch.cat(
239 | [downsample_feat, feat_height], dim=1))
240 | outs.append(out)
241 |
242 | return outs
243 |
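244 | 
245 | # ---------------------------------------------------------------------------
246 | # Editor's sketch (illustrative only): shape flow through ELAEncoder with its
247 | # default settings, assuming a 640x640 input and backbone strides 8/16/32 so
248 | # the three feature maps are 80x80, 40x40 and 20x20. Only the deepest level
249 | # goes through the transformer layer; the FPN/PAN paths fuse all three.
250 | # ---------------------------------------------------------------------------
251 | if __name__ == "__main__":
252 |     enc = ELAEncoder().eval()
253 |     feats = [torch.randn(1, 128, 80, 80),
254 |              torch.randn(1, 256, 40, 40),
255 |              torch.randn(1, 512, 20, 20)]
256 |     with torch.no_grad():
257 |         outs = enc(feats)
258 |     print([tuple(o.shape) for o in outs])   # every level projected to hidden_dim=256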
--------------------------------------------------------------------------------
/omdet/omdet_v2_turbo/head.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from .build_components import DETR_HEAD_REGISTRY
3 |
4 |
5 | __all__ = ['DINOHead']
6 | @DETR_HEAD_REGISTRY.register()
7 | class DINOHead(nn.Module):
8 | def __init__(self, device="cuda"):
9 | super(DINOHead, self).__init__()
10 |
11 | def forward(self, out_transformer, inputs=None):
12 | (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits,
13 | dn_meta) = out_transformer
14 |
15 | return (dec_out_bboxes[-1], dec_out_logits[-1], None)
16 |
17 | @classmethod
18 | def from_config(cls, cfg, *args, **kwargs):
19 | return {
20 | "device": cfg.MODEL.DEVICE
21 | }
22 |
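23 | 
24 | # ---------------------------------------------------------------------------
25 | # Editor's sketch (illustrative only): DINOHead simply unpacks the 5-tuple
26 | # produced by the decoder and keeps the last layer's boxes and logits. The
27 | # shapes below are made-up demo values, not the model's real configuration.
28 | # ---------------------------------------------------------------------------
29 | if __name__ == "__main__":
30 |     import torch
31 | 
32 |     head = DINOHead()
33 |     dec_boxes = torch.rand(6, 2, 100, 4)      # (num_decoder_layers, batch, queries, 4)
34 |     dec_logits = torch.rand(6, 2, 100, 80)    # (num_decoder_layers, batch, queries, classes)
35 |     boxes, logits, _ = head((dec_boxes, dec_logits, None, None, None))
36 |     print(boxes.shape, logits.shape)          # last-layer predictions only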
--------------------------------------------------------------------------------
/omdet/omdet_v2_turbo/infer_model.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import copy
6 | from typing import Tuple
7 |
8 | import numpy as np
9 | # import open_clip
10 | from detectron2.structures import Boxes, ImageList, Instances
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | from detectron2.modeling import detector_postprocess
15 | from detectron2.layers import batched_nms
16 | from detectron2.modeling import build_backbone
17 | from omdet.omdet_v2_turbo.build_components import build_encoder_model, build_decoder_model, build_detr_head
18 | from detectron2.config import configurable
19 | from omdet.modeling.language_backbone import build_language_backbone
20 | from detectron2.utils.logger import setup_logger
21 | from ..modeling.language_backbone.clip.models import clip as clip
22 | from .torch_utils import bbox_cxcywh_to_xyxy
23 | __all__ = ['OmDetV2TurboInfer']
24 |
25 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
26 |
27 | from ..utils.cache import LRUCache
28 |
29 | from huggingface_hub import PyTorchModelHubMixin
30 |
31 |
32 | @META_ARCH_REGISTRY.register()
33 | class OmDetV2TurboInfer(nn.Module, PyTorchModelHubMixin):
34 |
35 | @configurable
36 | def __init__(self, cfg):
37 | super(OmDetV2TurboInfer, self).__init__()
38 | self.cfg = cfg
39 | self.logger = setup_logger(name=__name__)
40 |
41 | self.backbone = build_backbone(cfg)
42 | self.decoder = build_decoder_model(cfg)
43 | self.neck = build_encoder_model(cfg)
44 | self.device = cfg.MODEL.DEVICE
45 |
46 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
47 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
48 | normalizer = lambda x: (x - pixel_mean) / pixel_std
49 | self.normalizer = normalizer
50 |
51 | self.size_divisibility = self.backbone.size_divisibility
52 | self.nms_test_th = 0.0
53 | self.conf_test_th = 0.0
54 | self.loss_type = 'FOCAL'
55 | self.use_language_cache = True
56 | self.language_encoder_type = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE
57 | self.num_proposals = cfg.MODEL.ELADecoder.num_queries
58 |
59 |
60 | @classmethod
61 | def from_config(cls, cfg, *args, **kwargs):
62 | return {
63 | 'cfg': cfg
64 | }
65 |
66 | def forward(self, x, label_feats, task_feats, task_mask):
67 |
68 | body_feats = self.backbone(x)
69 |
70 | if type(body_feats) is dict:
71 | body_feats = [body_feats[i] for i in body_feats.keys()]
72 | encoder_feats = self.neck(body_feats)
73 | box_pred, box_cls, _, _, _ = self.decoder(encoder_feats, label_feats, task_feats, task_mask)
74 |
75 | return box_pred, box_cls
76 |
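77 | 
78 | # ---------------------------------------------------------------------------
79 | # Editor's note (illustrative only): forward() is the export-friendly path; it
80 | # takes an already-normalized image batch plus pre-computed label and task
81 | # (prompt) embeddings from the language backbone. The shapes below are
82 | # assumptions for a single 640x640 image and are not taken from the config.
83 | # ---------------------------------------------------------------------------
84 | if __name__ == "__main__":
85 |     x = torch.randn(1, 3, 640, 640)           # normalized image batch
86 |     label_feats = torch.randn(80, 1, 512)     # num_labels x batch x label_dim (assumed)
87 |     task_feats = torch.randn(77, 1, 512)      # max_task_len x batch x dim (assumed)
88 |     task_mask = torch.ones(1, 77)             # batch x max_task_len
89 |     # Building the model itself needs a full detectron2 cfg, e.g.:
90 |     #   model = OmDetV2TurboInfer(cfg)
91 |     #   box_pred, box_cls = model(x, label_feats, task_feats, task_mask)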
--------------------------------------------------------------------------------
/omdet/omdet_v2_turbo/torch_utils.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 |
4 | import numpy as np
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch.nn.init import uniform_
9 |
10 | __all__ = ['multi_scale_deformable_attn_pytorch', 'inverse_sigmoid']
11 |
12 |
13 | def _get_clones(module, n):
14 | return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
15 |
16 |
17 | def bias_init_with_prob(prior_prob=0.01):
18 | """initialize conv/fc bias value according to a given probability value."""
19 | return float(-np.log((1 - prior_prob) / prior_prob)) # return bias_init
20 |
21 |
22 | def linear_init_(module):
23 | bound = 1 / math.sqrt(module.weight.shape[0])
24 | uniform_(module.weight, -bound, bound)
25 | if hasattr(module, 'bias') and module.bias is not None:
26 | uniform_(module.bias, -bound, bound)
27 |
28 |
29 | def inverse_sigmoid(x, eps=1e-5):
30 | x = x.clamp(min=0, max=1)
31 | x1 = x.clamp(min=eps)
32 | x2 = (1 - x).clamp(min=eps)
33 | return torch.log(x1 / x2)
34 |
35 |
36 | def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor,
37 | sampling_locations: torch.Tensor,
38 | attention_weights: torch.Tensor) -> torch.Tensor:
39 | """
40 | Multi-scale deformable attention.
41 | https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py
42 | """
43 |
44 | bs, _, num_heads, embed_dims = value.shape
45 | _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
47 | sampling_grids = 2 * sampling_locations - 1
48 | sampling_value_list = []
49 | for level, (H_, W_) in enumerate(value_spatial_shapes):
50 | # bs, H_*W_, num_heads, embed_dims ->
51 | # bs, H_*W_, num_heads*embed_dims ->
52 | # bs, num_heads*embed_dims, H_*W_ ->
53 | # bs*num_heads, embed_dims, H_, W_
54 | value_l_ = (value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_))
55 | # bs, num_queries, num_heads, num_points, 2 ->
56 | # bs, num_heads, num_queries, num_points, 2 ->
57 | # bs*num_heads, num_queries, num_points, 2
58 | sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
59 | # bs*num_heads, embed_dims, num_queries, num_points
60 | sampling_value_l_ = F.grid_sample(value_l_,
61 | sampling_grid_l_,
62 | mode='bilinear',
63 | padding_mode='zeros',
64 | align_corners=False)
65 | sampling_value_list.append(sampling_value_l_)
66 | # (bs, num_queries, num_heads, num_levels, num_points) ->
67 | # (bs, num_heads, num_queries, num_levels, num_points) ->
68 | # (bs, num_heads, 1, num_queries, num_levels*num_points)
69 | attention_weights = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries,
70 | num_levels * num_points)
71 | output = ((torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(
72 | bs, num_heads * embed_dims, num_queries))
73 | return output.transpose(1, 2).contiguous()
74 |
75 |
76 | def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
77 | """
78 | Calculate Intersection over Union (IoU) of box1(1, 4) to box2(n, 4).
79 |
80 | Args:
81 | box1 (torch.Tensor): A tensor representing a single bounding box with shape (1, 4).
82 | box2 (torch.Tensor): A tensor representing n bounding boxes with shape (n, 4).
83 | xywh (bool, optional): If True, input boxes are in (x, y, w, h) format. If False, input boxes are in
84 | (x1, y1, x2, y2) format. Defaults to True.
85 | GIoU (bool, optional): If True, calculate Generalized IoU. Defaults to False.
86 | DIoU (bool, optional): If True, calculate Distance IoU. Defaults to False.
87 | CIoU (bool, optional): If True, calculate Complete IoU. Defaults to False.
88 | eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
89 |
90 | Returns:
91 | (torch.Tensor): IoU, GIoU, DIoU, or CIoU values depending on the specified flags.
92 | """
93 |
94 | # Get the coordinates of bounding boxes
95 | if xywh: # transform from xywh to xyxy
96 | (x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
97 | w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
98 | b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
99 | b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
100 | else: # x1, y1, x2, y2 = box1
101 | b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
102 | b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
103 | w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
104 | w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
105 |
106 | # Intersection area
107 | inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp_(0) * \
108 | (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp_(0)
109 |
110 | # Union Area
111 | union = w1 * h1 + w2 * h2 - inter + eps
112 |
113 | # IoU
114 | iou = inter / union
115 | if CIoU or DIoU or GIoU:
116 | cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1) # convex (smallest enclosing box) width
117 | ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1) # convex height
118 | if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
119 | c2 = cw ** 2 + ch ** 2 + eps # convex diagonal squared
120 | rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4 # center dist ** 2
121 | if CIoU: # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
122 | v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)
123 | with torch.no_grad():
124 | alpha = v / (v - iou + (1 + eps))
125 | return iou - (rho2 / c2 + v * alpha) # CIoU
126 | return iou - rho2 / c2 # DIoU
127 | c_area = cw * ch + eps # convex area
128 | return iou - (c_area - union) / c_area # GIoU https://arxiv.org/pdf/1902.09630.pdf
129 | return iou # IoU
130 |
131 | def cls_score(cls_type, cls_feature, class_proj, logit_scale):
132 | if cls_type == 'cosine':
133 | class_logits = _b_cosine(cls_feature, class_proj, logit_scale) # 4 100 256 4 256 20
134 | elif cls_type == 'dot':
135 | class_logits = torch.bmm(cls_feature, class_proj) # 4 100 20
136 | else:
137 | raise Exception("Unknown cls type {}".format(cls_type))
138 | return class_logits
139 |
140 | def _norm(f, dim=-1):
141 | return f / f.norm(dim=dim, keepdim=True).clamp_min(1e-12)
142 |
143 |
144 | def _b_cosine(a, b, logit_scale):
145 | """
146 | a: B x K x H
147 | b: B x H x K
148 | """
149 | a = _norm(a, dim=2)
150 | b = _norm(b, dim=1)
151 | # Calculating the Loss
152 | logit_scale = logit_scale.exp()
153 | logits_per_image = logit_scale * torch.bmm(a, b)
154 | return logits_per_image
155 |
156 | ###########################
157 | def bbox_cxcywh_to_xyxy(x):
158 | cxcy, wh = torch.split(x, 2, dim=-1)
159 | return torch.cat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], dim=-1)
160 |
161 | def bbox_xyxy2cxcywh(x):
162 | x0, y0, x1, y1 = torch.split(x, 1, dim=-1)
163 | return torch.cat([(x1+x0)/2, (y1+y0)/2, x1-x0, y1-y0], dim=-1)
164 |
165 | class SiLU(nn.Module):
166 | def __init__(self):
167 | super(SiLU, self).__init__()
168 |
169 | def forward(self, x):
170 | return x * torch.sigmoid(x)
171 |
172 | class BaseConv(nn.Module):
173 | def __init__(self,
174 | in_channels,
175 | out_channels,
176 | ksize,
177 | stride,
178 | groups=1,
179 | bias=False,
180 | act="silu"):
181 | super(BaseConv, self).__init__()
182 | self.conv = nn.Conv2d(
183 | in_channels,
184 | out_channels,
185 | kernel_size=ksize,
186 | stride=stride,
187 | padding=(ksize - 1) // 2,
188 | groups=groups,
189 | bias=bias)
190 | self.bn = nn.BatchNorm2d(
191 | out_channels,
192 | # epsilon=1e-3, # for amp(fp16), set in ppdet/engine/trainer.py
193 | # momentum=0.97,
194 | # weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
195 | # bias_attr=ParamAttr(regularizer=L2Decay(0.0))
196 | )
197 |
198 | if act == 'silu':
199 | self.act = SiLU()
200 | elif act == 'gelu':
201 | self.act = nn.GELU()
202 | # self._init_weights()
203 | #
204 | # def _init_weights(self):
205 | # conv_init_(self.conv)
206 |
207 | def forward(self, x):
208 | x = self.bn(self.conv(x))
209 | if self.training:
210 | y = self.act(x)
211 | else:
212 | if isinstance(self.act, nn.SiLU):
213 | self.act = SiLU()
214 | y = self.act(x)
215 | return y
216 |
217 | import random
218 | import torchvision
219 |
220 | class BatchResize():
221 | def __init__(self, mode="training"):
222 | self.mode = mode
223 | if mode == "training":
224 | self.size = int(random.choice(np.arange(480, 801, step=32)))
225 | else:
226 | self.size = 640
227 | self.resize = torchvision.transforms.Resize((self.size, self.size))
228 |
229 | def __call__(self, batch_inputs):
230 | for i, b in enumerate(batch_inputs):
231 | h, w = batch_inputs[i]["image"].shape[1:]
232 | batch_inputs[i]["image"] = self.resize(batch_inputs[i]["image"])
233 | new_h, new_w = (self.size, self.size)
234 |             if self.mode == "training":  # gt boxes only exist on training batches
235 | batch_inputs[i]["instances"].gt_boxes.tensor *= torch.tensor([new_w/w, new_h/h]).repeat(1, 2)
236 | batch_inputs[i]["instances"]._image_size = (new_h, new_w)
237 |
238 | return batch_inputs
239 |
240 |
241 | def get_contrastive_denoising_training_group(targets,
242 | num_classes,
243 | num_queries,
244 | class_embed,
245 | num_denoising=100,
246 | label_noise_ratio=0.5,
247 | box_noise_scale=1.0):
248 | """
249 | targets: [targets] that contains labels, bboxes, etc
250 | num_classes: the size of labels
251 | num_queries: 300
252 | class_embed: num_class x batch_size x label_dim OR num_class x batch_size (in the old case)
253 | """
254 | if num_denoising <= 0:
255 | return None, None, None, None
256 | # number of gt_bboxes in each batch sample
257 | num_gts = [len(t["labels"]) for t in targets]
258 | max_gt_num = max(num_gts)
259 | if max_gt_num == 0:
260 | return None, None, None, None
261 |
262 | num_group = num_denoising // max_gt_num # the number of denoising group given num_denoising
263 | num_group = 1 if num_group == 0 else num_group
264 | # pad gt to max_num of a batch
265 | bs = len(targets)
266 | input_query_class = torch.full((bs, max_gt_num), num_classes, dtype=torch.int32) # batch_size x max_gt_num (initialized with num_class)
267 | input_query_bbox = torch.zeros((bs, max_gt_num, 4)) # batch_size x max_gt_num x 4
268 | pad_gt_mask = torch.zeros((bs, max_gt_num))
269 | for i in range(bs):
270 | num_gt = num_gts[i]
271 | if num_gt > 0:
272 | input_query_class[i, :num_gt] = targets[i]["labels"].squeeze(-1)
273 | input_query_bbox[i, :num_gt] = targets[i]["boxes"]
274 | pad_gt_mask[i, :num_gt] = 1
275 | # each group has positive and negative queries.
276 | input_query_class = input_query_class.tile([1, 2 * num_group]) # batch_size x (max_gt_num*2*num_group)
277 | input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])
278 | pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])
279 | # positive and negative mask
280 | negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1]) # bs x max_gt_num*2 x 1
281 | negative_gt_mask[:, max_gt_num:] = 1 # set the second half to be NEGATIVE
282 | negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) # bs x max_gt_num*2*num_group x 1
283 | positive_gt_mask = 1 - negative_gt_mask
284 | # contrastive denoising training positive index
285 | positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
286 | dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1]
287 | dn_positive_idx = torch.split(dn_positive_idx,
288 |                                   [n * num_group for n in num_gts])  # split by batch size
289 | # total denoising queries
290 | num_denoising = int(max_gt_num * 2 * num_group)
291 |
292 | if label_noise_ratio > 0:
293 | input_query_class = input_query_class.flatten() # (batch_size*max_gt_num*2*num_group) * 1
294 | pad_gt_mask = pad_gt_mask.flatten()
295 | # half of bbox prob
296 | mask = torch.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
297 | chosen_idx = torch.nonzero(mask * pad_gt_mask).squeeze(-1)
298 | # randomly put a new one here
299 | new_label = torch.randint_like(
300 | chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
301 | input_query_class.scatter_(0, chosen_idx, new_label)
302 | input_query_class = input_query_class.reshape(bs, num_denoising)
303 | pad_gt_mask = pad_gt_mask.reshape(bs, num_denoising)
304 |
305 | if box_noise_scale > 0:
306 | known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox)
307 |
308 | diff = torch.tile(input_query_bbox[..., 2:] * 0.5,
309 | [1, 1, 2]) * box_noise_scale
310 |
311 | rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
312 | rand_part = torch.rand(input_query_bbox.shape)
313 | rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (
314 | 1 - negative_gt_mask)
315 | rand_part *= rand_sign
316 | known_bbox += rand_part * diff
317 | known_bbox.clip_(min=0.0, max=1.0)
318 | input_query_bbox = bbox_xyxy2cxcywh(known_bbox)
319 | input_query_bbox = inverse_sigmoid(input_query_bbox)
320 |
321 | fixed_class = class_embed.dim() == 2
322 | if fixed_class: # fixed class embedding. num_class * hidden_dim
323 | class_embed = torch.cat(
324 | [class_embed, torch.zeros([1, class_embed.shape[-1]], device=class_embed.device)]) # (num_class+1) * hidden_dim
325 | else:
326 | assert class_embed.dim() == 3
327 | # (num_class+1) x batch_size x hidden_dim
328 | class_embed = torch.cat(
329 | [class_embed, torch.zeros([1, class_embed.shape[-2], class_embed.shape[-1]], device=class_embed.device)])
330 |
331 | if fixed_class:
332 | input_query_class_index = input_query_class.view(input_query_class.shape[0], -1)\
333 | .long().flatten().reshape(-1,1).repeat(1, class_embed.shape[-1])
334 | input_query_class = torch.gather(class_embed.to(input_query_class_index.device),
335 | dim=0,
336 | index=input_query_class_index).reshape([bs, num_denoising, -1])
337 | else:
338 | temp = []
339 | input_query_class_index = input_query_class.view(input_query_class.shape[0], -1) \
340 | .long().flatten().reshape(-1, 1).repeat(1, class_embed.shape[-1]).reshape([bs, num_denoising, -1])
341 | for b_id in range(bs):
342 | t = torch.gather(class_embed[:, b_id].to(input_query_class_index.device),
343 | dim=0, index=input_query_class_index[b_id])
344 | temp.append(t)
345 | input_query_class = torch.cat(temp, dim=0).reshape([bs, num_denoising, -1])
346 |
347 | tgt_size = num_denoising + num_queries
348 | attn_mask = torch.ones([tgt_size, tgt_size]) < 0
349 | # match query cannot see the reconstruction
350 | attn_mask[num_denoising:, :num_denoising] = True
351 | # reconstruct cannot see each other
352 | for i in range(num_group):
353 | if i == 0:
354 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num *
355 | 2 * (i + 1):num_denoising] = True
356 | if i == num_group - 1:
357 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num *
358 | i * 2] = True
359 | else:
360 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num *
361 | 2 * (i + 1):num_denoising] = True
362 | attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num *
363 | 2 * i] = True
364 | attn_mask = ~attn_mask
365 | dn_meta = {
366 | "dn_positive_idx": dn_positive_idx,
367 | "dn_num_group": num_group,
368 | "dn_num_split": [num_denoising, num_queries]
369 | }
370 |
371 | return input_query_class, input_query_bbox, attn_mask, dn_meta
372 |
373 |
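374 | 
375 | # ---------------------------------------------------------------------------
376 | # Editor's sketch (illustrative only): the two box encodings used throughout
377 | # this file, plus `bbox_iou` with its GIoU variant. Values are hand-picked so
378 | # the round trip and the IoU numbers are easy to check by eye.
379 | # ---------------------------------------------------------------------------
380 | if __name__ == "__main__":
381 |     cxcywh = torch.tensor([[0.5, 0.5, 0.4, 0.2]])
382 |     xyxy = bbox_cxcywh_to_xyxy(cxcywh)
383 |     print(xyxy)                                   # [[0.3, 0.4, 0.7, 0.6]]
384 |     print(bbox_xyxy2cxcywh(xyxy))                 # back to [[0.5, 0.5, 0.4, 0.2]]
385 |     other = torch.tensor([[0.35, 0.45, 0.75, 0.65]])
386 |     print(bbox_iou(xyxy, other, xywh=False))      # plain IoU of two overlapping boxes
387 |     print(bbox_iou(xyxy, other, xywh=False, GIoU=True))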
--------------------------------------------------------------------------------
/omdet/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Linker Tech, Inc. and its affiliates. All Rights Reserved
2 |
--------------------------------------------------------------------------------
/omdet/utils/analyze_model.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from collections import Counter
3 |
4 | import numpy as np
5 | from detectron2.checkpoint import DetectionCheckpointer
6 | from detectron2.config import CfgNode, instantiate
7 | from detectron2.data import build_detection_test_loader
8 | from detectron2.modeling import build_model
9 | from detectron2.utils.analysis import FlopCountAnalysis
10 | from fvcore.nn import flop_count_table
11 |
12 | __all__=["do_flop"]
13 |
14 | logger = logging.getLogger("detectron2")
15 |
16 | def do_flop(cfg):
17 | if isinstance(cfg, CfgNode):
18 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TRAIN[0])
19 | model = build_model(cfg)
20 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
21 | else:
22 | data_loader = instantiate(cfg.dataloader.test)
23 | model = instantiate(cfg.model)
24 | model.to(cfg.train.device)
25 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
26 | model.eval()
27 |
28 | counts = Counter()
29 | total_flops = []
30 | for idx, data in zip(range(10), data_loader): # noqa
31 | flops = FlopCountAnalysis(model, data)
32 | if idx > 0:
33 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False)
34 | counts += flops.by_operator()
35 | total_flops.append(flops.total())
36 |
37 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops))
38 | logger.info(
39 | "Average GFlops for each type of operators:\n"
40 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()])
41 | )
42 | logger.info(
43 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9)
44 | )
45 |
--------------------------------------------------------------------------------
/omdet/utils/box_ops.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Linker Tech, Inc. and its affiliates. All Rights Reserved
2 | """
3 | Utilities for bounding box manipulation and GIoU.
4 | """
5 | import numpy as np
6 | import torch
7 | from torchvision.ops.boxes import box_area
8 |
9 |
10 | def box_cxcywh_to_xyxy(x):
11 | x_c, y_c, w, h = x.unbind(-1)
12 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
13 | (x_c + 0.5 * w), (y_c + 0.5 * h)]
14 | return torch.stack(b, dim=-1)
15 |
16 |
17 | def box_xyxy_to_cxcywh(x):
18 | x0, y0, x1, y1 = x.unbind(-1)
19 | b = [(x0 + x1) / 2, (y0 + y1) / 2,
20 | (x1 - x0), (y1 - y0)]
21 | return torch.stack(b, dim=-1)
22 |
23 |
24 | # modified from torchvision to also return the union
25 | def box_iou(boxes1, boxes2):
26 | area1 = box_area(boxes1)
27 | area2 = box_area(boxes2)
28 |
29 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
30 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
31 |
32 | wh = (rb - lt).clamp(min=0) # [N,M,2]
33 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
34 |
35 | union = area1[:, None] + area2 - inter
36 |
37 | iou = inter / union
38 | return iou, union
39 |
40 |
41 | def generalized_box_iou(boxes1, boxes2):
42 | """
43 | Generalized IoU from https://giou.stanford.edu/
44 |
45 | The boxes should be in [x0, y0, x1, y1] format
46 |
47 | Returns a [N, M] pairwise matrix, where N = len(boxes1)
48 | and M = len(boxes2)
49 | """
50 | # degenerate boxes gives inf / nan results
51 | # so do an early check
52 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
53 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
54 | iou, union = box_iou(boxes1, boxes2)
55 |
56 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
57 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
58 |
59 | wh = (rb - lt).clamp(min=0) # [N,M,2]
60 | area = wh[:, :, 0] * wh[:, :, 1]
61 |
62 | return iou - (area - union) / area
63 |
64 |
65 | def masks_to_boxes(masks):
66 | """Compute the bounding boxes around the provided masks
67 |
68 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
69 |
70 | Returns a [N, 4] tensors, with the boxes in xyxy format
71 | """
72 | if masks.numel() == 0:
73 | return torch.zeros((0, 4), device=masks.device)
74 |
75 | h, w = masks.shape[-2:]
76 |
77 | y = torch.arange(0, h, dtype=torch.float)
78 | x = torch.arange(0, w, dtype=torch.float)
79 | y, x = torch.meshgrid(y, x)
80 |
81 | x_mask = (masks * x.unsqueeze(0))
82 | x_max = x_mask.flatten(1).max(-1)[0]
83 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
84 |
85 | y_mask = (masks * y.unsqueeze(0))
86 | y_max = y_mask.flatten(1).max(-1)[0]
87 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
88 |
89 | return torch.stack([x_min, y_min, x_max, y_max], 1)
90 |
91 |
92 | def xyxy2xywh(x):
93 | """
94 | Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height) format where (x1, y1) is the
95 | top-left corner and (x2, y2) is the bottom-right corner.
96 |
97 | Args:
98 | x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x1, y1, x2, y2) format.
99 |
100 | Returns:
101 | y (np.ndarray | torch.Tensor): The bounding box coordinates in (x, y, width, height) format.
102 | """
103 | assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
104 | y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy
105 | y[..., 0] = (x[..., 0] + x[..., 2]) / 2 # x center
106 | y[..., 1] = (x[..., 1] + x[..., 3]) / 2 # y center
107 | y[..., 2] = x[..., 2] - x[..., 0] # width
108 | y[..., 3] = x[..., 3] - x[..., 1] # height
109 | return y
110 |
111 |
112 | def xywh2xyxy(x):
113 | """
114 | Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
115 | top-left corner and (x2, y2) is the bottom-right corner.
116 |
117 | Args:
118 | x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.
119 |
120 | Returns:
121 | y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
122 | """
123 | assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
124 | y = torch.empty_like(x) if isinstance(x, torch.Tensor) else np.empty_like(x) # faster than clone/copy
125 | dw = x[..., 2] / 2 # half-width
126 | dh = x[..., 3] / 2 # half-height
127 | y[..., 0] = x[..., 0] - dw # top left x
128 | y[..., 1] = x[..., 1] - dh # top left y
129 | y[..., 2] = x[..., 0] + dw # bottom right x
130 | y[..., 3] = x[..., 1] + dh # bottom right y
131 | return y
--------------------------------------------------------------------------------
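A minimal usage sketch for omdet/utils/box_ops.py above (assuming the repository root is on PYTHONPATH so the module is importable, and that torch/torchvision are installed); the tensors are purely illustrative:

    import torch
    from omdet.utils.box_ops import box_cxcywh_to_xyxy, generalized_box_iou

    # two predicted boxes and one target box in (cx, cy, w, h) format
    preds = torch.tensor([[0.50, 0.50, 0.40, 0.40],
                          [0.25, 0.25, 0.20, 0.20]])
    targets = torch.tensor([[0.50, 0.50, 0.50, 0.50]])

    # generalized_box_iou expects (x0, y0, x1, y1) boxes, so convert first
    giou = generalized_box_iou(box_cxcywh_to_xyxy(preds),
                               box_cxcywh_to_xyxy(targets))
    print(giou.shape)  # torch.Size([2, 1]); pairwise GIoU values lie in [-1, 1]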
/omdet/utils/cache.py:
--------------------------------------------------------------------------------
1 | import pickle as pkl
2 | import lmdb
3 | from collections import OrderedDict
4 |
5 |
6 | class LRUCache:
7 | # initialising capacity
8 | def __init__(self, capacity: int):
9 | self.cache = OrderedDict()
10 | self.capacity = capacity
11 |
12 | def has(self, key) -> bool:
13 | return key in self.cache
14 |
15 |     # we return the value of the key
16 |     # that is queried in O(1) and return None if we
17 |     # don't find the key in our dict / cache.
18 |     # And also move the key to the end
19 |     # to show that it was recently used.
20 | def get(self, key):
21 | if key not in self.cache:
22 | return None
23 | else:
24 | self.cache.move_to_end(key)
25 | return self.cache[key]
26 |
27 | # first, we add / update the key by conventional methods.
28 | # And also move the key to the end to show that it was recently used.
29 | # But here we will also check whether the length of our
30 | # ordered dictionary has exceeded our capacity,
31 | # If so we remove the first key (least recently used)
32 | def put(self, key, value) -> None:
33 | self.cache[key] = value
34 | self.cache.move_to_end(key)
35 | if len(self.cache) > self.capacity:
36 | self.cache.popitem(last=False)
37 |
38 | def pop(self, key, value):
39 | self.cache.pop(key, None)
40 |
41 |
42 | class LmdbReader:
43 | def __init__(self, path):
44 | self.path = path
45 | self.env = self.init_lmdb(path)
46 |
47 | def init_lmdb(self, l_path):
48 | env = lmdb.open(
49 | l_path, readonly=True,
50 | create=False, lock=False) # readahead=not _check_distributed()
51 | txn = env.begin(buffers=True)
52 | return txn
53 |
54 | def read(self, _id):
55 | try:
56 | value = self.env.get(str(_id).encode("utf-8"))
57 | value = pkl.loads(value)
58 | return value
59 | except Exception as e:
60 | print("Error in reading {} from {}".format(_id, self.path))
61 | raise e
62 |
--------------------------------------------------------------------------------
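A small sketch of the LRUCache behaviour in omdet/utils/cache.py above (assuming the dependencies from requirements.txt are installed, since importing the module also imports lmdb):

    from omdet.utils.cache import LRUCache

    cache = LRUCache(capacity=2)
    cache.put("a", 1)
    cache.put("b", 2)
    cache.get("a")          # touching "a" makes "b" the least recently used entry
    cache.put("c", 3)       # exceeds capacity, so "b" is evicted
    print(cache.has("b"))   # False
    print(cache.get("a"))   # 1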
/omdet/utils/plots.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | import cv2
5 | import matplotlib
6 | import numpy as np
7 | import torch
8 | from PIL import Image, ImageDraw, ImageFont, ImageOps
9 | import platform
10 | import math
11 |
12 | def is_writeable(dir, test=False):
13 | # Return True if directory has write permissions, test opening a file with write permissions if test=True
14 | if test: # method 1
15 | file = Path(dir) / 'tmp.txt'
16 | try:
17 | with open(file, 'w'): # open file with write permissions
18 | pass
19 | file.unlink() # remove file
20 | return True
21 | except IOError:
22 | return False
23 | else: # method 2
24 |         return os.access(dir, os.W_OK)  # check write permission; possible issues on Windows
25 |
26 | def user_config_dir(dir='Ultralytics', env_var='YOLOV5_CONFIG_DIR'):
27 | # Return path of user configuration directory. Prefer environment variable if exists. Make dir if required.
28 | env = os.getenv(env_var)
29 | if env:
30 | path = Path(env) # use environment variable
31 | else:
32 | cfg = {'Windows': 'AppData/Roaming', 'Linux': '.config', 'Darwin': 'Library/Application Support'} # 3 OS dirs
33 | path = Path.home() / cfg.get(platform.system(), '') # OS-specific config dir
34 | path = (path if is_writeable(path) else Path('/tmp')) / dir # GCP and AWS lambda fix, only /tmp is writeable
35 | path.mkdir(exist_ok=True) # make if required
36 | return path
37 |
38 | # Settings
39 | CONFIG_DIR = user_config_dir() # Ultralytics settings dir
40 | RANK = int(os.getenv('RANK', -1))
41 | matplotlib.rc('font', **{'size': 11})
42 | matplotlib.use('Agg') # for writing to files only
43 |
44 | def check_font(font='Arial.ttf', size=10):
45 | # Return a PIL TrueType Font, downloading to CONFIG_DIR if necessary
46 | font = Path(font)
47 | font = font if font.exists() else (CONFIG_DIR / font.name)
48 | try:
49 | return ImageFont.truetype(str(font) if font.exists() else font.name, size)
50 | except Exception as e: # download if missing
51 | url = "https://ultralytics.com/assets/" + font.name
52 | print(f'Downloading {url} to {font}...')
53 | torch.hub.download_url_to_file(url, str(font), progress=False)
54 | return ImageFont.truetype(str(font), size)
55 |
56 | def is_ascii(s=''):
57 | # Is string composed of all ASCII (no UTF) characters?
58 | s = str(s) # convert list, tuple, None, etc. to str
59 | return len(s.encode().decode('ascii', 'ignore')) == len(s)
60 |
61 |
62 | class Annotator:
63 | # if RANK in (-1, 0):
64 | # check_font() # download TTF if necessary
65 |
66 | # YOLOv5 Annotator for train/val mosaics and jpgs and detect/hub inference annotations
67 |     def __init__(self, im, line_width=None, font_size=None, font='Arial.ttf', pil=True):
68 |         assert im.data.contiguous, 'Image not contiguous. Apply np.ascontiguousarray(im) to Annotator() input images.'
69 |         self.pil = pil
70 |         self.offset = 0
71 |         self.lw = line_width or max(round(sum(im.shape) / 2 * 0.003), 2)  # line width (used by both PIL and cv2 paths)
72 |         if self.pil:  # use PIL
73 |             self.im = im if isinstance(im, Image.Image) else Image.fromarray(im)
74 |             self.im = ImageOps.expand(self.im, border=self.offset, fill=(255, 255, 255))
75 |             self.draw = ImageDraw.Draw(self.im)
76 |             self.font = check_font(font, size=font_size or max(round(sum(self.im.size) / 2 * 0.035), 12))
77 |             self.fh = 5  # font height
78 |         else:  # use cv2
79 |             self.im = im
80 |
81 | def _offset_box(self, box):
82 | return (np.array(box)+self.offset).tolist()
83 |
84 | def draw_arrow(self, ptA, ptB, width=1, color=(0, 255, 0)):
85 | """Draw line from ptA to ptB with arrowhead at ptB"""
86 | # Get drawing context
87 | # Draw the line without arrows
88 | self.draw.line((ptA, ptB), width=width, fill=color)
89 |
90 | # Now work out the arrowhead
91 | # = it will be a triangle with one vertex at ptB
92 | # - it will start at 95% of the length of the line
93 | # - it will extend 8 pixels either side of the line
94 | x0, y0 = ptA
95 | x1, y1 = ptB
96 | # Now we can work out the x,y coordinates of the bottom of the arrowhead triangle
97 | xb = 0.95 * (x1 - x0) + x0
98 | yb = 0.95 * (y1 - y0) + y0
99 |
100 | # Work out the other two vertices of the triangle
101 | # Check if line is vertical
102 | if x0 == x1:
103 | vtx0 = (xb - 5, yb)
104 | vtx1 = (xb + 5, yb)
105 | # Check if line is horizontal
106 | elif y0 == y1:
107 | vtx0 = (xb, yb + 5)
108 | vtx1 = (xb, yb - 5)
109 | else:
110 | alpha = math.atan2(y1 - y0, x1 - x0) - 90 * math.pi / 180
111 | a = 8 * math.cos(alpha)
112 | b = 8 * math.sin(alpha)
113 | vtx0 = (xb + a, yb + b)
114 | vtx1 = (xb - a, yb - b)
115 |
116 | # draw.point((xb,yb), fill=(255,0,0)) # DEBUG: draw point of base in red - comment out draw.polygon() below if using this line
117 | # im.save('DEBUG-base.png') # DEBUG: save
118 |
119 | # Now draw the arrowhead triangle
120 | self.draw.polygon([vtx0, vtx1, ptB], fill=color)
121 |
122 | def box_label(self, box, label='', sub_label='', color=(128, 128, 128), txt_color=(255, 255, 255)):
123 | # Add one xyxy box to image with label
124 | box = self._offset_box(box)
125 | if self.pil or not is_ascii(label):
126 | self.draw.rectangle(box, width=self.lw, outline=color) # box
127 | if label:
128 | w, h = 2, 2 # text width
129 | self.draw.rectangle([box[0], box[1] - self.fh, box[0] + w + 1, box[1] + 1], fill=color)
130 | # self.draw.text((box[0], box[1]), label, fill=txt_color, font=self.font, anchor='ls') # for PIL>8.0
131 | self.draw.text((box[0], box[1] - h), label+'\n'+sub_label, fill=txt_color, font=self.font)
132 | else: # cv2
133 | c1, c2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3]))
134 | cv2.rectangle(self.im, c1, c2, color, thickness=self.lw, lineType=cv2.LINE_AA)
135 | if label:
136 | tf = max(self.lw - 1, 1) # font thickness
137 | w, h = cv2.getTextSize(label, 0, fontScale=self.lw / 3, thickness=tf)[0]
138 | c2 = c1[0] + w, c1[1] - h - 3
139 | cv2.rectangle(self.im, c1, c2, color, -1, cv2.LINE_AA) # filled
140 |                 # cv2.putText cannot render '\n' or TTF fonts, so join label and sub_label with a space
141 |                 cv2.putText(self.im, (label + ' ' + sub_label).strip(), (c1[0], c1[1] - 2), 0, self.lw / 3, txt_color, thickness=tf,
142 |                             lineType=cv2.LINE_AA)
143 |
144 | def tuple_label(self, src_box, dest_box, label='', src_color='red', dest_color='blue', txt_color=(255, 255, 255)):
145 | # Add one xyxy box to image with label
146 | src_box = self._offset_box(src_box)
147 | dest_box = self._offset_box(dest_box)
148 |
149 | if self.pil or not is_ascii(label):
150 | self.draw.rectangle(src_box, width=self.lw, outline=src_color) # box
151 | self.draw.rectangle(dest_box, width=self.lw, outline=dest_color) # box
152 | src_c = (int((src_box[2]+src_box[0])/2), int((src_box[3]+src_box[1])/2))
153 | dest_c = (int((dest_box[2]+dest_box[0])/2), int((dest_box[3]+dest_box[1])/2))
154 | c_c = [(src_c[0]+dest_c[0])/2, (src_c[1]+dest_c[1])/2]
155 | # self.draw.line(xy=[src_c, dest_c], fill='green')
156 | self.draw_arrow(src_c, dest_c, color='green', width=2)
157 |
158 | if label:
159 | w, h = self.font.getsize(label) # text width
160 | self.draw.rectangle([c_c[0], c_c[1] - self.fh, c_c[0] + w + 1, c_c[1] + 1], fill='green')
161 | self.draw.text((c_c[0], c_c[1] - h), label, fill=txt_color, font=self.font)
162 |
163 | else: # cv2
164 | raise Exception("CV2 is not supported yet")
165 |
166 | def rectangle(self, xy, fill=None, outline=None, width=1):
167 | # Add rectangle to image (PIL-only)
168 | self.draw.rectangle(xy, fill, outline, width)
169 |
170 | def text(self, xy, text, txt_color=(255, 255, 255)):
171 | # Add text to image (PIL-only)
172 | w, h = self.font.getsize(text) # text width, height
173 | self.draw.text((xy[0], xy[1] - h + 1), text, fill=txt_color, font=self.font)
174 |
175 | def result(self):
176 | # Return annotated image as array
177 | return np.asarray(self.im)
178 |
179 |
--------------------------------------------------------------------------------
/omdet/utils/registry.py:
--------------------------------------------------------------------------------
1 | def _register_generic(module_dict, module_name, module):
2 | assert module_name not in module_dict
3 | module_dict[module_name] = module
4 |
5 |
6 | class Registry(dict):
7 | '''
8 |     A helper class for managing module registration. It extends a dictionary
9 |     and provides a register function.
10 |     Eg. creating a registry:
11 |         some_registry = Registry({"default": default_module})
12 |     There are two ways of registering new modules:
13 |     1): the normal way is just calling the register function:
14 |         def foo():
15 |             ...
16 |         some_registry.register("foo_module", foo)
17 |     2): used as a decorator when declaring the module:
18 |         @some_registry.register("foo_module")
19 |         @some_registry.register("foo_module_nickname")
20 |         def foo():
21 |             ...
22 |     Accessing a module is just like using a dictionary, eg:
23 |         f = some_registry["foo_module"]
24 | '''
25 | def __init__(self, *args, **kwargs):
26 | super(Registry, self).__init__(*args, **kwargs)
27 |
28 | def register(self, module_name, module=None):
29 | # used as function call
30 | if module is not None:
31 | _register_generic(self, module_name, module)
32 | return
33 |
34 | # used as decorator
35 | def register_fn(fn):
36 | _register_generic(self, module_name, fn)
37 | return fn
38 |
39 | return register_fn
--------------------------------------------------------------------------------
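Both registration styles described in the docstring of omdet/utils/registry.py above, sketched with illustrative names (BACKBONES and build_toy_backbone are made up for the example):

    from omdet.utils.registry import Registry

    BACKBONES = Registry()

    @BACKBONES.register("toy_backbone")                   # decorator style
    def build_toy_backbone():
        return "toy"

    BACKBONES.register("toy_alias", build_toy_backbone)   # function-call style

    print(BACKBONES["toy_backbone"]())  # "toy"
    print("toy_alias" in BACKBONES)     # True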
/omdet/utils/tools.py:
--------------------------------------------------------------------------------
1 | import io
2 | import base64
3 | import re
4 | from PIL import ImageDraw, Image
5 | import lmdb
6 | from detectron2.data import transforms as T
7 | import logging
8 | from tqdm import tqdm
9 | import os
10 | from detectron2.data import detection_utils as utils
11 | import pickle
12 | import numpy as np
13 | from detectron2.config import CfgNode
14 | from typing import Generator, Sequence
15 | from joblib import Parallel, delayed
16 | import torch
17 | import random
18 |
19 | def make_continuous_categories(cats, verbose=True):
20 |     # return a continuous category_id from 1 to num_classes
21 | diff_cnt = 0
22 | for c_id, c in enumerate(cats):
23 | if c['id'] != c_id+1:
24 | diff_cnt += 1
25 | c['id'] = c_id + 1
26 |
27 | if verbose:
28 | print("Changed {} category_id among {} cats".format(diff_cnt, len(cats)))
29 |
30 | return cats
31 |
32 | def is_overlap(a, b):
33 |     # half-open intervals [a0, a1) and [b0, b1); empty intervals never overlap
34 |     if b[1] - b[0] == 0 or a[1] - a[0] == 0:
35 |         return False
36 |     return a[0] <= b[0] < a[1] or b[0] <= a[0] < b[1]
37 |
38 |
39 | def get_span_embedding(model, tokenizer, sent, spans, layers, device):
40 | assert len(sent) == len(spans)
41 | encoded = tokenizer.batch_encode_plus(sent, return_tensors="pt", padding=True)
42 | encoded = encoded.to(device)
43 | # token_ids_word = np.where(np.array(encoded.word_ids()) == idx)
44 | with torch.no_grad():
45 | output = model(**encoded)
46 |
47 | # Get all hidden states
48 | states = output.hidden_states
49 | # Stack and sum all requested layers
50 | output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
51 |
52 | # Only select the tokens that constitute the requested word
53 | results = []
54 | for b_id, b_span in enumerate(spans):
55 | offsets = encoded.encodings[b_id].offsets
56 | feats = []
57 | valid_offsets = []
58 | for t_id, t_span in enumerate(offsets):
59 | valid = False
60 | for s in b_span:
61 | if is_overlap(t_span, s):
62 | valid = True
63 | break
64 | if valid:
65 | feats.append(output[b_id, t_id].view(1, -1))
66 | valid_offsets.append(t_span)
67 |
68 | if len(feats) == 0:
69 | raise Exception(f"Sentence '{sent[b_id]}' ({len(sent[b_id])}) cannot find valid span for {b_span}.")
70 |
71 | res = torch.mean(torch.stack(feats, dim=0), dim=0).cpu().tolist()
72 | results.append(res[0])
73 | return results
74 |
75 |
76 | def get_txt_embedding(model, sent):
77 | txt_embedding = model._text_encode(sent)
78 | return txt_embedding
79 |
80 |
81 | def clean_t(x, max_len, rm_sym=True, must_idx=None, return_offset=False):
82 | """
83 | rm_sym: remove symbol _
84 | """
85 | s_id = 0
86 | x = x.lower()
87 | if rm_sym:
88 | x = x.replace('_', ' ').replace('-', ' ')
89 | x = ' '.join(x.split()) # remove duplicate space
90 |
91 | if must_idx is not None:
92 | min_id, max_id = must_idx
93 | if max_id >= max_len:
94 | s_id = max(0, min(min_id, int(max_id - (max_len / 2))))
95 | e_id = min(len(x), int(max_id + (max_len / 2)))
96 | # print(f"Special cut ({must_idx}): from {s_id} to {e_id} for sent of len {len(x)}")
97 | x = x[s_id:e_id]
98 | else:
99 | x = x[0:max_len]
100 | if return_offset:
101 | return x, s_id
102 | else:
103 | return x
104 |
105 | def sample_true(prob):
106 | if prob <= 0:
107 | return False
108 | generated_neg_prob = random.random()
109 | valid = generated_neg_prob < prob
110 | return valid
111 |
112 | def rm_duplicates(input_list, keep_order=False):
113 | if not keep_order:
114 | return list(set(input_list))
115 |
116 | # Create an empty set to store the items that have been seen
117 | seen = set()
118 |
119 | # Create an empty list to store the result
120 | result = []
121 |
122 | # Iterate over the input list
123 | for item in input_list:
124 | # If the item is not already in the seen set, add it to the result list
125 | if item not in seen:
126 | result.append(item)
127 |
128 | # Add the item to the seen set
129 | seen.add(item)
130 |
131 | # Return the result list
132 | return result
133 |
134 |
135 | def chunks(l: Sequence, n: int = 5) -> Generator[Sequence, None, None]:
136 | """Yield successive n-sized chunks from l."""
137 | for i in range(0, len(l), n):
138 | yield l[i:i + n]
139 |
140 |
141 | def encode_dump_text(model, feat_path, text_vocab, batch_size):
142 | text_keys = []
143 | for block in tqdm(chunks(text_vocab, n=batch_size)):
144 | block_feats = []
145 | block_keys = []
146 | for batch in chunks(block, n=500):
147 | batch_fs = get_txt_embedding(model, batch)
148 | batch_keys = batch
149 | block_feats.extend(batch_fs)
150 | block_keys.extend(batch_keys)
151 |
152 | text_keys.extend(block_keys)
153 | write_lmdb_from_id_data_pairs(
154 | id_data_pairs=[(key, embed) for key, embed in zip(block_keys, block_feats)],
155 | lmdb_save_dir=feat_path
156 | )
157 | return text_keys
158 |
159 |
160 | def cropbox(xmin, ymin, xmax, ymax, img_size, ratio=1.5, make_square=False):
161 |     if xmin < 0 or ymin < 0 or xmax < 0 or ymax < 0:
162 |         raise Exception("cropbox: box coordinates must be non-negative")
163 |     w, h = img_size
164 |     if xmin > w or ymin > h or xmax > w or ymax > h:
165 |         raise Exception("cropbox: box coordinates must lie inside the image")
166 |
167 | xc = xmin + (xmax - xmin) / 2
168 | yc = ymin + (ymax - ymin) / 2
169 | w = xmax - xmin
170 | h = ymax - ymin
171 | nw = w * ratio
172 | nh = h * ratio
173 |
174 | if make_square:
175 | if nw > nh:
176 | nh = nw
177 | else:
178 | nw = nh
179 |
180 | nxmin = max(xc - (nw / 2), 0)
181 | nymin = max(yc - (nh / 2), 0)
182 |
183 | nxmax = min(xc + (nw / 2), img_size[0])
184 | nymax = min(yc + (nh / 2), img_size[1])
185 |
186 | return nxmin, nymin, nxmax, nymax
187 |
188 |
189 | def image_to_base64(img):
190 | output_buffer = io.BytesIO()
191 | img.save(output_buffer, format='JPEG')
192 | byte_data = output_buffer.getvalue()
193 | base64_str = base64.b64encode(byte_data)
194 | return base64_str
195 |
196 |
197 | def base64_to_image(base64_str):
198 | return Image.open(io.BytesIO(base64.b64decode(base64_str)))
199 |
200 |
201 | def draw_bounding_box_on_image(image, xmin, ymin, xmax, ymax,
202 | color='red',
203 | text='',
204 | thickness=4):
205 | draw = ImageDraw.Draw(image)
206 | draw.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=thickness)
207 | draw.text((xmin, ymin), text)
208 | return image
209 |
210 |
211 | def build_transform_gen(cfg, is_train):
212 | """
213 | Create a list of :class:`TransformGen` from config.
214 | Returns:
215 | list[TransformGen]
216 | """
217 | if is_train:
218 | min_size = cfg.INPUT.MIN_SIZE_TRAIN
219 | max_size = cfg.INPUT.MAX_SIZE_TRAIN
220 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
221 | else:
222 | min_size = cfg.INPUT.MIN_SIZE_TEST
223 | max_size = cfg.INPUT.MAX_SIZE_TEST
224 | sample_style = "choice"
225 | if sample_style == "range":
226 | assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size))
227 |
228 | tfm_gens = []
229 | if is_train:
230 | tfm_gens.append(T.RandomFlip())
231 | tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style))
232 | # tfm_gens.append(T.Resize(min_size))
233 | if is_train:
234 | logger = logging.getLogger(__name__)
235 | logger.info("TransformGens used in training: " + str(tfm_gens))
236 | return tfm_gens
237 |
238 |
239 | def jp(a, b):
240 | return os.path.join(a, b)
241 |
242 |
243 | def check_img(i, img_root):
244 | # i['file_name'] = i['file_name'].split('/')[-1]
245 | try:
246 | iimage = utils.read_image(jp(img_root, i["file_name"]), format='RGB')
247 | utils.check_image_size(i, iimage)
248 |
249 | except Exception as e:
250 | print("BAD D2 IMG", i)
251 | if 'image_id' in i:
252 | return i['image_id']
253 | else:
254 | return i['id']
255 |
256 | return None
257 |
258 |
259 | def fix_img_size(i, img_root):
260 | try:
261 | if not "file_name" in i:
262 | i["file_name"] = i["coco_url"].split("/")[-1]
263 | img = Image.open(jp(img_root, i['file_name']))
264 | w, h = img.size
265 | if i['width'] != w or i['height'] != h:
266 | print("Found image {} with wrong size.\n".format(i['id']))
267 | i['width'] = w
268 | i['height'] = h
269 |
270 | return i
271 | except Exception as e:
272 | print("BAD IMG", i, e)
273 | return None
274 |
275 |
276 | def fix_data(img_root, data):
277 | if type(data) is dict:
278 | num_imgs = len(data['images'])
279 | data['images'] = Parallel(n_jobs=15, backend='threading')(
280 | delayed(fix_img_size)(i, img_root) for i in tqdm(data['images']))
281 | data['images'] = [i for i in data['images'] if i is not None]
282 | print("First stage image fixing go from {} to {}".format(num_imgs, len(data['images'])))
283 |
284 | bad_ids = Parallel(n_jobs=15, backend='threading')(delayed(check_img)(i, img_root) for i in tqdm(data['images']))
285 | bad_ids = [x for x in set(bad_ids) if x is not None]
286 | print("Found {} bad images with D2 checking".format(len(bad_ids)))
287 | data['images'] = [d for d in data['images'] if d['id'] not in bad_ids]
288 | print("Images go from {} to {}".format(num_imgs, len(data['images'])))
289 |
290 | prev_anno_size = len(data['annotations'])
291 | valid_imgs = {i['id'] for i in data['images']}
292 | data['annotations'] = [d for d in data['annotations'] if d['image_id'] in valid_imgs]
293 | print("Anno go from {} to {} after fixing.".format(prev_anno_size, len(data['annotations'])))
294 | else:
295 | num_imgs = len(data)
296 | data = Parallel(n_jobs=15, backend='threading')(delayed(fix_img_size)(i, img_root) for i in tqdm(data))
297 | data = [i for i in data if i is not None]
298 | print("First stage image fixing go from {} to {}".format(num_imgs, len(data)))
299 |
300 | bad_ids = Parallel(n_jobs=15, backend='threading')(delayed(check_img)(i, img_root) for i in tqdm(data))
301 | bad_ids = [x for x in set(bad_ids) if x is not None]
302 | print("Found {} bad images with D2 checking".format(len(bad_ids)))
303 | data = [d for d in data if d['id'] not in bad_ids]
304 | print("Images go from {} to {}".format(num_imgs, len(data)))
305 | return data
306 |
307 |
308 | def convert_cfg_to_dict(cfg_node, key_list):
309 | if not isinstance(cfg_node, CfgNode):
310 | return cfg_node
311 | else:
312 | cfg_dict = dict(cfg_node)
313 | for k, v in cfg_dict.items():
314 | cfg_dict[k] = convert_cfg_to_dict(v, key_list + [k])
315 | return cfg_dict
316 |
317 |
318 | def flatten_json(json_file):
319 | out = {}
320 |
321 | def flatten(x, name=''):
322 | if type(x) is dict:
323 | for a in x:
324 | flatten(x[a], name + a + '.')
325 | elif type(x) is list:
326 | i = 0
327 | for a in x:
328 | flatten(a, name + str(i) + '.')
329 | i += 1
330 | else:
331 | out[name[:-1]] = x
332 |
333 | flatten(json_file)
334 | return out
335 |
336 |
337 | def convert_to_float(value):
338 | if isinstance(value, float):
339 | return value
340 | try: # try pytorch
341 | return value.item()
342 |     except:
343 |         try:  # try numpy / plain Python numbers
344 |             # np.asscalar was removed in NumPy 1.23; float() also handles numpy scalars
345 |             return float(value)
346 |         except:
347 |             raise ValueError('do not know how to convert this number {} to float'.format(value))
348 |
349 |
350 | def remove_punctuation(text: str) -> str:
351 | punct = ['|', ':', ';', '@', '(', ')', '[', ']', '{', '}', '^',
352 | '\'', '\"', '’', '`', '?', '$', '%', '#', '!', '&', '*', '+', ',', '.'
353 | ]
354 | for p in punct:
355 | text = text.replace(p, '')
356 | return text.strip()
--------------------------------------------------------------------------------
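A short sketch of the text-side helpers in omdet/utils/tools.py above (assuming detectron2, torch, joblib and the other imports at the top of the file are installed, since they are pulled in at import time); the label list is illustrative:

    from omdet.utils.tools import chunks, clean_t, rm_duplicates

    labels = ["Traffic-Light", "person", "person", "traffic_light"]

    # de-duplicate while preserving the original order
    print(rm_duplicates(labels, keep_order=True))  # ['Traffic-Light', 'person', 'traffic_light']

    # normalise a label: lower-case, replace '_'/'-' with spaces, cap the length
    print(clean_t("Traffic-Light", max_len=16))    # 'traffic light'

    # iterate a vocabulary in fixed-size batches (as encode_dump_text does)
    for batch in chunks(labels, n=2):
        print(batch)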
/outputs/000000574769.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/outputs/000000574769.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | timm==0.9.16
2 | transformers==4.21.0
3 | lmdb==1.4.1
4 | Pillow==8.4.0
5 | ftfy==6.2.0
6 | joblib==1.3.2
7 | opencv-python==4.7.0.72
8 | pydantic
9 | fastapi
10 | uvicorn
--------------------------------------------------------------------------------
/run_demo.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from omdet.inference.det_engine import DetEngine
4 | from omdet.utils.plots import Annotator
5 | from PIL import Image
6 | import numpy as np
7 |
8 | if __name__ == "__main__":
9 | engine = DetEngine(batch_size=1, device='cuda')
10 | img_paths = ['./sample_data/000000574769.jpg'] # path of images
11 | labels = ["person", "cat", "orange"] # labels to be predicted
12 | prompt = 'Detect {}.'.format(','.join(labels)) # prompt of detection task, use "Detect {}." as default
13 |
14 | res = engine.inf_predict('OmDet-Turbo_tiny_SWIN_T', # prefix name of the pretrained checkpoints
15 | task=prompt,
16 | data=img_paths,
17 | labels=labels,
18 | src_type='local', # type of the image_paths, "local"/"url"
19 | conf_threshold=0.30,
20 | nms_threshold=0.5
21 | )
22 | print(res)
23 |
24 | out_folder = './outputs'
25 | for idx, img_path in enumerate(img_paths):
26 | im = Image.open(img_path)
27 | a = Annotator(np.ascontiguousarray(im), font_size=12, line_width=1, pil=True, font='sample_data/simsun.ttc')
28 | for R in res[idx]:
29 | a.box_label([R['xmin'], R['ymin'], R['xmax'], R['ymax']],
30 | label=f"{R['label']} {str(int(R['conf'] * 100))}%",
31 | color='red')
32 |
33 | if not os.path.exists(out_folder):
34 | os.mkdir(out_folder)
35 |
36 | image = a.result()
37 | img = Image.fromarray(image)
38 | img.save('outputs/'+img_path.split('/')[-1])
--------------------------------------------------------------------------------
/run_wsgi.py:
--------------------------------------------------------------------------------
1 | import time
2 | import uvicorn
3 | from fastapi import FastAPI
4 | from omdet.inference.det_engine import DetEngine
5 | from pydantic import BaseModel
6 | from typing import List, Dict, Union
7 |
8 |
9 | class InfDetectBody(BaseModel):
10 | model_id: str
11 | data: List[str]
12 | src_type: str = "url"
13 | task: str
14 | labels: List[str]
15 | threshold: float = 0.1
16 | nms_threshold: float = 0.5
17 |
18 |
19 | class Object(BaseModel):
20 | xmin: float
21 | ymin: float
22 | xmax: float
23 | ymax: float
24 | conf: float
25 | label: str
26 |
27 |
28 | class DetectionRes(BaseModel):
29 | took: int
30 | objects: List[List[Object]] = []
31 |
32 |
33 | app = FastAPI()
34 |
35 |
36 | @app.on_event("startup")
37 | async def startup_event():
38 | app.state.detector = DetEngine(model_dir="resources/", device="cuda", batch_size=10)
39 |
40 |
41 | @app.post(
42 | "/inf_predict",
43 | response_model=DetectionRes,
44 | name="Detect objects with Inf Possibilities",
45 | )
46 | async def detect_urls(
47 | body: InfDetectBody = None,
48 | ) -> DetectionRes:
49 | s_time = time.time()
50 | out = app.state.detector.inf_predict(
51 | body.model_id,
52 | task=body.task,
53 | labels=body.labels,
54 | data=body.data,
55 | src_type=body.src_type,
56 | conf_threshold=body.threshold,
57 | nms_threshold=body.nms_threshold,
58 | )
59 |
60 | resp = DetectionRes(took=int((time.time() - s_time) * 1000), objects=out)
61 | return resp
62 |
63 |
64 | if __name__ == "__main__":
65 | uvicorn.run("run_wsgi:app", host="0.0.0.0", port=8000)
66 |
--------------------------------------------------------------------------------
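A client-side sketch for the service in run_wsgi.py above, assuming it has been started with python run_wsgi.py on localhost:8000 and that the OmDet-Turbo_tiny_SWIN_T checkpoints referenced by DetEngine are available under resources/; the request body mirrors InfDetectBody and uses only the standard library:

    import json
    from urllib.request import Request, urlopen

    payload = {
        "model_id": "OmDet-Turbo_tiny_SWIN_T",
        "data": ["https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/sample_data/000000574769.jpg"],
        "src_type": "url",
        "task": "Detect person,cat,orange.",
        "labels": ["person", "cat", "orange"],
        "threshold": 0.3,
        "nms_threshold": 0.5,
    }
    req = Request(
        "http://localhost:8000/inf_predict",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urlopen(req) as resp:
        print(json.loads(resp.read()))  # {"took": <ms>, "objects": [[{"xmin": ..., "label": ...}, ...]]}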
/sample_data/000000574769.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/sample_data/000000574769.jpg
--------------------------------------------------------------------------------
/sample_data/simsun.ttc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/om-ai-lab/OmDet/f9820753af5f7808b184217d4295291a56aaeb27/sample_data/simsun.ttc
--------------------------------------------------------------------------------