├── .github └── coco.png ├── .gitignore ├── LICENSE ├── README.md ├── configs ├── Base.yaml ├── GroundingDINO_SwinB_cfg.py ├── OVMono3D_clip_SFP.yaml ├── OVMono3D_dinov2_SFP.yaml ├── OVMono3D_mae_SFP.yaml ├── OVMono3D_midas_SFP.yaml ├── OVMono3D_sam_SFP.yaml ├── category_meta.json └── category_meta50.json ├── cubercnn ├── config │ ├── __init__.py │ └── config.py ├── data │ ├── __init__.py │ ├── build.py │ ├── builtin.py │ ├── dataset_mapper.py │ └── datasets.py ├── evaluation │ ├── __init__.py │ └── omni3d_evaluation.py ├── modeling │ ├── backbone │ │ ├── __init__.py │ │ ├── clip.py │ │ ├── densenet.py │ │ ├── dino.py │ │ ├── dla.py │ │ ├── mae.py │ │ ├── midas_final.py │ │ ├── mnasnet.py │ │ ├── resnet.py │ │ ├── sam.py │ │ └── shufflenet.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── rcnn3d.py │ ├── proposal_generator │ │ ├── __init__.py │ │ └── rpn.py │ └── roi_heads │ │ ├── __init__.py │ │ ├── cube_head.py │ │ ├── fast_rcnn.py │ │ ├── roi_heads.py │ │ └── roi_heads_gdino.py ├── solver │ ├── __init__.py │ ├── build.py │ └── checkpoint.py ├── util │ ├── __init__.py │ ├── math_util.py │ ├── model_zoo.py │ └── util.py └── vis │ ├── __init__.py │ ├── logperf.py │ └── vis.py ├── datasets ├── ARKitScenes │ └── download_arkitscenes_images.sh ├── Omni3D │ └── download_omni3d_json.sh ├── coco_examples │ ├── 000000044260.jpg │ ├── 000000088432.jpg │ ├── 000000101762.jpg │ ├── 000000120584.jpg │ ├── 000000128148.jpg │ ├── 000000162543.jpg │ ├── 000000164115.jpg │ ├── 000000311950.jpg │ ├── 000000429011.jpg │ └── labels.json └── objectron │ └── download_objectron_images.sh ├── demo └── demo.py ├── download_data.sh ├── setup.sh └── tools ├── __init__.py ├── eval_ovmono3d_geo.py ├── ovmono3d_geo.py └── train_net.py /.github/coco.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/.github/coco.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # folders or files 2 | datasets/* 3 | cubercnn/modeling/backbone/checkpoint_weights/* 4 | .vscode/ 5 | .ipynb_checkpoints/ 6 | .idea/ 7 | output/ 8 | cubercnn/external/ 9 | slurm/ 10 | datasets 11 | unused/ 12 | checkpoints/ 13 | ovmono3d_data/ 14 | ovmono3d 15 | # filetypes 16 | *.pyc 17 | *.mexa64 18 | */output/* 19 | */output*/* 20 | *~ 21 | *.so 22 | *.ipynb 23 | *.pth 24 | *.zip -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | # Open Vocabulary Monocular 3D Object Detection 6 | 7 | [Jin Yao][jy], [Hao Gu][hg], [Xuweiyi Chen][xc], [Jiayun Wang][jw], [Zezhou Cheng][zc] 8 | 9 | 10 | [![Website](https://img.shields.io/badge/Project-Page-b361ff 11 | )](https://uva-computer-vision-lab.github.io/ovmono3d/) 12 | [![Paper](https://img.shields.io/badge/arXiv-PDF-b31b1b)](https://arxiv.org/pdf/2411.16833) 13 | 14 | 15 |
16 | <!-- Figure: "Zero-shot predictions on COCO" (COCO demo image) --> 32 |
33 | 34 | 35 | ## Installation 36 | We use CUDA version 12.1.1. 37 | Run 38 | ```bash 39 | conda create -n ovmono3d python=3.8.20 40 | conda activate ovmono3d 41 | 42 | pip install torch==2.4.1 torchvision==0.19.1 --index-url https://download.pytorch.org/whl/cu121 43 | ``` 44 | to create the environment and install PyTorch. 45 | 46 | Run 47 | ```bash 48 | sh setup.sh 49 | ``` 50 | to install the additional dependencies and download the model checkpoints for OVMono3D-LIFT and the other foundation models. 51 | 52 | ## Demo 53 | Run 54 | ```bash 55 | python demo/demo.py --config-file configs/OVMono3D_dinov2_SFP.yaml \ 56 | --input-folder datasets/coco_examples \ 57 | --labels-file datasets/coco_examples/labels.json \ 58 | --threshold 0.45 \ 59 | MODEL.ROI_HEADS.NAME ROIHeads3DGDINO \ 60 | MODEL.WEIGHTS checkpoints/ovmono3d_lift.pth \ 61 | OUTPUT_DIR output/coco_examples 62 | ``` 63 | to get the results for the example COCO images. 64 | 65 | You can also try your own images and prompted category labels. See the format of the label file in [`labels.json`](datasets/coco_examples/labels.json). If you know the camera intrinsics, you can pass them as arguments via `--focal-length ` and `--principal-point `. Check [`demo.py`](demo/demo.py) for more details. 66 | 67 | 68 | ## Data 69 | Please follow the instructions in [Omni3D](https://github.com/facebookresearch/omni3d/blob/main/DATA.md) to set up the datasets. 70 | Run 71 | ```bash 72 | sh ./download_data.sh 73 | ``` 74 | to download our pre-processed OVMono3D 2D predictions (12 GB after unzipping). 75 | 76 | 77 | ## Evaluation 78 | 79 | 80 | To run inference and evaluation of OVMono3D-LIFT, use the following command: 81 | ```bash 82 | python tools/train_net.py --eval-only --config-file configs/OVMono3D_dinov2_SFP.yaml --num-gpus 2 \ 83 | OUTPUT_DIR output/ovmono3d_lift \ 84 | MODEL.WEIGHTS checkpoints/ovmono3d_lift.pth \ 85 | TEST.CAT_MODE "novel" \ 86 | DATASETS.ORACLE2D_FILES.EVAL_MODE "target_aware" 87 | ``` 88 | `TEST.CAT_MODE` denotes the category set to be evaluated: `novel`, `base`, or `all`. 89 | 90 | `DATASETS.ORACLE2D_FILES.EVAL_MODE` denotes the evaluation protocol: `target_aware` or `previous_metric`. 91 | 92 | To run inference and evaluation of OVMono3D-GEO, use the following commands: 93 | ```bash 94 | python tools/ovmono3d_geo.py 95 | python tools/eval_ovmono3d_geo.py 96 | ``` 97 | 98 | 99 | ## Training 100 | 101 | To run training of OVMono3D-LIFT, use the following command: 102 | ```bash 103 | python tools/train_net.py --config-file configs/OVMono3D_dinov2_SFP.yaml --num-gpus 8 \ 104 | OUTPUT_DIR output/ovmono3d_lift \ 105 | VIS_PERIOD 500 TEST.EVAL_PERIOD 2000 \ 106 | MODEL.STABILIZE 0.03 \ 107 | SOLVER.BASE_LR 0.012 \ 108 | SOLVER.CHECKPOINT_PERIOD 1000 \ 109 | SOLVER.IMS_PER_BATCH 64 110 | ``` 111 | 112 | The hyperparameters above are the ones used in our experiments. You can customize them to suit your requirements, but note that performance may vary across configurations.
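The `KEY VALUE` pairs passed after the flags in the commands above are plain detectron2 config overrides; every key is declared either in detectron2's defaults or in [`cubercnn/config/config.py`](cubercnn/config/config.py). As an illustration only (not a script shipped with this repo), the snippet below sketches how those overrides could be applied programmatically, assuming detectron2 is installed and you run it from the repo root so that `cubercnn` is importable:

```python
# Sketch only: compose the OVMono3D config the same way the commands above do,
# i.e. detectron2 defaults, then the project defaults, then YAML, then overrides.
from detectron2.config import get_cfg
from cubercnn.config import get_cfg_defaults

cfg = get_cfg()                                          # detectron2 defaults
get_cfg_defaults(cfg)                                    # OVMono3D / Cube R-CNN additions (cubercnn/config/config.py)
cfg.merge_from_file("configs/OVMono3D_dinov2_SFP.yaml")  # resolves _BASE_: Base.yaml as well
# the same KEY VALUE pairs shown in the evaluation command above:
cfg.merge_from_list([
    "TEST.CAT_MODE", "novel",
    "DATASETS.ORACLE2D_FILES.EVAL_MODE", "target_aware",
    "MODEL.WEIGHTS", "checkpoints/ovmono3d_lift.pth",
])
print(cfg.DATASETS.CATEGORY_NAMES_NOVEL)                 # category set used when TEST.CAT_MODE is "novel"
```

This follows the standard detectron2 pattern of defaults, then YAML, then command-line overrides, which is why the same `TEST.CAT_MODE` and `DATASETS.ORACLE2D_FILES.EVAL_MODE` strings work for both the evaluation and training runs above.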
113 | 114 | 115 | ## Citing 116 | If you find this work useful for your research, please kindly cite: 117 | 118 | ```BibTeX 119 | @article{yao2024open, 120 | title={Open Vocabulary Monocular 3D Object Detection}, 121 | author={Yao, Jin and Gu, Hao and Chen, Xuweiyi and Wang, Jiayun and Cheng, Zezhou}, 122 | journal={arXiv preprint arXiv:2411.16833}, 123 | year={2024} 124 | } 125 | ``` 126 | Please also consider citing the awesome work of [Omni3D](https://github.com/facebookresearch/omni3d) and the datasets used in Omni3D. 127 |
BibTex 128 | 129 | ```BibTeX 130 | @inproceedings{brazil2023omni3d, 131 | author = {Garrick Brazil and Abhinav Kumar and Julian Straub and Nikhila Ravi and Justin Johnson and Georgia Gkioxari}, 132 | title = {{Omni3D}: A Large Benchmark and Model for {3D} Object Detection in the Wild}, 133 | booktitle = {CVPR}, 134 | address = {Vancouver, Canada}, 135 | month = {June}, 136 | year = {2023}, 137 | organization = {IEEE}, 138 | } 139 | ``` 140 | 141 | ```BibTex 142 | @inproceedings{Geiger2012CVPR, 143 | author = {Andreas Geiger and Philip Lenz and Raquel Urtasun}, 144 | title = {Are we ready for Autonomous Driving? The KITTI Vision Benchmark Suite}, 145 | booktitle = {CVPR}, 146 | year = {2012} 147 | } 148 | ``` 149 | 150 | ```BibTex 151 | @inproceedings{caesar2020nuscenes, 152 | title={nuscenes: A multimodal dataset for autonomous driving}, 153 | author={Caesar, Holger and Bankiti, Varun and Lang, Alex H and Vora, Sourabh and Liong, Venice Erin and Xu, Qiang and Krishnan, Anush and Pan, Yu and Baldan, Giancarlo and Beijbom, Oscar}, 154 | booktitle={CVPR}, 155 | year={2020} 156 | } 157 | ``` 158 | 159 | ```BibTex 160 | @inproceedings{song2015sun, 161 | title={Sun rgb-d: A rgb-d scene understanding benchmark suite}, 162 | author={Song, Shuran and Lichtenberg, Samuel P and Xiao, Jianxiong}, 163 | booktitle={CVPR}, 164 | year={2015} 165 | } 166 | ``` 167 | 168 | ```BibTex 169 | @inproceedings{dehghan2021arkitscenes, 170 | title={{ARK}itScenes - A Diverse Real-World Dataset for 3D Indoor Scene Understanding Using Mobile {RGB}-D Data}, 171 | author={Gilad Baruch and Zhuoyuan Chen and Afshin Dehghan and Tal Dimry and Yuri Feigin and Peter Fu and Thomas Gebauer and Brandon Joffe and Daniel Kurz and Arik Schwartz and Elad Shulman}, 172 | booktitle={NeurIPS Datasets and Benchmarks Track (Round 1)}, 173 | year={2021}, 174 | } 175 | ``` 176 | 177 | ```BibTex 178 | @inproceedings{hypersim, 179 | author = {Mike Roberts AND Jason Ramapuram AND Anurag Ranjan AND Atulit Kumar AND 180 | Miguel Angel Bautista AND Nathan Paczan AND Russ Webb AND Joshua M. Susskind}, 181 | title = {{Hypersim}: {A} Photorealistic Synthetic Dataset for Holistic Indoor Scene Understanding}, 182 | booktitle = {ICCV}, 183 | year = {2021}, 184 | } 185 | ``` 186 | 187 | ```BibTex 188 | @article{objectron2021, 189 | title={Objectron: A Large Scale Dataset of Object-Centric Videos in the Wild with Pose Annotations}, 190 | author={Ahmadyan, Adel and Zhang, Liangkai and Ablavatski, Artsiom and Wei, Jianing and Grundmann, Matthias}, 191 | journal={CVPR}, 192 | year={2021}, 193 | } 194 | ``` 195 | 196 |
197 | 198 | 199 | [jy]: https://yaojin17.github.io 200 | [hg]: https://www.linkedin.com/in/hao--gu/ 201 | [xc]: https://xuweiyichen.github.io/ 202 | [jw]: https://pwang.pw/ 203 | [zc]: https://sites.google.com/site/zezhoucheng/ 204 | 205 | -------------------------------------------------------------------------------- /configs/Base.yaml: -------------------------------------------------------------------------------- 1 | SOLVER: 2 | TYPE: "sgd" 3 | IMS_PER_BATCH: 32 4 | BASE_LR: 0.02 5 | STEPS: (19200, 25600) 6 | MAX_ITER: 32000 7 | WEIGHT_DECAY: 0.0001 8 | LR_SCHEDULER_NAME: "WarmupMultiStepLR" 9 | INPUT: 10 | MIN_SIZE_TRAIN: (256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640,) 11 | MIN_SIZE_TEST: 512 12 | MAX_SIZE_TRAIN: 4096 13 | MAX_SIZE_TEST: 4096 14 | TEST: 15 | VISIBILITY_THRES: 0.33333333 16 | TRUNCATION_THRES: 0.33333333 17 | EVAL_PERIOD: 16000 18 | DATASETS: 19 | TRAIN: ('KITTI_train', 'KITTI_val') 20 | TEST: ('KITTI_test',) 21 | CATEGORY_NAMES: ('pedestrian', 'car', 'cyclist', 'van', 'truck', 'tram', 'person') 22 | IGNORE_NAMES: "['dontcare', 'ignore', 'void']" 23 | MIN_HEIGHT_THRES: 0.05 24 | TRUNCATION_THRES: 0.75 25 | VISIBILITY_THRES: 0.25 26 | TRUNC_2D_BOXES: True 27 | VIS_PERIOD: 640 28 | DATALOADER: 29 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 30 | REPEAT_THRESHOLD: 0.1 31 | MODEL: 32 | PIXEL_MEAN: [103.530, 116.280, 123.675] 33 | PIXEL_STD: [57.375, 57.120, 58.395] 34 | META_ARCHITECTURE: "RCNN3D" 35 | MASK_ON: False 36 | STABILIZE: 0.02 37 | USE_BN: True 38 | BACKBONE: 39 | FREEZE_AT: 0 40 | NAME: 'build_dla_from_vision_fpn_backbone' 41 | DLA: 42 | TYPE: 'dla34' 43 | FPN: 44 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6'] 45 | ANCHOR_GENERATOR: 46 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 47 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 48 | RPN: 49 | HEAD_NAME: "StandardRPNHead" 50 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6'] 51 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 52 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 53 | POST_NMS_TOPK_TRAIN: 1000 54 | POST_NMS_TOPK_TEST: 1000 55 | BOUNDARY_THRESH: -1 56 | OBJECTNESS_UNCERTAINTY: "IoUness" 57 | IOU_THRESHOLDS: [0.05, 0.05] 58 | POSITIVE_FRACTION: 1.0 59 | PROPOSAL_GENERATOR: 60 | NAME: "RPNWithIgnore" 61 | ROI_HEADS: 62 | NAME: "ROIHeads3D" 63 | IN_FEATURES: ["p2", "p3", "p4", "p5", 'p6'] 64 | BATCH_SIZE_PER_IMAGE: 512 65 | SCORE_THRESH_TEST: 0.01 66 | NUM_CLASSES: 43 67 | ROI_BOX_HEAD: 68 | NAME: "FastRCNNConvFCHead" 69 | NUM_FC: 2 70 | POOLER_RESOLUTION: 7 71 | ROI_CUBE_HEAD: 72 | NAME: 'CubeHead' 73 | Z_TYPE: 'direct' 74 | POSE_TYPE: '6d' 75 | NUM_FC: 2 76 | SHARED_FC: True 77 | USE_CONFIDENCE: 1.0 78 | LOSS_W_3D: 1.0 79 | POOLER_TYPE: 'ROIAlignV2' 80 | POOLER_RESOLUTION: 7 81 | DIMS_PRIORS_ENABLED: False 82 | DISENTANGLED_LOSS: True 83 | ALLOCENTRIC_POSE: True 84 | VIRTUAL_FOCAL: 512.0 85 | VIRTUAL_DEPTH: True 86 | CHAMFER_POSE: True 87 | VERSION: 2 -------------------------------------------------------------------------------- /configs/GroundingDINO_SwinB_cfg.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_B_384_22k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 
2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = "bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 | use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /configs/OVMono3D_clip_SFP.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | INPUT: 10 | MIN_SIZE_TRAIN: (320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024,) 11 | MIN_SIZE_TEST: 608 12 | MAX_SIZE_TRAIN: 1024 13 | MAX_SIZE_TEST: 1024 14 | FORMAT: "RGB" 15 | TEST: 16 | EVAL_PERIOD: 29000 17 | VIS_PERIOD: 2320 18 | DATASETS: 19 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 20 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 21 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 22 | MODEL: 23 | PIXEL_MEAN: [123.675, 116.280, 103.530] 24 | PIXEL_STD: [58.395, 57.120, 57.375] # changed to rgb order 25 | BACKBONE: 26 | FREEZE_AT: 0 27 | NAME: 'build_clip_backbone' 28 | CLIP: 29 | ARCH: 'ViT-B-16' 30 | CHECKPOINT: 'openai' 31 | OUTPUT: 'dense' 32 | LAYER: -1 33 | RETURN_MULTILAYER: False 34 | FPN: 35 | IN_FEATURE: 'last_feat' 36 | NORM: 'LN' 37 | SQUARE_PAD: 1024 38 | ANCHOR_GENERATOR: 39 | SIZES: [[64], [128], [256], [512]] 40 | RPN: 41 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] 42 | ROI_HEADS: 43 | NUM_CLASSES: 50 44 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] -------------------------------------------------------------------------------- /configs/OVMono3D_dinov2_SFP.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | INPUT: 10 | MIN_SIZE_TRAIN: (280, 308, 336, 364, 392, 420, 448, 476, 504, 532, 560, 588, 616, 644, 672, 700, 728, 756, 784, 812, 840, 868, 896, 924, 952, 980, 1008,) 11 
| MIN_SIZE_TEST: 532 12 | MAX_SIZE_TRAIN: 896 13 | MAX_SIZE_TEST: 896 14 | FORMAT: "RGB" 15 | TEST: 16 | EVAL_PERIOD: 29000 17 | VIS_PERIOD: 2320 18 | DATASETS: 19 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 20 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 21 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 22 | MODEL: 23 | PIXEL_MEAN: [123.675, 116.280, 103.530] 24 | PIXEL_STD: [58.395, 57.120, 57.375] # changed to rgb order 25 | BACKBONE: 26 | FREEZE_AT: 0 27 | NAME: 'build_dino_backbone' 28 | DINO: 29 | NAME: 'dinov2' 30 | MODEL_NAME: 'vitb14' 31 | OUTPUT: 'dense' 32 | LAYER: -1 33 | RETURN_MULTILAYER: False 34 | FPN: 35 | IN_FEATURE: 'last_feat' 36 | NORM: 'LN' 37 | SQUARE_PAD: 896 38 | ANCHOR_GENERATOR: 39 | SIZES: [[64], [256], [512]] 40 | RPN: 41 | IN_FEATURES: ['p2', 'p3', 'p4'] 42 | ROI_HEADS: 43 | NUM_CLASSES: 50 44 | IN_FEATURES: ['p2', 'p3', 'p4'] -------------------------------------------------------------------------------- /configs/OVMono3D_mae_SFP.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | INPUT: 10 | MIN_SIZE_TRAIN: (320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024,) 11 | MIN_SIZE_TEST: 608 12 | MAX_SIZE_TRAIN: 1024 13 | MAX_SIZE_TEST: 1024 14 | FORMAT: "RGB" 15 | TEST: 16 | EVAL_PERIOD: 29000 17 | VIS_PERIOD: 2320 18 | DATASETS: 19 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 20 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 21 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 22 | MODEL: 23 | PIXEL_MEAN: [123.675, 116.280, 103.530] 24 | PIXEL_STD: [58.395, 57.120, 57.375] # changed to rgb order 25 | BACKBONE: 26 | FREEZE_AT: 0 27 | NAME: 'build_mae_backbone' 28 | MAE: 29 | CHECKPOINT: 'facebook/vit-mae-base' 30 | OUTPUT: 'dense' 31 | LAYER: -1 32 | RETURN_MULTILAYER: False 33 | FPN: 34 | IN_FEATURE: 'last_feat' 35 | NORM: 'LN' 36 | SQUARE_PAD: 1024 37 | ANCHOR_GENERATOR: 38 | SIZES: [[64], 
[128], [256], [512]] 39 | RPN: 40 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] 41 | ROI_HEADS: 42 | NUM_CLASSES: 50 43 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] -------------------------------------------------------------------------------- /configs/OVMono3D_midas_SFP.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | INPUT: 10 | MIN_SIZE_TRAIN: (320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024,) 11 | MIN_SIZE_TEST: 608 12 | MAX_SIZE_TRAIN: 1024 13 | MAX_SIZE_TEST: 1024 14 | FORMAT: "RGB" 15 | TEST: 16 | EVAL_PERIOD: 29000 17 | VIS_PERIOD: 2320 18 | DATASETS: 19 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 20 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 21 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 22 | MODEL: 23 | PIXEL_MEAN: [123.675, 116.280, 103.530] 24 | PIXEL_STD: [58.395, 57.120, 57.375] # changed to rgb order 25 | BACKBONE: 26 | FREEZE_AT: 0 27 | NAME: 'build_midas_backbone' 28 | MIDAS: 29 | OUTPUT: 'dense' 30 | LAYER: -1 31 | RETURN_MULTILAYER: False 32 | FPN: 33 | IN_FEATURE: 'last_feat' 34 | NORM: 'LN' 35 | SQUARE_PAD: 1024 36 | ANCHOR_GENERATOR: 37 | SIZES: [[64], [128], [256], [512]] 38 | RPN: 39 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] 40 | ROI_HEADS: 41 | NUM_CLASSES: 50 42 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] -------------------------------------------------------------------------------- /configs/OVMono3D_sam_SFP.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | INPUT: 10 | MIN_SIZE_TRAIN: (320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024,) 11 | MIN_SIZE_TEST: 608 12 | MAX_SIZE_TRAIN: 1024 13 | MAX_SIZE_TEST: 1024 14 | FORMAT: "RGB" 15 | TEST: 16 | EVAL_PERIOD: 29000 17 | VIS_PERIOD: 2320 18 | DATASETS: 19 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 20 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 21 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 
'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 22 | MODEL: 23 | PIXEL_MEAN: [123.675, 116.280, 103.530] 24 | PIXEL_STD: [58.395, 57.120, 57.375] # changed to rgb order 25 | BACKBONE: 26 | FREEZE_AT: 0 27 | NAME: 'build_sam_backbone' 28 | MIDAS: 29 | OUTPUT: 'dense' 30 | LAYER: -1 31 | RETURN_MULTILAYER: False 32 | FPN: 33 | IN_FEATURE: 'last_feat' 34 | NORM: 'LN' 35 | SQUARE_PAD: 1024 36 | ANCHOR_GENERATOR: 37 | SIZES: [[64], [128], [256], [512]] 38 | RPN: 39 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] 40 | ROI_HEADS: 41 | NUM_CLASSES: 50 42 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] -------------------------------------------------------------------------------- /configs/category_meta.json: -------------------------------------------------------------------------------- 1 | {"_comment": "97 classes", "thing_classes": ["pedestrian", "car", "dontcare", "cyclist", "van", "truck", "tram", "person", "traffic cone", "barrier", "motorcycle", "bicycle", "bus", "trailer", "books", "bottle", "camera", "cereal box", "chair", "cup", "laptop", "shoes", "towel", "blinds", "window", "lamp", "shelves", "mirror", "sink", "cabinet", "bathtub", "door", "toilet", "desk", "box", "bookcase", "picture", "table", "counter", "bed", "night stand", "dresser", "pillow", "sofa", "television", "floor mat", "curtain", "clothes", "stationery", "refrigerator", "board", "kitchen pan", "bin", "stove", "microwave", "plates", "bowl", "oven", "vase", "faucet", "tissues", "machine", "printer", "monitor", "podium", "cart", "projector", "electronics", "computer", "air conditioner", "drawers", "coffee maker", "toaster", "potted plant", "painting", "bag", "tray", "keyboard", "blanket", "rack", "phone", "mouse", "fire extinguisher", "toys", "ladder", "fan", "glass", "clock", "toilet paper", "closet", "fume hood", "utensils", "soundsystem", "shower curtain", "remote", "pen", "fireplace"], "thing_dataset_id_to_contiguous_id": {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "12": 12, "13": 13, "14": 14, "15": 15, "16": 16, "17": 17, "18": 18, "19": 19, "20": 20, "21": 21, "22": 22, "23": 23, "24": 24, "25": 25, "26": 26, "27": 27, "28": 28, "29": 29, "30": 30, "31": 31, "32": 32, "33": 33, "34": 34, "35": 35, "36": 36, "37": 37, "38": 38, "39": 39, "40": 40, "41": 41, "42": 42, "43": 43, "44": 44, "45": 45, "46": 46, "47": 47, "48": 48, "49": 49, "50": 50, "51": 51, "52": 52, "53": 53, "54": 54, "55": 55, "56": 56, "57": 57, "58": 58, "59": 59, "60": 60, "61": 61, "62": 62, "63": 63, "64": 64, "65": 65, "66": 66, "67": 67, "68": 68, "69": 69, "70": 70, "71": 71, "72": 72, "73": 73, "74": 74, "75": 75, "76": 76, "77": 77, "78": 78, "79": 79, "80": 80, "81": 81, "82": 82, "83": 83, "84": 84, "85": 85, "86": 86, "87": 87, "88": 88, "89": 89, "90": 90, "91": 91, "92": 92, "94": 93, "95": 94, "96": 95, "97": 96}} -------------------------------------------------------------------------------- /configs/category_meta50.json: -------------------------------------------------------------------------------- 1 | {"_comment": "50 classes", "thing_classes": ["pedestrian", "car", "cyclist", "van", "truck", "traffic cone", "barrier", "motorcycle", "bicycle", "bus", "trailer", "books", "bottle", "camera", "cereal box", "chair", "cup", "laptop", "shoes", "towel", "blinds", "window", "lamp", "shelves", "mirror", "sink", 
"cabinet", "bathtub", "door", "toilet", "desk", "box", "bookcase", "picture", "table", "counter", "bed", "night stand", "pillow", "sofa", "television", "floor mat", "curtain", "clothes", "stationery", "refrigerator", "bin", "stove", "oven", "machine"], "thing_dataset_id_to_contiguous_id": {"0": 0, "1": 1, "3": 2, "4": 3, "5": 4, "8": 5, "9": 6, "10": 7, "11": 8, "12": 9, "13": 10, "14": 11, "15": 12, "16": 13, "17": 14, "18": 15, "19": 16, "20": 17, "21": 18, "22": 19, "23": 20, "24": 21, "25": 22, "26": 23, "27": 24, "28": 25, "29": 26, "30": 27, "31": 28, "32": 29, "33": 30, "34": 31, "35": 32, "36": 33, "37": 34, "38": 35, "39": 36, "40": 37, "42": 38, "43": 39, "44": 40, "45": 41, "46": 42, "47": 43, "48": 44, "49": 45, "52": 46, "53": 47, "57": 48, "61": 49}} -------------------------------------------------------------------------------- /cubercnn/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import * -------------------------------------------------------------------------------- /cubercnn/config/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.config import CfgNode as CN 3 | 4 | def get_cfg_defaults(cfg): 5 | 6 | # A list of category names which will be used 7 | cfg.DATASETS.CATEGORY_NAMES = [] 8 | 9 | # The category names which will be treated as ignore 10 | # e.g., not counting as background during training 11 | # or as false positives during evaluation. 12 | cfg.DATASETS.IGNORE_NAMES = [] 13 | 14 | # Should the datasets appear with the same probabilty 15 | # in batches (e.g., the imbalance from small and large 16 | # datasets will be accounted for during sampling) 17 | cfg.DATALOADER.BALANCE_DATASETS = False 18 | 19 | # The thresholds for when to treat a known box 20 | # as ignore based on too heavy of truncation or 21 | # too low of visibility in the image. This affects 22 | # both training and evaluation ignores. 23 | cfg.DATASETS.TRUNCATION_THRES = 0.99 24 | cfg.DATASETS.VISIBILITY_THRES = 0.01 25 | cfg.DATASETS.MIN_HEIGHT_THRES = 0.00 26 | cfg.DATASETS.MAX_DEPTH = 1e8 27 | 28 | # Whether modal 2D boxes should be loaded, 29 | # or if the full 3D projected boxes should be used. 30 | cfg.DATASETS.MODAL_2D_BOXES = False 31 | 32 | # Whether truncated 2D boxes should be loaded, 33 | # or if the 3D full projected boxes should be used. 
34 | cfg.DATASETS.TRUNC_2D_BOXES = True 35 | 36 | cfg.DATASETS.TEST_BASE = ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 37 | cfg.DATASETS.TEST_NOVEL = ('SUNRGBD_test_novel','ARKitScenes_test_novel', 'KITTI_test_novel') 38 | cfg.DATASETS.CATEGORY_NAMES_BASE = ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 39 | cfg.DATASETS.CATEGORY_NAMES_NOVEL = ('monitor', 'bag', 'dresser', 'board', 'printer', 'keyboard', 'painting', 'drawers', 'microwave', 'computer', 'kitchen pan', 'potted plant', 'tissues', 'rack', 'tray', 'toys', 'phone', 'podium', 'cart', 'soundsystem', 'fireplace', 'tram') 40 | 41 | # Oracle 2D files for evaluation 42 | cfg.DATASETS.ORACLE2D_FILES = CN() 43 | cfg.DATASETS.ORACLE2D_FILES.EVAL_MODE = 'target_aware' # 'target_aware' or 'previous_metric' 44 | 45 | # Create a configuration for each evaluation mode 46 | for mode in ['target_aware', 'previous_metric']: 47 | cfg.DATASETS.ORACLE2D_FILES[mode] = CN() 48 | cfg.DATASETS.ORACLE2D_FILES[mode].novel = CN() 49 | cfg.DATASETS.ORACLE2D_FILES[mode].base = CN() 50 | 51 | # Oracle 2D file for the Novel class dataset 52 | novel_datasets = { 53 | 'SUNRGBD_test_novel': 'sunrgbd', 54 | 'ARKitScenes_test_novel': 'arkitscenes', 55 | 'KITTI_test_novel': 'kitti' 56 | } 57 | 58 | # Oracle 2D file for the Base class dataset 59 | base_datasets = { 60 | 'SUNRGBD_test': 'sunrgbd', 61 | 'Hypersim_test': 'hypersim', 62 | 'ARKitScenes_test': 'arkitscenes', 63 | 'Objectron_test': 'objectron', 64 | 'KITTI_test': 'kitti', 65 | 'nuScenes_test': 'nuscenes' 66 | } 67 | 68 | # Set the file path for the novel class 69 | for dataset, dataset_name in novel_datasets.items(): 70 | prefix = 'gdino_novel_previous_metric' if mode == 'previous_metric' else 'gdino' 71 | cfg.DATASETS.ORACLE2D_FILES[mode].novel[dataset] = f'datasets/Omni3D/{prefix}_{dataset_name}_novel_oracle_2d.json' 72 | 73 | # Set the file path for the base class 74 | for dataset, dataset_name in base_datasets.items(): 75 | prefix = 'gdino_previous_eval' if mode == 'previous_metric' else 'gdino' 76 | cfg.DATASETS.ORACLE2D_FILES[mode].base[dataset] = f'datasets/Omni3D/{prefix}_{dataset_name}_base_oracle_2d.json' 77 | 78 | cfg.MODEL.FPN.IN_FEATURE = None 79 | cfg.MODEL.FPN.SQUARE_PAD = 0 80 | # Threshold used for matching and filtering boxes 81 | # inside of ignore regions, within the RPN and ROIHeads 82 | cfg.MODEL.RPN.IGNORE_THRESHOLD = 0.5 83 | 84 | cfg.MODEL.DINO = CN() 85 | cfg.MODEL.DINO.NAME = 'dinov2' 86 | cfg.MODEL.DINO.MODEL_NAME = 'vitb14' 87 | cfg.MODEL.DINO.OUTPUT = 'dense' 88 | cfg.MODEL.DINO.LAYER = -1 89 | cfg.MODEL.DINO.RETURN_MULTILAYER = False 90 | 91 | cfg.MODEL.MAE = CN() 92 | cfg.MODEL.MAE.CHECKPOINT = 'facebook/vit-mae-base' 93 | cfg.MODEL.MAE.OUTPUT = 'dense' 94 | cfg.MODEL.MAE.LAYER = -1 95 | cfg.MODEL.MAE.RETURN_MULTILAYER = False 96 | 97 | cfg.MODEL.CLIP = CN() 98 | cfg.MODEL.CLIP.ARCH = 'ViT-B-16' 99 | cfg.MODEL.CLIP.CHECKPOINT = 'openai' 100 | cfg.MODEL.CLIP.OUTPUT = 'dense' 101 | cfg.MODEL.CLIP.LAYER = -1 102 | cfg.MODEL.CLIP.RETURN_MULTILAYER = False 103 | 
104 | cfg.MODEL.MIDAS = CN() 105 | cfg.MODEL.MIDAS.OUTPUT = 'dense' 106 | cfg.MODEL.MIDAS.LAYER = -1 107 | cfg.MODEL.MIDAS.RETURN_MULTILAYER = False 108 | 109 | cfg.MODEL.SAM = CN() 110 | cfg.MODEL.SAM.OUTPUT = 'dense' 111 | cfg.MODEL.SAM.LAYER = -1 112 | cfg.MODEL.SAM.RETURN_MULTILAYER = False 113 | 114 | # Configuration for cube head 115 | cfg.MODEL.ROI_CUBE_HEAD = CN() 116 | cfg.MODEL.ROI_CUBE_HEAD.NAME = "CubeHead" 117 | cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION = 7 118 | cfg.MODEL.ROI_CUBE_HEAD.POOLER_SAMPLING_RATIO = 0 119 | cfg.MODEL.ROI_CUBE_HEAD.POOLER_TYPE = "ROIAlignV2" 120 | 121 | # Settings for the cube head features 122 | cfg.MODEL.ROI_CUBE_HEAD.NUM_CONV = 0 123 | cfg.MODEL.ROI_CUBE_HEAD.CONV_DIM = 256 124 | cfg.MODEL.ROI_CUBE_HEAD.NUM_FC = 2 125 | cfg.MODEL.ROI_CUBE_HEAD.FC_DIM = 1024 126 | cfg.MODEL.ROI_CUBE_HEAD.USE_TRANSFORMER = False 127 | 128 | # the style to predict Z with currently supported 129 | # options --> ['direct', 'sigmoid', 'log', 'clusters'] 130 | cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE = "direct" 131 | 132 | # the style to predict pose with currently supported 133 | # options --> ['6d', 'euler', 'quaternion'] 134 | cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE = "6d" 135 | 136 | # Whether to scale all 3D losses by inverse depth 137 | cfg.MODEL.ROI_CUBE_HEAD.INVERSE_Z_WEIGHT = False 138 | 139 | # Virtual depth puts all predictions of depth into 140 | # a shared virtual space with a shared focal length. 141 | cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH = True 142 | cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL = 512.0 143 | 144 | # If true, then all losses are computed using the 8 corners 145 | # such that they are all in a shared scale space. 146 | # E.g., their scale correlates with their impact on 3D IoU. 147 | # This way no manual weights need to be set. 148 | cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS = True 149 | 150 | # When > 1, the outputs of the 3D head will be based on 151 | # a 2D scale clustering, based on 2D proposal height/width. 152 | # This parameter describes the number of bins to cluster. 153 | cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS = 1 154 | 155 | # Whether batch norm is enabled during training. 156 | # If false, all BN weights will be frozen. 157 | cfg.MODEL.USE_BN = True 158 | 159 | # Whether to predict the pose in allocentric space. 160 | # The allocentric space may correlate better with 2D 161 | # images compared to egocentric poses. 162 | cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE = True 163 | 164 | # Whether to use chamfer distance for disentangled losses 165 | # of pose. This avoids periodic issues of rotation but 166 | # may prevent the pose "direction" from being interpretable. 167 | cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE = True 168 | 169 | # Should the prediction heads share FC features or not. 170 | # These include groups of uv, z, whl, pose. 171 | cfg.MODEL.ROI_CUBE_HEAD.SHARED_FC = True 172 | 173 | # Check for stable gradients. When inf is detected, skip the update. 174 | # This prevents an occasional bad sample from exploding the model. 175 | # The threshold below is the allows percent of bad samples. 176 | # 0.0 is off, and 0.01 is recommended for minor robustness to exploding. 177 | cfg.MODEL.STABILIZE = 0.01 178 | 179 | # Whether or not to use the dimension priors 180 | cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED = True 181 | 182 | # How prior dimensions should be computed? 183 | # The supported modes are ["exp", "sigmoid"] 184 | # where exp is unbounded and sigmoid is bounded 185 | # between +- 3 standard deviations from the mean. 
186 | cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC = 'exp' 187 | 188 | # weight for confidence loss. 0 is off. 189 | cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE = 1.0 190 | 191 | # Loss weights for XY, Z, Dims, Pose 192 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D = 1.0 193 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_XY = 1.0 194 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z = 1.0 195 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS = 1.0 196 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE = 1.0 197 | 198 | cfg.MODEL.DLA = CN() 199 | 200 | # Supported types for DLA backbones are... 201 | # dla34, dla46_c, dla46x_c, dla60x_c, dla60, dla60x, dla102x, dla102x2, dla169 202 | cfg.MODEL.DLA.TYPE = 'dla34' 203 | 204 | # Only available for dla34, dla60, dla102 205 | cfg.MODEL.DLA.TRICKS = False 206 | 207 | # A joint loss for the disentangled loss. 208 | # All predictions are computed using a corner 209 | # or chamfers loss depending on chamfer_pose! 210 | # Recommended to keep this weight small: [0.05, 0.5] 211 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_JOINT = 1.0 212 | 213 | # sgd, adam, adam+amsgrad, adamw, adamw+amsgrad 214 | cfg.SOLVER.TYPE = 'sgd' 215 | 216 | cfg.MODEL.RESNETS.TORCHVISION = True 217 | cfg.TEST.DETECTIONS_PER_IMAGE = 100 218 | 219 | cfg.TEST.VISIBILITY_THRES = 1/2.0 220 | cfg.TEST.TRUNCATION_THRES = 1/2.0 221 | 222 | # If ORACLE2D is True, the oracle 2D bboxes and categories will be loaded during evaluation. 223 | cfg.TEST.ORACLE2D = True 224 | cfg.TEST.CAT_MODE = "base" # "base" or "novel" or "all" 225 | 226 | cfg.INPUT.RANDOM_FLIP = "horizontal" 227 | cfg.INPUT.TRAIN_SET_PERCENTAGE = 1.0 228 | # When True, we will use localization uncertainty 229 | # as the new IoUness score in the RPN. 230 | cfg.MODEL.RPN.OBJECTNESS_UNCERTAINTY = 'IoUness' 231 | 232 | # If > 0.0 this is the scaling factor that will be applied to 233 | # an RoI 2D box before doing any pooling to give more context. 234 | # Ex. 1.5 makes width and height 50% larger. 235 | cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES = 0.0 236 | 237 | # weight path specifically for pretraining (no checkpointables will be loaded) 238 | cfg.MODEL.WEIGHTS_PRETRAIN = '' -------------------------------------------------------------------------------- /cubercnn/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .dataset_mapper import * 3 | from .build import * 4 | from .builtin import * -------------------------------------------------------------------------------- /cubercnn/data/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc.
and affiliates 2 | import itertools 3 | import logging 4 | import numpy as np 5 | import math 6 | import json 7 | from collections import defaultdict 8 | import torch 9 | import torch.utils.data 10 | 11 | from detectron2.config import configurable 12 | from detectron2.utils.logger import _log_api_usage 13 | 14 | from detectron2.data.catalog import DatasetCatalog 15 | from detectron2.data.common import DatasetFromList, MapDataset 16 | from detectron2.data.dataset_mapper import DatasetMapper 17 | from detectron2.data.samplers import ( 18 | InferenceSampler, 19 | RepeatFactorTrainingSampler, 20 | TrainingSampler 21 | ) 22 | from detectron2.data.build import ( 23 | filter_images_with_only_crowd_annotations, 24 | build_batch_data_loader, 25 | trivial_batch_collator 26 | ) 27 | import random 28 | 29 | 30 | def sample_by_percentage(data_list, percentage, seed=None): 31 | if seed is not None: 32 | random.seed(seed) 33 | sample_size = int(len(data_list) * percentage) 34 | return random.sample(data_list, sample_size) 35 | 36 | def xywh_to_xyxy(bbox): 37 | x, y, w, h = bbox 38 | x_min = x 39 | y_min = y 40 | x_max = x + w 41 | y_max = y + h 42 | return [x_min, y_min, x_max, y_max] 43 | 44 | 45 | def merge_oracle2d_to_detection_dicts(dataset_dicts, oracle2d): 46 | for dataset, oracle in zip(dataset_dicts, oracle2d): 47 | with open(oracle, 'r') as file: 48 | oracle_data = json.load(file) 49 | for data_dict, oracle_dict in zip(dataset,oracle_data): 50 | assert data_dict['image_id'] == oracle_dict['image_id'] 51 | data_dict["oracle2D"] = {"gt_bbox2D": torch.tensor([xywh_to_xyxy(instance["bbox"]) for instance in oracle_dict["instances"]]), 52 | "gt_classes": torch.tensor([instance["category_id"] for instance in oracle_dict["instances"]]), 53 | "gt_scores": torch.tensor([instance["score"] for instance in oracle_dict["instances"]]), 54 | } 55 | 56 | 57 | def get_detection_dataset_dicts(names, filter_empty=True, oracle2d=None, **kwargs): 58 | 59 | if isinstance(names, str): 60 | names = [names] 61 | 62 | assert len(names), names 63 | dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names] 64 | if oracle2d: 65 | merge_oracle2d_to_detection_dicts(dataset_dicts, oracle2d) 66 | for dataset_name, dicts in zip(names, dataset_dicts): 67 | assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) 68 | 69 | dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) 70 | 71 | has_instances = "annotations" in dataset_dicts[0] 72 | 73 | if filter_empty and has_instances: 74 | dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts) 75 | 76 | assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names)) 77 | return dataset_dicts 78 | 79 | 80 | def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None, dataset_id_to_src=None): 81 | if dataset is None: 82 | dataset = get_detection_dataset_dicts( 83 | cfg.DATASETS.TRAIN, 84 | filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, 85 | min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE 86 | if cfg.MODEL.KEYPOINT_ON 87 | else 0, 88 | proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, 89 | ) 90 | _log_api_usage("dataset." 
+ cfg.DATASETS.TRAIN[0]) 91 | 92 | if cfg.INPUT.TRAIN_SET_PERCENTAGE != 1.0: 93 | dataset = sample_by_percentage(dataset, cfg.INPUT.TRAIN_SET_PERCENTAGE, seed=42) 94 | logger = logging.getLogger(__name__) 95 | logger.info("Using {} training images".format(len(dataset))) 96 | if mapper is None: 97 | mapper = DatasetMapper(cfg, True) 98 | 99 | if sampler is None: 100 | sampler_name = cfg.DATALOADER.SAMPLER_TRAIN 101 | balance_datasets = cfg.DATALOADER.BALANCE_DATASETS 102 | logger = logging.getLogger(__name__) 103 | logger.info("Using training sampler {}".format(sampler_name)) 104 | 105 | if balance_datasets: 106 | assert dataset_id_to_src is not None, 'Need dataset sources.' 107 | 108 | dataset_source_to_int = {val:i for i, val in enumerate(set(dataset_id_to_src.values()))} 109 | dataset_ids_per_img = [dataset_source_to_int[dataset_id_to_src[img['dataset_id']]] for img in dataset] 110 | dataset_ids = np.unique(dataset_ids_per_img) 111 | 112 | # only one source? don't re-weight then. 113 | if len(dataset_ids) == 1: 114 | weights_per_img = torch.ones(len(dataset_ids_per_img)).float() 115 | 116 | # compute per-dataset weights. 117 | else: 118 | counts = np.bincount(dataset_ids_per_img) 119 | counts = [counts[id] for id in dataset_ids] 120 | weights = [1 - count/np.sum(counts) for count in counts] 121 | weights = [weight/np.min(weights) for weight in weights] 122 | 123 | weights_per_img = torch.zeros(len(dataset_ids_per_img)).float() 124 | dataset_ids_per_img = torch.FloatTensor(dataset_ids_per_img).long() 125 | 126 | # copy weights 127 | for dataset_id, weight in zip(dataset_ids, weights): 128 | weights_per_img[dataset_ids_per_img == dataset_id] = weight 129 | 130 | # no special sampling whatsoever 131 | if sampler_name == "TrainingSampler" and not balance_datasets: 132 | sampler = TrainingSampler(len(dataset)) 133 | 134 | # balance the weight sampling by datasets 135 | elif sampler_name == "TrainingSampler" and balance_datasets: 136 | sampler = RepeatFactorTrainingSampler(weights_per_img) 137 | 138 | # balance the weight sampling by categories 139 | elif sampler_name == "RepeatFactorTrainingSampler" and not balance_datasets: 140 | repeat_factors = repeat_factors_from_category_frequency( 141 | dataset, cfg.DATALOADER.REPEAT_THRESHOLD 142 | ) 143 | sampler = RepeatFactorTrainingSampler(repeat_factors) 144 | 145 | # balance the weight sampling by categories AND by dataset frequency 146 | elif sampler_name == "RepeatFactorTrainingSampler" and balance_datasets: 147 | repeat_factors = repeat_factors_from_category_frequency( 148 | dataset, cfg.DATALOADER.REPEAT_THRESHOLD 149 | ) 150 | repeat_factors *= weights_per_img 151 | repeat_factors /= repeat_factors.min().item() 152 | sampler = RepeatFactorTrainingSampler(repeat_factors) 153 | else: 154 | raise ValueError("Unknown training sampler: {}".format(sampler_name)) 155 | 156 | return { 157 | "dataset": dataset, 158 | "sampler": sampler, 159 | "mapper": mapper, 160 | "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, 161 | "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, 162 | "num_workers": cfg.DATALOADER.NUM_WORKERS, 163 | } 164 | 165 | 166 | def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh): 167 | """ 168 | Compute (fractional) per-image repeat factors based on category frequency. 169 | The repeat factor for an image is a function of the frequency of the rarest 170 | category labeled in that image. 
The "frequency of category c" in [0, 1] is defined 171 | as the fraction of images in the training set (without repeats) in which category c 172 | appears. 173 | See :paper:`lvis` (>= v2) Appendix B.2. 174 | 175 | Args: 176 | dataset_dicts (list[dict]): annotations in Detectron2 dataset format. 177 | repeat_thresh (float): frequency threshold below which data is repeated. 178 | If the frequency is half of `repeat_thresh`, the image will be 179 | repeated twice. 180 | 181 | Returns: 182 | torch.Tensor: 183 | the i-th element is the repeat factor for the dataset image at index i. 184 | """ 185 | # 1. For each category c, compute the fraction of images that contain it: f(c) 186 | category_freq = defaultdict(int) 187 | for dataset_dict in dataset_dicts: # For each image (without repeats) 188 | cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} 189 | for cat_id in cat_ids: 190 | if cat_id < 0: continue 191 | category_freq[cat_id] += 1 192 | num_images = len(dataset_dicts) 193 | for k, v in category_freq.items(): 194 | category_freq[k] = v / num_images 195 | 196 | # 2. For each category c, compute the category-level repeat factor: 197 | # r(c) = max(1, sqrt(t / f(c))) 198 | category_rep = { 199 | cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) 200 | for cat_id, cat_freq in category_freq.items() 201 | } 202 | 203 | # 3. For each image I, compute the image-level repeat factor: 204 | # r(I) = max_{c in I} r(c) 205 | rep_factors = [] 206 | for dataset_dict in dataset_dicts: 207 | cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} 208 | rep_factor = max({category_rep[cat_id] for cat_id in cat_ids if cat_id >= 0}, default=1.0) 209 | rep_factors.append(rep_factor) 210 | 211 | return torch.tensor(rep_factors, dtype=torch.float32) 212 | 213 | @configurable(from_config=_train_loader_from_config) 214 | def build_detection_train_loader(dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0): 215 | if isinstance(dataset, list): 216 | dataset = DatasetFromList(dataset, copy=False) 217 | if mapper is not None: 218 | dataset = MapDataset(dataset, mapper) 219 | if sampler is None: 220 | sampler = TrainingSampler(len(dataset)) 221 | assert isinstance(sampler, torch.utils.data.sampler.Sampler) 222 | return build_batch_data_loader( 223 | dataset, 224 | sampler, 225 | total_batch_size, 226 | aspect_ratio_grouping=aspect_ratio_grouping, 227 | num_workers=num_workers 228 | ) 229 | 230 | def _test_loader_from_config(cfg, dataset_name, mode, mapper=None): 231 | if isinstance(dataset_name, str): 232 | dataset_name = [dataset_name] 233 | dataset = get_detection_dataset_dicts( 234 | dataset_name, 235 | filter_empty=False, 236 | oracle2d=[ 237 | getattr(getattr(cfg.DATASETS.ORACLE2D_FILES[cfg.DATASETS.ORACLE2D_FILES.EVAL_MODE], mode), x) for x in dataset_name 238 | ] 239 | if cfg.TEST.ORACLE2D 240 | else None, 241 | proposal_files=[ 242 | cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name 243 | ] 244 | if cfg.MODEL.LOAD_PROPOSALS 245 | else None, 246 | ) 247 | if mapper is None: 248 | mapper = DatasetMapper(cfg, False) 249 | 250 | return {"dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS} 251 | 252 | @configurable(from_config=_test_loader_from_config) 253 | def build_detection_test_loader(dataset, *, mapper, sampler=None, num_workers=0): 254 | 255 | if isinstance(dataset, list): 256 | dataset = DatasetFromList(dataset, copy=False) 257 | if mapper is not None: 258 | dataset = 
MapDataset(dataset, mapper) 259 | if sampler is None: 260 | sampler = InferenceSampler(len(dataset)) 261 | 262 | # Always use 1 image per worker during inference since this is the 263 | # standard when reporting inference time in papers. 264 | batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False) 265 | data_loader = torch.utils.data.DataLoader( 266 | dataset, 267 | num_workers=num_workers, 268 | batch_sampler=batch_sampler, 269 | collate_fn=trivial_batch_collator, 270 | ) 271 | return data_loader 272 | 273 | -------------------------------------------------------------------------------- /cubercnn/data/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | 3 | def get_omni3d_categories(dataset="omni3d"): 4 | """ 5 | Returns the Omni3D categories for dataset 6 | Args: 7 | dataset: str 8 | Returns: 9 | cats: set of strings with category names 10 | """ 11 | 12 | if dataset == "omni3d": 13 | cats = set({'chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin'}) 14 | assert len(cats) == 50 15 | elif dataset == "omni3d_in": 16 | cats = set({'stationery', 'sink', 'table', 'floor mat', 'bottle', 'bookcase', 'bin', 'blinds', 'pillow', 'bicycle', 'refrigerator', 'night stand', 'chair', 'sofa', 'books', 'oven', 'towel', 'cabinet', 'window', 'curtain', 'bathtub', 'laptop', 'desk', 'television', 'clothes', 'stove', 'cup', 'shelves', 'box', 'shoes', 'mirror', 'door', 'picture', 'lamp', 'machine', 'counter', 'bed', 'toilet'}) 17 | assert len(cats) == 38 18 | elif dataset == "omni3d_out": 19 | cats = set({'cyclist', 'pedestrian', 'trailer', 'bus', 'motorcycle', 'car', 'barrier', 'truck', 'van', 'traffic cone', 'bicycle'}) 20 | assert len(cats) == 11 21 | elif dataset in ["SUNRGBD_train", "SUNRGBD_val", "SUNRGBD_test"]: 22 | cats = set({'bicycle', 'books', 'bottle', 'chair', 'cup', 'laptop', 'shoes', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator', 'bin', 'stove', 'oven', 'machine'}) 23 | assert len(cats) == 38 24 | elif dataset in ["Hypersim_train", "Hypersim_val"]: 25 | cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'}) 26 | assert len(cats) == 29 27 | elif dataset == "Hypersim_test": 28 | # Hypersim test annotation does not contain toilet 29 | cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'}) 30 
| assert len(cats) == 28 31 | elif dataset in ["ARKitScenes_train", "ARKitScenes_val", "ARKitScenes_test"]: 32 | cats = set({'table', 'bed', 'sofa', 'television', 'refrigerator', 'chair', 'oven', 'machine', 'stove', 'shelves', 'sink', 'cabinet', 'bathtub', 'toilet'}) 33 | assert len(cats) == 14 34 | elif dataset in ["Objectron_train", "Objectron_val", "Objectron_test"]: 35 | cats = set({'bicycle', 'books', 'bottle', 'camera', 'cereal box', 'chair', 'cup', 'laptop', 'shoes'}) 36 | assert len(cats) == 9 37 | elif dataset in ["KITTI_train", "KITTI_val", "KITTI_test"]: 38 | cats = set({'pedestrian', 'car', 'cyclist', 'van', 'truck'}) 39 | assert len(cats) == 5 40 | elif dataset in ["nuScenes_train", "nuScenes_val", "nuScenes_test"]: 41 | cats = set({'pedestrian', 'car', 'truck', 'traffic cone', 'barrier', 'motorcycle', 'bicycle', 'bus', 'trailer'}) 42 | assert len(cats) == 9 43 | elif dataset in [ "SUNRGBD_test_novel"]: 44 | cats = set({'monitor', 'bag', 'dresser', 'board', 'printer', 'keyboard', 'painting', 'drawers', 'microwave', 'computer', 'kitchen pan', 'potted plant', 'tissues', 'rack', 'tray', 'toys', 'phone', 'podium', 'cart', 'soundsystem'}) 45 | assert len(cats) == 20 46 | elif dataset in [ "ARKitScenes_test_novel"]: 47 | cats = set({'fireplace'}) 48 | assert len(cats) == 1 49 | elif dataset in [ "KITTI_test_novel"]: 50 | cats = set({'tram'}) 51 | assert len(cats) == 1 52 | else: 53 | raise ValueError("%s dataset is not registered." % (dataset)) 54 | 55 | return cats -------------------------------------------------------------------------------- /cubercnn/data/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | import copy 3 | import torch 4 | import numpy as np 5 | from detectron2.structures import BoxMode, Keypoints 6 | from detectron2.data import detection_utils 7 | from detectron2.data import transforms as T 8 | from detectron2.data import ( 9 | DatasetMapper 10 | ) 11 | from detectron2.structures import ( 12 | Boxes, 13 | BoxMode, 14 | Instances, 15 | ) 16 | 17 | class DatasetMapper3D(DatasetMapper): 18 | 19 | def __call__(self, dataset_dict): 20 | 21 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 22 | 23 | image = detection_utils.read_image(dataset_dict["file_name"], format=self.image_format) 24 | detection_utils.check_image_size(dataset_dict, image) 25 | 26 | aug_input = T.AugInput(image) 27 | transforms = self.augmentations(aug_input) 28 | image = aug_input.image 29 | 30 | image_shape = image.shape[:2] # h, w 31 | 32 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 33 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 34 | # Therefore it's important to use torch.Tensor. 
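        # read_image returns an HWC (height, width, channels) array; transpose(2, 0, 1)
        # reorders it to the CHW layout expected by the model, and np.ascontiguousarray
        # guarantees a contiguous buffer before it is wrapped as a torch tensor.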
35 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 36 | 37 | # no need for additional processing at inference 38 | if not self.is_train: 39 | return dataset_dict 40 | 41 | if "annotations" in dataset_dict: 42 | 43 | dataset_id = dataset_dict['dataset_id'] 44 | K = np.array(dataset_dict['K']) 45 | 46 | unknown_categories = self.dataset_id_to_unknown_cats[dataset_id] 47 | 48 | # transform and pop off annotations 49 | annos = [ 50 | transform_instance_annotations(obj, transforms, K=K) 51 | for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 52 | ] 53 | 54 | # convert to instance format 55 | instances = annotations_to_instances(annos, image_shape, unknown_categories) 56 | dataset_dict["instances"] = detection_utils.filter_empty_instances(instances) 57 | 58 | return dataset_dict 59 | 60 | ''' 61 | Cached for mirroring annotations 62 | ''' 63 | _M1 = np.array([ 64 | [1, 0, 0], 65 | [0, -1, 0], 66 | [0, 0, -1] 67 | ]) 68 | _M2 = np.array([ 69 | [-1., 0., 0.], 70 | [ 0., -1., 0.], 71 | [ 0., 0., 1.] 72 | ]) 73 | 74 | 75 | def transform_instance_annotations(annotation, transforms, *, K): 76 | 77 | if isinstance(transforms, (tuple, list)): 78 | transforms = T.TransformList(transforms) 79 | 80 | # bbox is 1d (per-instance bounding box) 81 | bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) 82 | bbox = transforms.apply_box(np.array([bbox]))[0] 83 | 84 | annotation["bbox"] = bbox 85 | annotation["bbox_mode"] = BoxMode.XYXY_ABS 86 | 87 | if annotation['center_cam'][2] != 0: 88 | 89 | # project the 3D box annotation XYZ_3D to screen 90 | point3D = annotation['center_cam'] 91 | point2D = K @ np.array(point3D) 92 | point2D[:2] = point2D[:2] / point2D[-1] 93 | annotation["center_cam_proj"] = point2D.tolist() 94 | 95 | # apply coords transforms to 2D box 96 | annotation["center_cam_proj"][0:2] = transforms.apply_coords( 97 | point2D[np.newaxis][:, :2] 98 | )[0].tolist() 99 | 100 | keypoints = (K @ np.array(annotation["bbox3D_cam"]).T).T 101 | keypoints[:, 0] /= keypoints[:, -1] 102 | keypoints[:, 1] /= keypoints[:, -1] 103 | 104 | if annotation['ignore']: 105 | # all keypoints marked as not visible 106 | # 0 - unknown, 1 - not visible, 2 visible 107 | keypoints[:, 2] = 1 108 | else: 109 | 110 | valid_keypoints = keypoints[:, 2] > 0 111 | 112 | # 0 - unknown, 1 - not visible, 2 visible 113 | keypoints[:, 2] = 2 114 | keypoints[valid_keypoints, 2] = 2 115 | 116 | # in place 117 | transforms.apply_coords(keypoints[:, :2]) 118 | annotation["keypoints"] = keypoints.tolist() 119 | 120 | # manually apply mirror for pose 121 | for transform in transforms: 122 | 123 | # horizontal flip?
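        # If the augmentations included a horizontal flip, mirror the annotated rotation with
        # the cached reflection matrices _M1 and _M2 so the pose stays consistent with the
        # flipped image.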
124 | if isinstance(transform, T.HFlipTransform): 125 | 126 | pose = _M1 @ np.array(annotation["pose"]) @ _M2 127 | annotation["pose"] = pose.tolist() 128 | annotation["R_cam"] = pose.tolist() 129 | 130 | return annotation 131 | 132 | 133 | def annotations_to_instances(annos, image_size, unknown_categories): 134 | 135 | # init 136 | target = Instances(image_size) 137 | 138 | # add classes, 2D boxes, 3D boxes and poses 139 | target.gt_classes = torch.tensor([int(obj["category_id"]) for obj in annos], dtype=torch.int64) 140 | target.gt_boxes = Boxes([BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]) 141 | target.gt_boxes3D = torch.FloatTensor([anno['center_cam_proj'] + anno['dimensions'] + anno['center_cam'] for anno in annos]) 142 | target.gt_poses = torch.FloatTensor([anno['pose'] for anno in annos]) 143 | 144 | n = len(target.gt_classes) 145 | 146 | # do keypoints? 147 | target.gt_keypoints = Keypoints(torch.FloatTensor([anno['keypoints'] for anno in annos])) 148 | 149 | gt_unknown_category_mask = torch.zeros(max(unknown_categories)+1, dtype=bool) 150 | gt_unknown_category_mask[torch.tensor(list(unknown_categories))] = True 151 | 152 | # include available category indices as tensor with GTs 153 | target.gt_unknown_category_mask = gt_unknown_category_mask.unsqueeze(0).repeat([n, 1]) 154 | 155 | return target 156 | -------------------------------------------------------------------------------- /cubercnn/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .omni3d_evaluation import * -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .densenet import * 2 | from .mnasnet import * 3 | from .resnet import * 4 | from .shufflenet import * 5 | from .dla import * 6 | from .dino import * 7 | from .mae import * 8 | from .clip import * 9 | from .midas_final import * 10 | from .sam import * -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/clip.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import ShapeSpec 2 | from detectron2.modeling.backbone import Backbone 3 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 4 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool, FPN 5 | from detectron2.modeling.backbone.vit import SimpleFeaturePyramid 6 | import torch 7 | from torch import nn 8 | from torchvision import models 9 | import torch.nn.functional as F 10 | import einops as E 11 | import unittest 12 | import open_clip 13 | from cubercnn.modeling.backbone.dino import tokens_to_output 14 | from typing import Tuple 15 | 16 | # reference: https://github.com/mbanani/probe3d/blob/c52d00b069d949b2f00c544d4991716df68d5233/evals/models/clip.py 17 | class CLIPBackbone(Backbone): 18 | def __init__(self, cfg, input_shape, arch="ViT-B-16", checkpoint="openai", output="dense", layer=-1, return_multilayer=False, out_feature="last_feat",): 19 | super().__init__() 20 | assert output in ["dense-cls", "cls", "gap", "dense"] 21 | self.output = output 22 | # Initialize a pre-trained CLIP image encoder and freeze it. 
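        # Note: open_clip.create_model_and_transforms returns the full CLIP model; only the
        # image tower (.visual) is kept below and the remaining modules are discarded.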
23 | _clip_model, _, _ = open_clip.create_model_and_transforms( 24 | arch, pretrained=checkpoint 25 | ) 26 | _clip_model = _clip_model.to(torch.float32) 27 | self.visual = _clip_model.visual 28 | del _clip_model 29 | 30 | # Extract some attributes from CLIP module for easy access. 31 | self.patch_size = self.visual.conv1.stride[0] 32 | 33 | # get feature dimension 34 | feat_dim = self.visual.transformer.width 35 | feat_dim = feat_dim * 2 if output == "dense-cls" else feat_dim 36 | feat_dims = [feat_dim, feat_dim, feat_dim, feat_dim] 37 | 38 | # get extraction targets 39 | n_layers = len(self.visual.transformer.resblocks) 40 | multilayers = [ 41 | n_layers // 4 - 1, 42 | n_layers // 2 - 1, 43 | n_layers // 4 * 3 - 1, 44 | n_layers - 1, 45 | ] 46 | 47 | if return_multilayer: 48 | self.feat_dim = feat_dims 49 | self.multilayers = multilayers 50 | else: 51 | self.feat_dim = feat_dims 52 | layer = multilayers[-1] if layer == -1 else layer 53 | self.multilayers = [layer] 54 | 55 | # define layer name (for logging) 56 | self.layer = "-".join(str(_x) for _x in self.multilayers) 57 | 58 | self._out_feature_channels = {out_feature: feat_dim} 59 | self._out_feature_strides = {out_feature: self.patch_size} 60 | self._out_features = [out_feature] 61 | 62 | def forward(self, images): 63 | img_h, img_w = images.shape[-2:] 64 | out_hw = (img_h // self.patch_size, img_w // self.patch_size) 65 | 66 | # clip stuff 67 | x = self.visual.conv1(images) 68 | x_hw = x.shape[-2:] 69 | x = E.rearrange(x, "b c h w -> b (h w) c") 70 | 71 | # concat cls token 72 | _cls_embed = E.repeat(self.visual.class_embedding, "c -> b 1 c", b=x.shape[0]) 73 | x = torch.cat([_cls_embed.to(x.dtype), x], dim=1) 74 | 75 | # add pos embed 76 | pos_embed = resize_pos_embed(self.visual.positional_embedding, x_hw) 77 | x = self.visual.ln_pre(x + pos_embed.to(x.dtype)) 78 | 79 | embeds = [] 80 | for i, blk in enumerate(self.visual.transformer.resblocks): 81 | x = blk(x) 82 | if i in self.multilayers: 83 | embeds.append(x) 84 | if len(embeds) == len(self.multilayers): 85 | break 86 | 87 | outputs = {} 88 | for i, _x in enumerate(embeds): 89 | _x = tokens_to_output(self.output, _x[:, 1:], _x[:, 0], out_hw) 90 | outputs[self._out_features[i]] = _x 91 | return outputs 92 | 93 | def resize_pos_embed( 94 | pos_embed: torch.Tensor, hw: Tuple[int, int], has_cls_token: bool = True 95 | ): 96 | """ 97 | Resize positional embedding for arbitrary image resolution. Resizing is done 98 | via bicubic interpolation. 99 | 100 | Args: 101 | pos_embed: Positional embedding tensor of shape ``(n_patches, embed_dim)``. 102 | hw: Target height and width of the tensor after interpolation. 103 | has_cls_token: Whether ``pos_embed[0]`` is for the ``[cls]`` token. 104 | 105 | Returns: 106 | Tensor of shape ``(new_n_patches, embed_dim)`` of resized embedding. 107 | ``new_n_patches`` is ``new_height * new_width`` if ``has_cls`` is False, 108 | else ``1 + new_height * new_width``. 109 | """ 110 | 111 | n_grid = pos_embed.shape[0] - 1 if has_cls_token else pos_embed.shape[0] 112 | 113 | # Do not resize if already in same shape. 114 | if n_grid == hw[0] * hw[1]: 115 | return pos_embed 116 | 117 | # Get original position embedding and extract ``[cls]`` token. 
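    # Example: a ViT-B/16 pretrained at 224x224 stores a 14*14 = 196-token position grid
    # (plus [cls]); for a 512x512 input the grid is interpolated to 32*32 = 1024 positions.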
118 | if has_cls_token: 119 | cls_embed, pos_embed = pos_embed[[0]], pos_embed[1:] 120 | 121 | orig_dim = int(pos_embed.shape[0] ** 0.5) 122 | 123 | pos_embed = E.rearrange(pos_embed, "(h w) c -> 1 c h w", h=orig_dim) 124 | pos_embed = F.interpolate( 125 | pos_embed, hw, mode="bicubic", align_corners=False, antialias=True 126 | ) 127 | pos_embed = E.rearrange(pos_embed, "1 c h w -> (h w) c") 128 | 129 | # Add embedding of ``[cls]`` token back after resizing. 130 | if has_cls_token: 131 | pos_embed = torch.cat([cls_embed, pos_embed], dim=0) 132 | 133 | return pos_embed 134 | 135 | @BACKBONE_REGISTRY.register() 136 | def build_clip_backbone(cfg, input_shape: ShapeSpec, priors=None): 137 | arch = cfg.MODEL.CLIP.ARCH 138 | checkpoint = cfg.MODEL.CLIP.CHECKPOINT 139 | output = cfg.MODEL.CLIP.OUTPUT 140 | layer = cfg.MODEL.CLIP.LAYER 141 | return_multilayer = cfg.MODEL.CLIP.RETURN_MULTILAYER 142 | 143 | bottom_up = CLIPBackbone( 144 | cfg, 145 | input_shape, 146 | arch=arch, 147 | checkpoint=checkpoint, 148 | output=output, 149 | layer=layer, 150 | return_multilayer=return_multilayer, 151 | ) 152 | 153 | in_feature = cfg.MODEL.FPN.IN_FEATURE 154 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 155 | scale_factors = (4.0, 2.0, 1.0, 0.5) 156 | backbone = SimpleFeaturePyramid( 157 | net=bottom_up, 158 | in_feature=in_feature, 159 | out_channels=out_channels, 160 | scale_factors=scale_factors, 161 | norm=cfg.MODEL.FPN.NORM, 162 | top_block=None, 163 | square_pad=cfg.MODEL.FPN.SQUARE_PAD 164 | ) 165 | return backbone 166 | 167 | class TestCLIPBackbone(unittest.TestCase): 168 | def setUp(self): 169 | # Mock configuration 170 | self.cfg = type('', (), {})() 171 | self.cfg.MODEL = type('', (), {})() 172 | self.cfg.MODEL.CLIP = type('', (), {})() 173 | self.cfg.MODEL.CLIP.ARCH = "ViT-B-16" 174 | self.cfg.MODEL.CLIP.CHECKPOINT = "openai" 175 | self.cfg.MODEL.CLIP.OUTPUT = "dense" 176 | self.cfg.MODEL.CLIP.LAYER = -1 177 | self.cfg.MODEL.CLIP.RETURN_MULTILAYER = False 178 | self.cfg.MODEL.FPN = type('', (), {})() 179 | self.cfg.MODEL.FPN.IN_FEATURE = 'last_feat' 180 | self.cfg.MODEL.FPN.OUT_CHANNELS = 256 181 | self.cfg.MODEL.FPN.NORM = "LN" 182 | self.cfg.MODEL.FPN.FUSE_TYPE = "sum" 183 | self.cfg.MODEL.FPN.SQUARE_PAD = 512 184 | self.input_shape = ShapeSpec(channels=3, height=512, width=512) 185 | 186 | def test_clip_backbone_forward(self): 187 | # Create the backbone 188 | backbone = build_clip_backbone(self.cfg, self.input_shape) 189 | # Generate a random input tensor 190 | x = torch.randn(1, 3, 512, 512) 191 | # Run forward pass 192 | outputs = backbone(x) 193 | print(backbone.net.output_shape()) 194 | for key, output in outputs.items(): 195 | print(key, output.shape) 196 | 197 | # print(backbone.net.vit) 198 | 199 | 200 | if __name__ == "__main__": 201 | unittest.main() -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/densenet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class DenseNetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.densenet121(pretrained) 15 | base = base.features 16 | 17 | self.base = base 18 | 19 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 1024, 'p6': 1024} 20 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 21 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 22 | 23 | def forward(self, x): 24 | 25 | outputs = {} 26 | 27 | db1 = self.base[0:5](x) 28 | db2 = self.base[5:7](db1) 29 | db3 = self.base[7:9](db2) 30 | p5 = self.base[9:](db3) 31 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 32 | outputs['p2'] = db1 33 | outputs['p3'] = db2 34 | outputs['p4'] = db3 35 | outputs['p5'] = p5 36 | outputs['p6'] = p6 37 | 38 | return outputs 39 | 40 | 41 | @BACKBONE_REGISTRY.register() 42 | def build_densenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 43 | """ 44 | Args: 45 | cfg: a detectron2 CfgNode 46 | 47 | Returns: 48 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 49 | """ 50 | 51 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 52 | 53 | bottom_up = DenseNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 54 | in_features = cfg.MODEL.FPN.IN_FEATURES 55 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 56 | 57 | backbone = FPN( 58 | bottom_up=bottom_up, 59 | in_features=in_features, 60 | out_channels=out_channels, 61 | norm=cfg.MODEL.FPN.NORM, 62 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE 63 | ) 64 | return backbone -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/dino.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import ShapeSpec 2 | from detectron2.modeling.backbone import Backbone 3 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 4 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool, FPN 5 | from detectron2.modeling.backbone.vit import SimpleFeaturePyramid 6 | import torch 7 | from torch import nn 8 | from torchvision import models 9 | import torch.nn.functional as F 10 | import einops as E 11 | import unittest 12 | 13 | # reference: https://github.com/mbanani/probe3d/blob/c52d00b069d949b2f00c544d4991716df68d5233/evals/models/dino.py 14 | class DINOBackbone(Backbone): 15 | def __init__(self, cfg, input_shape, dino_name="dino", model_name="vitb16", output="dense", layer=-1, return_multilayer=False, out_feature="last_feat",): 16 | super().__init__() 17 | feat_dims = { 18 | "vitb8": 768, 19 | "vitb16": 768, 20 | "vitb14": 768, 21 | "vitb14_reg": 768, 22 | "vitl14": 1024, 23 | "vitg14": 1536, 24 | } 25 | 26 | # get model 27 | self.model_name = dino_name 28 | self.checkpoint_name = f"{dino_name}_{model_name}" 29 | dino_vit = torch.hub.load(f"facebookresearch/{dino_name}", self.checkpoint_name) 30 | self.vit = dino_vit 31 | self.has_registers = "_reg" in model_name 32 | 33 | assert output in ["cls", "gap", "dense", "dense-cls"] 34 | self.output = output 35 | self.patch_size = self.vit.patch_embed.proj.kernel_size[0] 36 | 37 | feat_dim = feat_dims[model_name] 38 
| feat_dim = feat_dim * 2 if output == "dense-cls" else feat_dim 39 | 40 | num_layers = len(self.vit.blocks) 41 | multilayers = [ 42 | num_layers // 4 - 1, 43 | num_layers // 2 - 1, 44 | num_layers // 4 * 3 - 1, 45 | num_layers - 1, 46 | ] 47 | 48 | if return_multilayer: 49 | self.feat_dim = [feat_dim, feat_dim, feat_dim, feat_dim] 50 | self.multilayers = multilayers 51 | else: 52 | self.feat_dim = feat_dim 53 | layer = multilayers[-1] if layer == -1 else layer 54 | self.multilayers = [layer] 55 | 56 | # define layer name (for logging) 57 | self.layer = "-".join(str(_x) for _x in self.multilayers) 58 | 59 | self._out_feature_channels = {out_feature: feat_dim} 60 | self._out_feature_strides = {out_feature: self.patch_size} 61 | self._out_features = [out_feature] 62 | 63 | def forward(self, images): 64 | h, w = images.shape[-2:] 65 | h, w = h // self.patch_size, w // self.patch_size 66 | 67 | if self.model_name == "dinov2": 68 | x = self.vit.prepare_tokens_with_masks(images, None) 69 | else: 70 | x = self.vit.prepare_tokens(images) 71 | 72 | embeds = [] 73 | for i, blk in enumerate(self.vit.blocks): 74 | x = blk(x) 75 | if i in self.multilayers: 76 | embeds.append(x) 77 | if len(embeds) == len(self.multilayers): 78 | break 79 | 80 | num_spatial = h * w 81 | outputs = {} 82 | for idx, x_i in enumerate(embeds): 83 | cls_tok = x_i[:, 0] 84 | spatial = x_i[:, -1 * num_spatial:] 85 | x_i = tokens_to_output(self.output, spatial, cls_tok, (h, w)) 86 | outputs[self._out_features[idx]] = x_i 87 | 88 | return outputs 89 | 90 | 91 | @BACKBONE_REGISTRY.register() 92 | def build_dino_backbone(cfg, input_shape: ShapeSpec, priors=None): 93 | dino_name = cfg.MODEL.DINO.NAME 94 | model_name = cfg.MODEL.DINO.MODEL_NAME 95 | output = cfg.MODEL.DINO.OUTPUT 96 | layer = cfg.MODEL.DINO.LAYER 97 | return_multilayer = cfg.MODEL.DINO.RETURN_MULTILAYER 98 | 99 | bottom_up = DINOBackbone( 100 | cfg, 101 | input_shape, 102 | dino_name=dino_name, 103 | model_name=model_name, 104 | output=output, 105 | layer=layer, 106 | return_multilayer=return_multilayer, 107 | ) 108 | 109 | in_feature = cfg.MODEL.FPN.IN_FEATURE 110 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 111 | scale_factors = (2.0, 1.0, 0.5) 112 | backbone = SimpleFeaturePyramid( 113 | net=bottom_up, 114 | in_feature=in_feature, 115 | out_channels=out_channels, 116 | scale_factors=scale_factors, 117 | norm=cfg.MODEL.FPN.NORM, 118 | top_block=None, 119 | square_pad=cfg.MODEL.FPN.SQUARE_PAD 120 | ) 121 | return backbone 122 | 123 | def tokens_to_output(output_type, dense_tokens, cls_token, feat_hw): 124 | if output_type == "cls": 125 | assert cls_token is not None 126 | output = cls_token 127 | elif output_type == "gap": 128 | output = dense_tokens.mean(dim=1) 129 | elif output_type == "dense": 130 | h, w = feat_hw 131 | dense_tokens = E.rearrange(dense_tokens, "b (h w) c -> b c h w", h=h, w=w) 132 | output = dense_tokens.contiguous() 133 | elif output_type == "dense-cls": 134 | assert cls_token is not None 135 | h, w = feat_hw 136 | dense_tokens = E.rearrange(dense_tokens, "b (h w) c -> b c h w", h=h, w=w) 137 | cls_token = cls_token[:, :, None, None].repeat(1, 1, h, w) 138 | output = torch.cat((dense_tokens, cls_token), dim=1).contiguous() 139 | else: 140 | raise ValueError() 141 | 142 | return output 143 | 144 | class TestDINOBackbone(unittest.TestCase): 145 | def setUp(self): 146 | # Mock configuration 147 | self.cfg = type('', (), {})() 148 | self.cfg.MODEL = type('', (), {})() 149 | self.cfg.MODEL.DINO = type('', (), {})() 150 | self.cfg.MODEL.DINO.NAME = 
"dino" 151 | self.cfg.MODEL.DINO.MODEL_NAME = "vitb16" 152 | self.cfg.MODEL.DINO.OUTPUT = "dense" 153 | self.cfg.MODEL.DINO.LAYER = -1 154 | self.cfg.MODEL.DINO.RETURN_MULTILAYER = False 155 | self.cfg.MODEL.FPN = type('', (), {})() 156 | self.cfg.MODEL.FPN.IN_FEATURE = 'last_feat' 157 | self.cfg.MODEL.FPN.OUT_CHANNELS = 256 158 | self.cfg.MODEL.FPN.NORM = "LN" 159 | self.cfg.MODEL.FPN.FUSE_TYPE = "sum" 160 | self.input_shape = ShapeSpec(channels=3, height=512, width=512) 161 | 162 | def test_dino_backbone_forward(self): 163 | # Create the backbone 164 | backbone = build_dino_backbone(self.cfg, self.input_shape) 165 | # Generate a random input tensor 166 | x = torch.randn(1, 3, 512, 512) 167 | # Run forward pass 168 | outputs = backbone(x) 169 | print(backbone.net.output_shape()) 170 | for key, output in outputs.items(): 171 | print(key, output.shape) 172 | 173 | # print(backbone.net.vit) 174 | 175 | 176 | if __name__ == "__main__": 177 | unittest.main() -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/mae.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import ShapeSpec 2 | from detectron2.modeling.backbone import Backbone 3 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 4 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool, FPN 5 | from detectron2.modeling.backbone.vit import SimpleFeaturePyramid 6 | import torch 7 | from torch import nn 8 | from torchvision import models 9 | import torch.nn.functional as F 10 | from transformers import ViTMAEForPreTraining 11 | from transformers.models.vit_mae.modeling_vit_mae import ( 12 | get_2d_sincos_pos_embed_from_grid, 13 | ) 14 | import numpy as np 15 | import einops as E 16 | import unittest 17 | from cubercnn.modeling.backbone.dino import tokens_to_output 18 | 19 | # reference: https://github.com/mbanani/probe3d/blob/c52d00b069d949b2f00c544d4991716df68d5233/evals/models/mae.py 20 | class MAEBackbone(Backbone): 21 | def __init__(self, cfg, input_shape, checkpoint="facebook/vit-mae-base", output="dense", layer=-1, return_multilayer=False, out_feature="last_feat",): 22 | super().__init__() 23 | 24 | # get model 25 | self.checkpoint_name = checkpoint.split("/")[1] 26 | self.vit = ViTMAEForPreTraining.from_pretrained(checkpoint).vit 27 | 28 | assert output in ["cls", "gap", "dense", "dense-cls"] 29 | self.output = output 30 | self.patch_size = self.vit.config.patch_size 31 | 32 | self.image_size = self.vit.embeddings.patch_embeddings.image_size 33 | self.feat_h = self.image_size[0] // self.patch_size 34 | self.feat_w = self.image_size[1] // self.patch_size 35 | 36 | feat_dim = self.vit.config.hidden_size 37 | 38 | num_layers = len(self.vit.encoder.layer) 39 | multilayers = [ 40 | num_layers // 4 - 1, 41 | num_layers // 2 - 1, 42 | num_layers // 4 * 3 - 1, 43 | num_layers - 1, 44 | ] 45 | 46 | if return_multilayer: 47 | self.feat_dim = [feat_dim, feat_dim, feat_dim, feat_dim] 48 | self.multilayers = multilayers 49 | else: 50 | self.feat_dim = feat_dim 51 | layer = multilayers[-1] if layer == -1 else layer 52 | self.multilayers = [layer] 53 | 54 | # define layer name (for logging) 55 | self.layer = "-".join(str(_x) for _x in self.multilayers) 56 | 57 | self._out_feature_channels = {out_feature: feat_dim} 58 | self._out_feature_strides = {out_feature: self.patch_size} 59 | self._out_features = [out_feature] 60 | 61 | def resize_pos_embed(self, image_size): 62 | assert image_size[0] % self.patch_size 
== 0 63 | assert image_size[1] % self.patch_size == 0 64 | self.feat_h = image_size[0] // self.patch_size 65 | self.feat_w = image_size[1] // self.patch_size 66 | embed_dim = self.vit.config.hidden_size 67 | self.vit.embeddings.patch_embeddings.image_size = image_size 68 | pos_embed = get_2d_sincos_pos_embed( 69 | embed_dim, (self.feat_h, self.feat_w), add_cls_token=True 70 | ) 71 | # there should be an easier way ... TODO 72 | device = self.vit.embeddings.patch_embeddings.projection.weight.device 73 | self.vit.embeddings.position_embeddings = nn.Parameter( 74 | torch.from_numpy(pos_embed).float().unsqueeze(0).to(device=device), 75 | requires_grad=False, 76 | ) 77 | 78 | def embed_forward(self, embedder, pixel_values): 79 | # No masking here ... 80 | batch_size, num_channels, height, width = pixel_values.shape 81 | embeddings = embedder.patch_embeddings(pixel_values) 82 | 83 | # add position embeddings w/o cls token 84 | embeddings = embeddings + embedder.position_embeddings[:, 1:, :] 85 | 86 | # append cls token 87 | cls_token = embedder.cls_token + embedder.position_embeddings[:, :1, :] 88 | cls_tokens = cls_token.expand(embeddings.shape[0], -1, -1) 89 | embeddings = torch.cat((cls_tokens, embeddings), dim=1) 90 | 91 | return embeddings 92 | 93 | def forward(self, images): 94 | # check if positional embeddings are correct 95 | if self.image_size != images.shape[-2:]: 96 | self.resize_pos_embed(images.shape[-2:]) 97 | 98 | # from MAE implementation 99 | head_mask = self.vit.get_head_mask(None, self.vit.config.num_hidden_layers) 100 | 101 | # ---- hidden ---- 102 | embedding_output = self.embed_forward(self.vit.embeddings, images) 103 | encoder_outputs = self.vit.encoder( 104 | embedding_output, 105 | head_mask=head_mask, 106 | output_attentions=self.vit.config.output_attentions, 107 | output_hidden_states=True, 108 | return_dict=self.vit.config.return_dict, 109 | ) 110 | 111 | outputs = {} 112 | for idx, layer_i in enumerate(self.multilayers): 113 | x_i = encoder_outputs.hidden_states[layer_i] 114 | x_i = tokens_to_output( 115 | self.output, x_i[:, 1:], x_i[:, 0], (self.feat_h, self.feat_w) 116 | ) 117 | outputs[self._out_features[idx]] = x_i 118 | 119 | return outputs 120 | 121 | 122 | @BACKBONE_REGISTRY.register() 123 | def build_mae_backbone(cfg, input_shape: ShapeSpec, priors=None): 124 | checkpoint = cfg.MODEL.MAE.CHECKPOINT 125 | output = cfg.MODEL.MAE.OUTPUT 126 | layer = cfg.MODEL.MAE.LAYER 127 | return_multilayer = cfg.MODEL.MAE.RETURN_MULTILAYER 128 | 129 | bottom_up = MAEBackbone( 130 | cfg, 131 | input_shape, 132 | checkpoint=checkpoint, 133 | output=output, 134 | layer=layer, 135 | return_multilayer=return_multilayer, 136 | ) 137 | 138 | in_feature = cfg.MODEL.FPN.IN_FEATURE 139 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 140 | scale_factors = (4.0, 2.0, 1.0, 0.5) 141 | backbone = SimpleFeaturePyramid( 142 | net=bottom_up, 143 | in_feature=in_feature, 144 | out_channels=out_channels, 145 | scale_factors=scale_factors, 146 | norm=cfg.MODEL.FPN.NORM, 147 | top_block=None, 148 | square_pad=cfg.MODEL.FPN.SQUARE_PAD 149 | ) 150 | return backbone 151 | 152 | def get_2d_sincos_pos_embed(embed_dim, grid_size, add_cls_token=False): 153 | """ 154 | COPIED FROM TRANSFORMERS PACKAGE AND EDITED TO ALLOW FOR DIFFERENT WIDTH-HEIGHT 155 | Create 2D sin/cos positional embeddings. 156 | 157 | Args: 158 | embed_dim (`int`): 159 | Embedding dimension. 160 | grid_size (`int`): 161 | The grid height and width. 
162 | add_cls_token (`bool`, *optional*, defaults to `False`): 163 | Whether or not to add a classification (CLS) token. 164 | 165 | Returns: 166 | (`torch.FloatTensor` of shape (grid_size*grid_size, embed_dim) or 167 | (1+grid_size*grid_size, embed_dim): the 168 | position embeddings (with or without classification token) 169 | """ 170 | grid_h = np.arange(grid_size[0], dtype=np.float32) 171 | grid_w = np.arange(grid_size[1], dtype=np.float32) 172 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 173 | grid = np.stack(grid, axis=0) 174 | 175 | grid = grid.reshape([2, 1, grid_size[0], grid_size[1]]) 176 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 177 | if add_cls_token: 178 | pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) 179 | return pos_embed 180 | 181 | class TestMAEBackbone(unittest.TestCase): 182 | def setUp(self): 183 | # Mock configuration 184 | self.cfg = type('', (), {})() 185 | self.cfg.MODEL = type('', (), {})() 186 | self.cfg.MODEL.MAE = type('', (), {})() 187 | self.cfg.MODEL.MAE.CHECKPOINT = "facebook/vit-mae-base" 188 | self.cfg.MODEL.MAE.OUTPUT = "dense" 189 | self.cfg.MODEL.MAE.LAYER = -1 190 | self.cfg.MODEL.MAE.RETURN_MULTILAYER = False 191 | self.cfg.MODEL.FPN = type('', (), {})() 192 | self.cfg.MODEL.FPN.IN_FEATURE = 'last_feat' 193 | self.cfg.MODEL.FPN.OUT_CHANNELS = 256 194 | self.cfg.MODEL.FPN.NORM = "LN" 195 | self.cfg.MODEL.FPN.FUSE_TYPE = "sum" 196 | self.cfg.MODEL.FPN.SQUARE_PAD = 1024 197 | self.input_shape = ShapeSpec(channels=3, height=1024, width=1024) 198 | 199 | def test_mae_backbone_forward(self): 200 | # Create the backbone 201 | backbone = build_mae_backbone(self.cfg, self.input_shape) 202 | # Generate a random input tensor 203 | x = torch.randn(2, 3, 1024, 1024) 204 | # Run forward pass 205 | outputs = backbone(x) 206 | print(backbone.net.output_shape()) 207 | for key, output in outputs.items(): 208 | print(key, output.shape) 209 | 210 | # print(backbone.net.vit) 211 | 212 | 213 | if __name__ == "__main__": 214 | unittest.main() -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/midas_final.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import ShapeSpec 2 | from detectron2.modeling.backbone import Backbone 3 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 4 | from detectron2.modeling.backbone.vit import SimpleFeaturePyramid 5 | import torch 6 | from torch import nn 7 | from torchvision import models 8 | import torch.nn.functional as F 9 | import numpy as np 10 | import einops as E 11 | import unittest 12 | from cubercnn.modeling.backbone.dino import tokens_to_output 13 | from cubercnn.modeling.backbone.clip import resize_pos_embed 14 | # from dino import tokens_to_output 15 | # from clip import resize_pos_embed 16 | 17 | # reference: https://github.com/mbanani/probe3d/blob/c52d00b069d949b2f00c544d4991716df68d5233/evals/models/midas_final.py 18 | class MIDASBackbone(Backbone): 19 | def __init__(self, cfg, input_shape, output="dense", layer=-1, return_multilayer=False, out_feature="last_feat",): 20 | super().__init__() 21 | 22 | # get model 23 | midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large") 24 | self.vit = midas.pretrained.model 25 | 26 | # set parameters for feature extraction 27 | self.image_size = (384, 384) 28 | self.patch_size = 16 29 | self.output = output 30 | feat_dim = 1024 31 | self.feat_dim = 1024 32 | 33 | num_layers = len(self.vit.blocks) 
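        # As in the other ViT-style backbones in this repo, candidate feature taps sit at the
        # 1/4, 1/2, 3/4 and final blocks (0-indexed), e.g. blocks 5, 11, 17 and 23 for the
        # 24-block ViT-L encoder used by DPT_Large.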
34 | multilayers = [ 35 | num_layers // 4 - 1, 36 | num_layers // 2 - 1, 37 | num_layers // 4 * 3 - 1, 38 | num_layers - 1, 39 | ] 40 | 41 | if return_multilayer: 42 | self.feat_dim = [feat_dim, feat_dim, feat_dim, feat_dim] 43 | self.multilayers = multilayers 44 | else: 45 | self.feat_dim = feat_dim 46 | layer = multilayers[-1] if layer == -1 else layer 47 | self.multilayers = [layer] 48 | 49 | # define layer name (for logging) 50 | self.layer = "-".join(str(_x) for _x in self.multilayers) 51 | 52 | self._out_feature_channels = {out_feature: feat_dim} 53 | self._out_feature_strides = {out_feature: self.patch_size} 54 | self._out_features = [out_feature] 55 | 56 | 57 | def forward(self, x): 58 | # update shapes 59 | h, w = x.shape[2:] 60 | emb_hw = (h // self.patch_size, w // self.patch_size) 61 | # assert h == w, f"BeIT can only handle square images, not ({h}, {w})." 62 | if (h, w) != self.image_size: 63 | self.image_size = (h, w) 64 | self.vit.patch_embed.img_size = (h, w) 65 | # import pdb;pdb.set_trace() 66 | self.vit.pos_embed.data = resize_pos_embed(self.vit.pos_embed[0], emb_hw, True)[None] 67 | 68 | # actual forward from beit 69 | x = self.vit.patch_embed(x) 70 | x = torch.cat((self.vit.cls_token.expand(x.shape[0], -1, -1), x), dim=1) 71 | x = x + self.vit.pos_embed 72 | 73 | x = self.vit.norm_pre(x) 74 | 75 | embeds = [] 76 | for i, blk in enumerate(self.vit.blocks): 77 | x = blk(x) 78 | if i in self.multilayers: 79 | embeds.append(x) 80 | if i == self.layer: 81 | break 82 | 83 | # map tokens to output 84 | outputs = {} 85 | for i, x_i in enumerate(embeds): 86 | x_i = tokens_to_output(self.output, x_i[:, 1:], x_i[:, 0], emb_hw) 87 | outputs[self._out_features[i]] = x_i 88 | 89 | return outputs 90 | 91 | 92 | @BACKBONE_REGISTRY.register() 93 | def build_midas_backbone(cfg, input_shape: ShapeSpec, priors=None): 94 | output = cfg.MODEL.MIDAS.OUTPUT 95 | layer = cfg.MODEL.MIDAS.LAYER 96 | return_multilayer = cfg.MODEL.MIDAS.RETURN_MULTILAYER 97 | 98 | bottom_up = MIDASBackbone( 99 | cfg, 100 | input_shape, 101 | output=output, 102 | layer=layer, 103 | return_multilayer=return_multilayer, 104 | ) 105 | 106 | in_feature = cfg.MODEL.FPN.IN_FEATURE 107 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 108 | scale_factors = (4.0, 2.0, 1.0, 0.5) 109 | backbone = SimpleFeaturePyramid( 110 | net=bottom_up, 111 | in_feature=in_feature, 112 | out_channels=out_channels, 113 | scale_factors=scale_factors, 114 | norm=cfg.MODEL.FPN.NORM, 115 | top_block=None, 116 | square_pad=cfg.MODEL.FPN.SQUARE_PAD 117 | ) 118 | return backbone 119 | 120 | 121 | class TestMIDASBackbone(unittest.TestCase): 122 | def setUp(self): 123 | # Mock configuration 124 | self.cfg = type('', (), {})() 125 | self.cfg.MODEL = type('', (), {})() 126 | self.cfg.MODEL.MIDAS = type('', (), {})() 127 | self.cfg.MODEL.MIDAS.OUTPUT = "dense" 128 | self.cfg.MODEL.MIDAS.LAYER = -1 129 | self.cfg.MODEL.MIDAS.RETURN_MULTILAYER = False 130 | self.cfg.MODEL.FPN = type('', (), {})() 131 | self.cfg.MODEL.FPN.IN_FEATURE = 'last_feat' 132 | self.cfg.MODEL.FPN.OUT_CHANNELS = 256 133 | self.cfg.MODEL.FPN.NORM = "LN" 134 | self.cfg.MODEL.FPN.FUSE_TYPE = "sum" 135 | self.cfg.MODEL.FPN.SQUARE_PAD = 1024 136 | self.input_shape = ShapeSpec(channels=3, height=1024, width=1024) 137 | 138 | def test_midas_backbone_forward(self): 139 | # Create the backbone 140 | backbone = build_midas_backbone(self.cfg, self.input_shape) 141 | # Generate a random input tensor 142 | x = torch.randn(2, 3, 1024, 1024) 143 | # Run forward pass 144 | outputs = backbone(x) 145 | 
print(backbone.net.output_shape()) 146 | for key, output in outputs.items(): 147 | print(key, output.shape) 148 | 149 | # print(backbone.net.vit) 150 | 151 | 152 | if __name__ == "__main__": 153 | unittest.main() -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/mnasnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class MNASNetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.mnasnet1_0(pretrained) 15 | base = base.layers 16 | 17 | self.base = base 18 | 19 | self._out_feature_channels = {'p2': 24, 'p3': 40, 'p4': 96, 'p5': 320, 'p6': 320} 20 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 21 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 22 | 23 | def forward(self, x): 24 | 25 | outputs = {} 26 | 27 | p2 = self.base[0:9](x) 28 | p3 = self.base[9](p2) 29 | p4 = self.base[10:12](p3) 30 | p5 = self.base[12:14](p4) 31 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 32 | outputs['p2'] = p2 33 | outputs['p3'] = p3 34 | outputs['p4'] = p4 35 | outputs['p5'] = p5 36 | outputs['p6'] = p6 37 | 38 | return outputs 39 | 40 | @BACKBONE_REGISTRY.register() 41 | def build_mnasnet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 42 | """ 43 | Args: 44 | cfg: a detectron2 CfgNode 45 | 46 | Returns: 47 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 48 | """ 49 | 50 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 51 | 52 | bottom_up = MNASNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 53 | in_features = cfg.MODEL.FPN.IN_FEATURES 54 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 55 | 56 | backbone = FPN( 57 | bottom_up=bottom_up, 58 | in_features=in_features, 59 | out_channels=out_channels, 60 | norm=cfg.MODEL.FPN.NORM, 61 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 62 | ) 63 | return backbone 64 | -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/resnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 6 | from detectron2.modeling.backbone.resnet import build_resnet_backbone 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | import torch.nn.functional as F 9 | 10 | from detectron2.modeling.backbone.fpn import FPN 11 | 12 | class ResNet(Backbone): 13 | def __init__(self, cfg, input_shape, pretrained=True): 14 | super().__init__() 15 | 16 | if cfg.MODEL.RESNETS.DEPTH == 18: 17 | base = models.resnet18(pretrained) 18 | self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} 19 | elif cfg.MODEL.RESNETS.DEPTH == 34: 20 | base = models.resnet34(pretrained) 21 | self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} 22 | elif cfg.MODEL.RESNETS.DEPTH == 50: 23 | base = models.resnet50(pretrained) 24 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048} 25 | elif cfg.MODEL.RESNETS.DEPTH == 101: 26 | base = models.resnet101(pretrained) 27 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048} 28 | else: 29 | raise ValueError('No configuration currently supporting depth of {}'.format(cfg.MODEL.RESNETS.DEPTH)) 30 | 31 | self.conv1 = base.conv1 32 | self.bn1 = base.bn1 33 | self.relu = base.relu 34 | self.maxpool = base.maxpool 35 | self.layer1 = base.layer1 36 | self.layer2 = base.layer2 37 | self.layer3 = base.layer3 38 | self.layer4 = base.layer4 39 | 40 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 41 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 42 | 43 | def forward(self, x): 44 | 45 | outputs = {} 46 | 47 | x = self.conv1(x) 48 | x = self.bn1(x) 49 | x = self.relu(x) 50 | x = self.maxpool(x) 51 | p2 = self.layer1(x) 52 | p3 = self.layer2(p2) 53 | p4 = self.layer3(p3) 54 | p5 = self.layer4(p4) 55 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 56 | 57 | outputs['p2'] = p2 58 | outputs['p3'] = p3 59 | outputs['p4'] = p4 60 | outputs['p5'] = p5 61 | outputs['p6'] = p6 62 | 63 | return outputs 64 | 65 | 66 | @BACKBONE_REGISTRY.register() 67 | def build_resnet_from_vision_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 68 | """ 69 | Args: 70 | cfg: a detectron2 CfgNode 71 | 72 | Returns: 73 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 74 | """ 75 | 76 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 77 | 78 | if cfg.MODEL.RESNETS.TORCHVISION: 79 | bottom_up = ResNet(cfg, input_shape, pretrained=imagenet_pretrain) 80 | 81 | else: 82 | # use the MSRA modeling logic to build the backbone. 
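        # i.e. detectron2's native ResNet implementation, configured via the cfg.MODEL.RESNETS options.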
83 | bottom_up = build_resnet_backbone(cfg, input_shape) 84 | 85 | in_features = cfg.MODEL.FPN.IN_FEATURES 86 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 87 | 88 | backbone = FPN( 89 | bottom_up=bottom_up, 90 | in_features=in_features, 91 | out_channels=out_channels, 92 | norm=cfg.MODEL.FPN.NORM, 93 | top_block=LastLevelMaxPool(), 94 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 95 | ) 96 | return backbone 97 | -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/sam.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import ShapeSpec 2 | from detectron2.modeling.backbone import Backbone 3 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 4 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool, FPN 5 | from detectron2.modeling.backbone.vit import SimpleFeaturePyramid 6 | import torch 7 | from torch import nn 8 | from torchvision import models 9 | import torch.nn.functional as F 10 | 11 | from pathlib import Path 12 | from urllib.request import urlretrieve 13 | from segment_anything import sam_model_registry 14 | import numpy as np 15 | import einops as E 16 | import unittest 17 | 18 | # reference: https://github.com/mbanani/probe3d/blob/c52d00b069d949b2f00c544d4991716df68d5233/evals/models/sam.py 19 | class SAMBackbone(Backbone): 20 | def __init__(self, cfg, input_shape, checkpoint="facebook/vit-mae-base", output="dense", layer=-1, return_multilayer=False, out_feature="last_feat",): 21 | super().__init__() 22 | 23 | assert output in ["cls", "gap", "dense", "dense-cls"] 24 | self.output = output 25 | 26 | # get model 27 | ckpt_file = "sam_vit_b_01ec64.pth" 28 | ckpt_path = Path("checkpoints") / ckpt_file 29 | 30 | ckpt_path.parent.mkdir(parents=True, exist_ok=True) 31 | 32 | if not ckpt_path.exists(): 33 | download_path = ( 34 | f"https://dl.fbaipublicfiles.com/segment_anything/{ckpt_file}" 35 | ) 36 | urlretrieve(download_path, ckpt_path) 37 | 38 | sam = sam_model_registry['vit_b'](checkpoint=ckpt_path) 39 | vit = sam.image_encoder 40 | 41 | feat_dim = vit.neck[0].in_channels 42 | emb_h, emb_w = vit.pos_embed.shape[1:3] 43 | self.patch_size = vit.patch_embed.proj.kernel_size[0] 44 | self.image_size = (emb_h * self.patch_size, emb_w * self.patch_size) 45 | assert self.patch_size == 16 46 | 47 | self.vit = vit 48 | 49 | 50 | num_layers = len(self.vit.blocks) 51 | multilayers = [ 52 | num_layers // 4 - 1, 53 | num_layers // 2 - 1, 54 | num_layers // 4 * 3 - 1, 55 | num_layers - 1, 56 | ] 57 | 58 | if return_multilayer: 59 | self.feat_dim = [feat_dim, feat_dim, feat_dim, feat_dim] 60 | self.multilayers = multilayers 61 | else: 62 | self.feat_dim = feat_dim 63 | layer = multilayers[-1] if layer == -1 else layer 64 | self.multilayers = [layer] 65 | 66 | # define layer name (for logging) 67 | self.layer = "-".join(str(_x) for _x in self.multilayers) 68 | 69 | self._out_feature_channels = {out_feature: feat_dim} 70 | self._out_feature_strides = {out_feature: self.patch_size} 71 | self._out_features = [out_feature] 72 | 73 | def resize_pos_embed(self, image_size): 74 | # get embed size 75 | h, w = image_size 76 | h = h // self.patch_size 77 | w = w // self.patch_size 78 | 79 | # resize embed 80 | pos_embed = self.vit.pos_embed.data.permute(0, 3, 1, 2) 81 | pos_embed = torch.nn.functional.interpolate( 82 | pos_embed, size=(h, w), mode="bicubic" 83 | ) 84 | pos_embed = pos_embed.permute(0, 2, 3, 1) 85 | self.vit.pos_embed.data = pos_embed 86 | self.image_size = 
image_size 87 | 88 | def forward(self, x): 89 | _, _, h, w = x.shape 90 | assert h % self.patch_size == 0 and w % self.patch_size == 0, f"{h}, {w}" 91 | 92 | if h != self.image_size[0] or w != self.image_size[1]: 93 | self.resize_pos_embed(image_size=(h, w)) 94 | 95 | # run vit 96 | x = self.vit.patch_embed(x) 97 | if self.vit.pos_embed is not None: 98 | x = x + self.vit.pos_embed 99 | 100 | embeds = [] 101 | for i, blk in enumerate(self.vit.blocks): 102 | x = blk(x) 103 | if i in self.multilayers: 104 | embeds.append(x) 105 | if len(embeds) == len(self.multilayers): 106 | break 107 | 108 | # feat shape is batch x feat_dim x height x width 109 | embeds = [_emb.permute(0, 3, 1, 2).contiguous() for _emb in embeds] 110 | outputs = {self._out_features[i]: embeds[i] for i in range(len(self.multilayers))} 111 | return outputs 112 | 113 | 114 | @BACKBONE_REGISTRY.register() 115 | def build_sam_backbone(cfg, input_shape: ShapeSpec, priors=None): 116 | output = cfg.MODEL.SAM.OUTPUT 117 | layer = cfg.MODEL.SAM.LAYER 118 | return_multilayer = cfg.MODEL.SAM.RETURN_MULTILAYER 119 | 120 | bottom_up = SAMBackbone( 121 | cfg, 122 | input_shape, 123 | output=output, 124 | layer=layer, 125 | return_multilayer=return_multilayer, 126 | ) 127 | 128 | in_feature = cfg.MODEL.FPN.IN_FEATURE 129 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 130 | scale_factors = (4.0, 2.0, 1.0, 0.5) 131 | backbone = SimpleFeaturePyramid( 132 | net=bottom_up, 133 | in_feature=in_feature, 134 | out_channels=out_channels, 135 | scale_factors=scale_factors, 136 | norm=cfg.MODEL.FPN.NORM, 137 | top_block=None, 138 | square_pad=cfg.MODEL.FPN.SQUARE_PAD 139 | ) 140 | return backbone 141 | 142 | 143 | class TestSAMBackbone(unittest.TestCase): 144 | def setUp(self): 145 | # Mock configuration 146 | self.cfg = type('', (), {})() 147 | self.cfg.MODEL = type('', (), {})() 148 | self.cfg.MODEL.SAM = type('', (), {})() 149 | self.cfg.MODEL.SAM.OUTPUT = "dense" 150 | self.cfg.MODEL.SAM.LAYER = -1 151 | self.cfg.MODEL.SAM.RETURN_MULTILAYER = False 152 | self.cfg.MODEL.FPN = type('', (), {})() 153 | self.cfg.MODEL.FPN.IN_FEATURE = 'last_feat' 154 | self.cfg.MODEL.FPN.OUT_CHANNELS = 256 155 | self.cfg.MODEL.FPN.NORM = "LN" 156 | self.cfg.MODEL.FPN.FUSE_TYPE = "sum" 157 | self.cfg.MODEL.FPN.SQUARE_PAD = 1024 158 | self.input_shape = ShapeSpec(channels=3, height=1024, width=1024) 159 | 160 | def test_sam_backbone_forward(self): 161 | # Create the backbone 162 | backbone = build_sam_backbone(self.cfg, self.input_shape) 163 | # Generate a random input tensor 164 | x = torch.randn(2, 3, 1024, 1024) 165 | # Run forward pass 166 | outputs = backbone(x) 167 | print(backbone.net.output_shape()) 168 | for key, output in outputs.items(): 169 | print(key, output.shape) 170 | 171 | # print(backbone.net.vit) 172 | 173 | 174 | if __name__ == "__main__": 175 | unittest.main() -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/shufflenet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class ShufflenetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.shufflenet_v2_x1_0(pretrained) 15 | self.conv1 = base.conv1 16 | self.maxpool = base.maxpool 17 | self.stage2 = base.stage2 18 | self.stage3 = base.stage3 19 | self.stage4 = base.stage4 20 | self.conv5 = base.conv5 21 | 22 | self._out_feature_channels = {'p2': 24, 'p3': 116, 'p4': 232, 'p5': 464, 'p6': 464} 23 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 24 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 25 | 26 | def forward(self, x): 27 | 28 | outputs = {} 29 | 30 | x = self.conv1(x) 31 | p2 = self.maxpool(x) 32 | p3 = self.stage2(p2) 33 | p4 = self.stage3(p3) 34 | p5 = self.stage4(p4) 35 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 36 | 37 | outputs['p2'] = p2 38 | outputs['p3'] = p3 39 | outputs['p4'] = p4 40 | outputs['p5'] = p5 41 | outputs['p6'] = p6 42 | 43 | return outputs 44 | 45 | 46 | @BACKBONE_REGISTRY.register() 47 | def build_shufflenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 48 | """ 49 | Args: 50 | cfg: a detectron2 CfgNode 51 | 52 | Returns: 53 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 54 | """ 55 | 56 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 57 | 58 | bottom_up = ShufflenetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 59 | in_features = cfg.MODEL.FPN.IN_FEATURES 60 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 61 | 62 | backbone = FPN( 63 | bottom_up=bottom_up, 64 | in_features=in_features, 65 | out_channels=out_channels, 66 | norm=cfg.MODEL.FPN.NORM, 67 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 68 | ) 69 | return backbone 70 | -------------------------------------------------------------------------------- /cubercnn/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | from .rcnn3d import * -------------------------------------------------------------------------------- /cubercnn/modeling/meta_arch/rcnn3d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from typing import Dict, List, Optional 3 | import torch 4 | import numpy as np 5 | from detectron2.layers import ShapeSpec, batched_nms 6 | from detectron2.utils.visualizer import Visualizer 7 | from detectron2.data.detection_utils import convert_image_to_rgb 8 | from detectron2.structures import Instances 9 | from detectron2.utils.events import get_event_storage 10 | from detectron2.data import MetadataCatalog 11 | 12 | from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY 13 | from detectron2.modeling.proposal_generator import build_proposal_generator 14 | from detectron2.utils.logger import _log_api_usage 15 | from detectron2.modeling.meta_arch import ( 16 | META_ARCH_REGISTRY, GeneralizedRCNN 17 | ) 18 | from cubercnn.modeling.roi_heads import build_roi_heads 19 | 20 | from detectron2.data import MetadataCatalog 21 | from pytorch3d.transforms import rotation_6d_to_matrix 22 | from cubercnn.modeling.roi_heads import build_roi_heads 23 | from cubercnn import util, vis 24 | 25 | @META_ARCH_REGISTRY.register() 26 | class RCNN3D(GeneralizedRCNN): 27 | 28 | @classmethod 29 | def from_config(cls, cfg, priors=None): 30 | backbone = build_backbone(cfg, priors=priors) 31 | return { 32 | "backbone": backbone, 33 | "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), 34 | "roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors), 35 | "input_format": cfg.INPUT.FORMAT, 36 | "vis_period": cfg.VIS_PERIOD, 37 | "pixel_mean": cfg.MODEL.PIXEL_MEAN, 38 | "pixel_std": cfg.MODEL.PIXEL_STD, 39 | } 40 | 41 | def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): 42 | 43 | if not self.training: 44 | return self.inference(batched_inputs) 45 | 46 | images = self.preprocess_image(batched_inputs) 47 | 48 | # scaling factor for the sample relative to its original scale 49 | # e.g., how much has the image been upsampled by? or downsampled? 50 | im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)] 51 | 52 | # The unmodified intrinsics for the image 53 | Ks = [torch.FloatTensor(info['K']) for info in batched_inputs] 54 | 55 | if "instances" in batched_inputs[0]: 56 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 57 | else: 58 | gt_instances = None 59 | 60 | features = self.backbone(images.tensor) 61 | proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) 62 | 63 | instances, detector_losses = self.roi_heads( 64 | images, features, proposals, 65 | Ks, im_scales_ratio, 66 | gt_instances 67 | ) 68 | 69 | if self.vis_period > 0: 70 | storage = get_event_storage() 71 | if storage.iter % self.vis_period == 0 and storage.iter > 0: 72 | self.visualize_training(batched_inputs, proposals, instances) 73 | 74 | losses = {} 75 | losses.update(detector_losses) 76 | losses.update(proposal_losses) 77 | return losses 78 | 79 | def inference( 80 | self, 81 | batched_inputs: List[Dict[str, torch.Tensor]], 82 | detected_instances: Optional[List[Instances]] = None, 83 | do_postprocess: bool = True, 84 | ): 85 | assert not self.training 86 | 87 | images = self.preprocess_image(batched_inputs) 88 | 89 | # scaling factor for the sample relative to its original scale 90 | # e.g., how much has the image been upsampled by? or downsampled? 
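        # Illustrative (hypothetical) numbers for the ratio computed on the next line:
        # if an image annotated at 480 pixels tall was resized by the dataloader to
        # 800 pixels tall, then info['height'] / im.shape[1] -> 480 / 800 = 0.6.
        # This per-image ratio is passed to the ROI heads together with the
        # unmodified intrinsics Ks so predictions can be mapped back to the
        # original resolution.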
91 | im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)] 92 | 93 | # The unmodified intrinsics for the image 94 | Ks = [torch.FloatTensor(info['K']) for info in batched_inputs] 95 | 96 | features = self.backbone(images.tensor) 97 | 98 | # Pass oracle 2D boxes into the RoI heads 99 | if type(batched_inputs == list) and np.any(['oracle2D' in b for b in batched_inputs]): 100 | oracles = [b['oracle2D'] for b in batched_inputs] 101 | results, _ = self.roi_heads(images, features, oracles, Ks, im_scales_ratio, None) 102 | 103 | # normal inference 104 | else: 105 | proposals, _ = self.proposal_generator(images, features, None) 106 | if np.any(['category_list' in b for b in batched_inputs]): 107 | # Gronding DINO inference is only supported to one image at one batch 108 | results, _ = self.roi_heads(images, features, proposals, Ks, im_scales_ratio, None, category_list=batched_inputs[0]["category_list"]) 109 | else: 110 | results, _ = self.roi_heads(images, features, proposals, Ks, im_scales_ratio, None) 111 | 112 | if do_postprocess: 113 | assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess." 114 | return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes) 115 | else: 116 | return results 117 | 118 | def visualize_training(self, batched_inputs, proposals, instances): 119 | """ 120 | A function used to visualize images and proposals. It shows ground truth 121 | bounding boxes on the original image and up to 20 top-scoring predicted 122 | object proposals on the original image. Users can implement different 123 | visualization functions for different models. 124 | Args: 125 | batched_inputs (list): a list that contains input to the model. 126 | proposals (list): a list that contains predicted proposals. Both 127 | batched_inputs and proposals should have the same length. 128 | instances (list): a list that contains predicted RoIhead instances. Both 129 | batched_inputs and proposals should have the same length. 
130 | """ 131 | 132 | storage = get_event_storage() 133 | 134 | # minimum number of boxes to try to visualize per image 135 | max_vis_prop = 20 136 | 137 | if not hasattr(self, 'thing_classes'): 138 | self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes 139 | self.num_classes = len(self.thing_classes) 140 | 141 | for input, prop, instances_i in zip(batched_inputs, proposals, instances): 142 | 143 | img = input["image"] 144 | img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format) 145 | img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR 146 | img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR 147 | 148 | ''' 149 | Visualize the 2D GT and proposal predictions 150 | ''' 151 | v_gt = Visualizer(img, None) 152 | v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes) 153 | anno_img = v_gt.get_image() 154 | box_size = min(len(prop.proposal_boxes), max_vis_prop) 155 | v_pred = Visualizer(img, None) 156 | v_pred = v_pred.overlay_instances( 157 | boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy() 158 | ) 159 | prop_img = v_pred.get_image() 160 | vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1) 161 | vis_img_rpn = vis_img_rpn.transpose(2, 0, 1) 162 | storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn) 163 | 164 | ''' 165 | Visualize the 3D GT and predictions 166 | ''' 167 | K = torch.tensor(input['K'], device=self.device) 168 | scale = input['height']/img.shape[0] 169 | fx, sx = (val.item()/scale for val in K[0, [0, 2]]) 170 | fy, sy = (val.item()/scale for val in K[1, [1, 2]]) 171 | 172 | K_scaled = torch.tensor( 173 | [[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]], 174 | dtype=torch.float32, device=self.device 175 | ) @ K 176 | 177 | gts_per_image = input["instances"] 178 | 179 | gt_classes = gts_per_image.gt_classes 180 | 181 | # Filter out irrelevant groundtruth 182 | fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes) 183 | 184 | gt_classes = gt_classes[fg_selection_mask] 185 | gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes] 186 | gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes 187 | gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses 188 | 189 | # projected 2D center, depth, w, h, l, 3D center 190 | gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask] 191 | 192 | # this box may have been mirrored and scaled so 193 | # we need to recompute XYZ in 3D by backprojecting. 194 | gt_z = gt_boxes3D[:, 2] 195 | 196 | gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx 197 | gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy 198 | 199 | # put together the GT boxes 200 | gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T 201 | gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1) 202 | 203 | gt_colors = torch.tensor( 204 | [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))], 205 | device=self.device 206 | )/255.0 207 | 208 | gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors) 209 | 210 | # perform a simple NMS, which is not cls dependent. 
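            # Passing an all-zero index tensor below makes batched_nms act as plain,
            # class-agnostic NMS, since batched_nms only suppresses boxes that share
            # the same index. A rough equivalent sketch, assuming torchvision is available:
            #   from torchvision.ops import nms
            #   keep = nms(instances_i.pred_boxes.tensor, instances_i.scores,
            #              self.roi_heads.box_predictor.test_nms_thresh)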
211 | keep = batched_nms( 212 | instances_i.pred_boxes.tensor, 213 | instances_i.scores, 214 | torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device), 215 | self.roi_heads.box_predictor.test_nms_thresh 216 | ) 217 | 218 | keep = keep[:max_vis_prop] 219 | num_to_visualize = len(keep) 220 | 221 | pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1) 222 | pred_pose = instances_i.pred_pose[keep] 223 | 224 | pred_colors = torch.tensor( 225 | [util.get_color(i) for i in range(num_to_visualize)], 226 | device=self.device 227 | )/255.0 228 | 229 | pred_boxes = instances_i.pred_boxes[keep] 230 | pred_scores = instances_i.scores[keep] 231 | pred_classes = instances_i.pred_classes[keep] 232 | pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)] 233 | pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors) 234 | 235 | # convert to lists 236 | pred_meshes = [pred_meshes.__getitem__(i).detach() for i in range(len(pred_meshes))] 237 | gt_meshes = [gt_meshes.__getitem__(i) for i in range(len(gt_meshes))] 238 | 239 | img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85) 240 | img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85) 241 | 242 | # horizontal stack 3D GT and pred left/right 243 | vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1) 244 | vis_img_3d = vis_img_3d[:, :, [2, 1, 0]] # RGB 245 | vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1) 246 | 247 | storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d) 248 | 249 | break # only visualize one image in a batch 250 | 251 | def build_model(cfg, priors=None): 252 | """ 253 | Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``. 254 | Note that it does not load any weights from ``cfg``. 255 | """ 256 | meta_arch = cfg.MODEL.META_ARCHITECTURE 257 | model = META_ARCH_REGISTRY.get(meta_arch)(cfg, priors=priors) 258 | model.to(torch.device(cfg.MODEL.DEVICE)) 259 | _log_api_usage("modeling.meta_arch." + meta_arch) 260 | return model 261 | 262 | 263 | def build_backbone(cfg, input_shape=None, priors=None): 264 | """ 265 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
266 | 267 | Returns: 268 | an instance of :class:`Backbone` 269 | """ 270 | if input_shape is None: 271 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 272 | 273 | backbone_name = cfg.MODEL.BACKBONE.NAME 274 | backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape, priors) 275 | assert isinstance(backbone, Backbone) 276 | return backbone -------------------------------------------------------------------------------- /cubercnn/modeling/proposal_generator/__init__.py: -------------------------------------------------------------------------------- 1 | from .rpn import * 2 | -------------------------------------------------------------------------------- /cubercnn/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .roi_heads import * 2 | from .roi_heads_gdino import * -------------------------------------------------------------------------------- /cubercnn/modeling/roi_heads/cube_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.utils.registry import Registry 3 | from typing import Dict, List, Optional 4 | from detectron2.layers import ShapeSpec 5 | from torch import nn 6 | import torch 7 | import numpy as np 8 | import fvcore.nn.weight_init as weight_init 9 | 10 | from pytorch3d.transforms.rotation_conversions import _copysign 11 | from pytorch3d.transforms import ( 12 | rotation_6d_to_matrix, 13 | euler_angles_to_matrix, 14 | quaternion_to_matrix 15 | ) 16 | 17 | 18 | ROI_CUBE_HEAD_REGISTRY = Registry("ROI_CUBE_HEAD") 19 | 20 | @ROI_CUBE_HEAD_REGISTRY.register() 21 | class CubeHead(nn.Module): 22 | 23 | def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]): 24 | super().__init__() 25 | 26 | #------------------------------------------- 27 | # Settings 28 | #------------------------------------------- 29 | self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES 30 | self.use_conf = cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE 31 | self.z_type = cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE 32 | self.pose_type = cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE 33 | self.cluster_bins = cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS 34 | self.shared_fc = cfg.MODEL.ROI_CUBE_HEAD.SHARED_FC 35 | self.use_prior = cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED 36 | #------------------------------------------- 37 | # Feature generator 38 | #------------------------------------------- 39 | 40 | num_conv = cfg.MODEL.ROI_CUBE_HEAD.NUM_CONV 41 | conv_dim = cfg.MODEL.ROI_CUBE_HEAD.CONV_DIM 42 | num_fc = cfg.MODEL.ROI_CUBE_HEAD.NUM_FC 43 | fc_dim = cfg.MODEL.ROI_CUBE_HEAD.FC_DIM 44 | 45 | conv_dims = [conv_dim] * num_conv 46 | fc_dims = [fc_dim] * num_fc 47 | 48 | assert len(conv_dims) + len(fc_dims) > 0 49 | 50 | self._output_size = (input_shape.channels, input_shape.height, input_shape.width) 51 | 52 | if self.shared_fc: 53 | self.feature_generator = nn.Sequential() 54 | else: 55 | self.feature_generator_XY = nn.Sequential() 56 | self.feature_generator_dims = nn.Sequential() 57 | self.feature_generator_pose = nn.Sequential() 58 | self.feature_generator_Z = nn.Sequential() 59 | 60 | if self.use_conf: 61 | self.feature_generator_conf = nn.Sequential() 62 | 63 | # create fully connected layers for Cube Head 64 | for k, fc_dim in enumerate(fc_dims): 65 | 66 | fc_dim_in = int(np.prod(self._output_size)) 67 | 68 | self._output_size = fc_dim 69 | 70 | if self.shared_fc: 71 | fc = nn.Linear(fc_dim_in, fc_dim) 72 | weight_init.c2_xavier_fill(fc) 73 | 
self.feature_generator.add_module("fc{}".format(k + 1), fc) 74 | self.feature_generator.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 75 | 76 | else: 77 | 78 | fc = nn.Linear(fc_dim_in, fc_dim) 79 | weight_init.c2_xavier_fill(fc) 80 | self.feature_generator_dims.add_module("fc{}".format(k + 1), fc) 81 | self.feature_generator_dims.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 82 | 83 | fc = nn.Linear(fc_dim_in, fc_dim) 84 | weight_init.c2_xavier_fill(fc) 85 | self.feature_generator_XY.add_module("fc{}".format(k + 1), fc) 86 | self.feature_generator_XY.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 87 | 88 | fc = nn.Linear(fc_dim_in, fc_dim) 89 | weight_init.c2_xavier_fill(fc) 90 | self.feature_generator_pose.add_module("fc{}".format(k + 1), fc) 91 | self.feature_generator_pose.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 92 | 93 | fc = nn.Linear(fc_dim_in, fc_dim) 94 | weight_init.c2_xavier_fill(fc) 95 | self.feature_generator_Z.add_module("fc{}".format(k + 1), fc) 96 | self.feature_generator_Z.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 97 | 98 | if self.use_conf: 99 | fc = nn.Linear(fc_dim_in, fc_dim) 100 | weight_init.c2_xavier_fill(fc) 101 | self.feature_generator_conf.add_module("fc{}".format(k + 1), fc) 102 | self.feature_generator_conf.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 103 | 104 | #------------------------------------------- 105 | # 3D outputs 106 | #------------------------------------------- 107 | output_multiple_factor = self.num_classes if self.use_prior else 1 108 | # Dimensions in meters (width, height, length) 109 | self.bbox_3D_dims = nn.Linear(self._output_size, output_multiple_factor*3) 110 | nn.init.normal_(self.bbox_3D_dims.weight, std=0.001) 111 | nn.init.constant_(self.bbox_3D_dims.bias, 0) 112 | 113 | cluster_bins = self.cluster_bins if self.cluster_bins > 1 else 1 114 | 115 | # XY 116 | self.bbox_3D_center_deltas = nn.Linear(self._output_size, output_multiple_factor*2) 117 | nn.init.normal_(self.bbox_3D_center_deltas.weight, std=0.001) 118 | nn.init.constant_(self.bbox_3D_center_deltas.bias, 0) 119 | 120 | # Pose 121 | if self.pose_type == '6d': 122 | self.bbox_3D_pose = nn.Linear(self._output_size, output_multiple_factor*6) 123 | 124 | elif self.pose_type == 'quaternion': 125 | self.bbox_3D_pose = nn.Linear(self._output_size, output_multiple_factor*4) 126 | 127 | elif self.pose_type == 'euler': 128 | self.bbox_3D_pose = nn.Linear(self._output_size, output_multiple_factor*3) 129 | 130 | else: 131 | raise ValueError('Cuboid pose type {} is not recognized'.format(self.pose_type)) 132 | 133 | nn.init.normal_(self.bbox_3D_pose.weight, std=0.001) 134 | nn.init.constant_(self.bbox_3D_pose.bias, 0) 135 | 136 | # Z 137 | self.bbox_3D_center_depth = nn.Linear(self._output_size, output_multiple_factor*cluster_bins) 138 | nn.init.normal_(self.bbox_3D_center_depth.weight, std=0.001) 139 | nn.init.constant_(self.bbox_3D_center_depth.bias, 0) 140 | 141 | # Optionally, box confidence 142 | if self.use_conf: 143 | self.bbox_3D_uncertainty = nn.Linear(self._output_size, output_multiple_factor*1) 144 | nn.init.normal_(self.bbox_3D_uncertainty.weight, std=0.001) 145 | nn.init.constant_(self.bbox_3D_uncertainty.bias, 5) 146 | 147 | 148 | def forward(self, x, num_boxes_per_image: Optional[List[int]] = None): 149 | 150 | n = x.shape[0] 151 | 152 | box_z = None 153 | box_uncert = None 154 | box_2d_deltas = None 155 | 156 | if self.shared_fc: 157 | features = self.feature_generator(x) 158 | box_2d_deltas = self.bbox_3D_center_deltas(features) 159 | box_dims = 
self.bbox_3D_dims(features) 160 | box_pose = self.bbox_3D_pose(features) 161 | box_z = self.bbox_3D_center_depth(features) 162 | 163 | if self.use_conf: 164 | box_uncert = self.bbox_3D_uncertainty(features).clip(0.01) 165 | else: 166 | 167 | box_2d_deltas = self.bbox_3D_center_deltas(self.feature_generator_XY(x)) 168 | box_dims = self.bbox_3D_dims(self.feature_generator_dims(x)) 169 | box_pose = self.bbox_3D_pose(self.feature_generator_pose(x)) 170 | box_z = self.bbox_3D_center_depth(self.feature_generator_Z(x)) 171 | 172 | if self.use_conf: 173 | box_uncert = self.bbox_3D_uncertainty(self.feature_generator_conf(x)).clip(0.01) 174 | 175 | # Pose 176 | if self.pose_type == '6d': 177 | box_pose = rotation_6d_to_matrix(box_pose.view(-1, 6)) 178 | 179 | elif self.pose_type == 'quaternion': 180 | quats = box_pose.view(-1, 4) 181 | quats_scales = (quats * quats).sum(1) 182 | quats = quats / _copysign(torch.sqrt(quats_scales), quats[:, 0])[:, None] 183 | box_pose = quaternion_to_matrix(quats) 184 | 185 | elif self.pose_type == 'euler': 186 | box_pose = euler_angles_to_matrix(box_pose.view(-1, 3), 'XYZ') 187 | if self.use_prior: 188 | box_2d_deltas = box_2d_deltas.view(n, self.num_classes, 2) 189 | box_dims = box_dims.view(n, self.num_classes, 3) 190 | box_pose = box_pose.view(n, self.num_classes, 3, 3) 191 | 192 | if self.cluster_bins > 1: 193 | if self.use_prior: 194 | box_z = box_z.view(n, self.cluster_bins, self.num_classes, -1) 195 | else: 196 | box_z = box_z.view(n, self.cluster_bins, -1) 197 | 198 | else: 199 | if self.use_prior: 200 | box_z = box_z.view(n, self.num_classes, -1) 201 | else: 202 | box_z = box_z.view(n, -1) 203 | 204 | return box_2d_deltas, box_z, box_dims, box_pose, box_uncert 205 | 206 | 207 | def build_cube_head(cfg, input_shape: Dict[str, ShapeSpec]): 208 | name = cfg.MODEL.ROI_CUBE_HEAD.NAME 209 | return ROI_CUBE_HEAD_REGISTRY.get(name)(cfg, input_shape) -------------------------------------------------------------------------------- /cubercnn/modeling/roi_heads/fast_rcnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from re import L 3 | import torch 4 | from torch.nn import functional as F 5 | from typing import List, Tuple 6 | 7 | from fvcore.nn import giou_loss, smooth_l1_loss 8 | from detectron2.utils.events import get_event_storage 9 | from detectron2.layers import cat, cross_entropy, nonzero_tuple, batched_nms 10 | from detectron2.structures import Instances, Boxes 11 | from detectron2.modeling.roi_heads.fast_rcnn import ( 12 | FastRCNNOutputLayers, _log_classification_stats 13 | ) 14 | from cubercnn.modeling.proposal_generator.rpn import matched_pairwise_iou 15 | 16 | def fast_rcnn_inference( 17 | boxes: List[torch.Tensor], 18 | scores: List[torch.Tensor], 19 | image_shapes: List[Tuple[int, int]], 20 | score_thresh: float, 21 | nms_thresh: float, 22 | topk_per_image: int, 23 | ): 24 | """ 25 | Call `fast_rcnn_inference_single_image` for all images. 26 | 27 | Args: 28 | boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic 29 | boxes for each image. Element i has shape (Ri, K * 4) if doing 30 | class-specific regression, or (Ri, 4) if doing class-agnostic 31 | regression, where Ri is the number of predicted objects for image i. 32 | This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`. 33 | scores (list[Tensor]): A list of Tensors of predicted class scores for each image. 
34 | Element i has shape (Ri, K + 1), where Ri is the number of predicted objects 35 | for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`. 36 | image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch. 37 | score_thresh (float): Only return detections with a confidence score exceeding this 38 | threshold. 39 | nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. 40 | topk_per_image (int): The number of top scoring detections to return. Set < 0 to return 41 | all detections. 42 | 43 | Returns: 44 | instances: (list[Instances]): A list of N instances, one for each image in the batch, 45 | that stores the topk most confidence detections. 46 | kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates 47 | the corresponding boxes/scores index in [0, Ri) from the input, for image i. 48 | """ 49 | result_per_image = [ 50 | fast_rcnn_inference_single_image( 51 | boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image 52 | ) 53 | for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes) 54 | ] 55 | return [x[0] for x in result_per_image], [x[1] for x in result_per_image] 56 | 57 | def fast_rcnn_inference_single_image( 58 | boxes, 59 | scores, 60 | image_shape: Tuple[int, int], 61 | score_thresh: float, 62 | nms_thresh: float, 63 | topk_per_image: int, 64 | ): 65 | """ 66 | Single-image inference. Return bounding-box detection results by thresholding 67 | on scores and applying non-maximum suppression (NMS). 68 | 69 | Args: 70 | Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes 71 | per image. 72 | 73 | Returns: 74 | Same as `fast_rcnn_inference`, but for only one image. 75 | """ 76 | valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1) 77 | if not valid_mask.all(): 78 | boxes = boxes[valid_mask] 79 | scores = scores[valid_mask] 80 | 81 | scores = scores[:, :-1] 82 | num_bbox_reg_classes = boxes.shape[1] // 4 83 | 84 | # Convert to Boxes to use the `clip` function ... 85 | boxes = Boxes(boxes.reshape(-1, 4)) 86 | boxes.clip(image_shape) 87 | boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 88 | 89 | # 1. Filter results based on detection scores. It can make NMS more efficient 90 | # by filtering out low-confidence detections. 91 | filter_mask = scores > score_thresh # R x K 92 | 93 | # R' x 2. First column contains indices of the R predictions; 94 | # Second column contains indices of classes. 95 | filter_inds = filter_mask.nonzero() 96 | if num_bbox_reg_classes == 1: 97 | boxes = boxes[filter_inds[:, 0], 0] 98 | else: 99 | boxes = boxes[filter_mask] 100 | 101 | scores_full = scores[filter_inds[:, 0]] 102 | scores = scores[filter_mask] 103 | 104 | # 2. Apply NMS for each class independently. 
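    # filter_inds[:, 1] carries each surviving box's class index, so batched_nms only
    # suppresses overlapping boxes of the same class. Conceptually (a sketch, not the
    # exact implementation): for every class c, run plain NMS on the subset
    # boxes[filter_inds[:, 1] == c] and keep the union of the surviving indices.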
105 | keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh) 106 | if topk_per_image >= 0: 107 | keep = keep[:topk_per_image] 108 | 109 | boxes, scores, filter_inds, scores_full = boxes[keep], scores[keep], filter_inds[keep], scores_full[keep] 110 | 111 | result = Instances(image_shape) 112 | result.pred_boxes = Boxes(boxes) 113 | result.scores = scores 114 | result.scores_full = scores_full 115 | result.pred_classes = filter_inds[:, 1] 116 | return result, filter_inds[:, 0] 117 | 118 | 119 | class FastRCNNOutputs(FastRCNNOutputLayers): 120 | 121 | def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]): 122 | """ 123 | Args: 124 | predictions: return values of :meth:`forward()`. 125 | proposals (list[Instances]): proposals that match the features that were 126 | used to compute predictions. The ``proposal_boxes`` field is expected. 127 | 128 | Returns: 129 | list[Instances]: same as `fast_rcnn_inference`. 130 | list[Tensor]: same as `fast_rcnn_inference`. 131 | """ 132 | boxes = self.predict_boxes(predictions, proposals) 133 | scores = self.predict_probs(predictions, proposals) 134 | 135 | image_shapes = [x.image_size for x in proposals] 136 | return fast_rcnn_inference( 137 | boxes, 138 | scores, 139 | image_shapes, 140 | self.test_score_thresh, 141 | self.test_nms_thresh, 142 | self.test_topk_per_image, 143 | ) 144 | 145 | def losses(self, predictions, proposals): 146 | """ 147 | Args: 148 | predictions: return values of :meth:`forward()`. 149 | proposals (list[Instances]): proposals that match the features that were used 150 | to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``, 151 | ``gt_classes`` are expected. 152 | 153 | Returns: 154 | Dict[str, Tensor]: dict of losses 155 | """ 156 | scores, proposal_deltas = predictions 157 | 158 | # parse classification outputs 159 | gt_classes = ( 160 | cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0) 161 | ) 162 | 163 | # parse box regression outputs 164 | if len(proposals): 165 | proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) # Nx4 166 | assert not proposal_boxes.requires_grad, "Proposals should not require gradients!" 167 | # If "gt_boxes" does not exist, the proposals must be all negative and 168 | # should not be included in regression loss computation. 169 | # Here we just use proposal_boxes as an arbitrary placeholder because its 170 | # value won't be used in self.box_reg_loss(). 171 | gt_boxes = cat( 172 | [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals], 173 | dim=0, 174 | ) 175 | else: 176 | proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device) 177 | 178 | 179 | normalize_factor = max(gt_classes.numel(), 1.0) 180 | 181 | ''' 182 | Standard Faster R-CNN losses 183 | ''' 184 | _log_classification_stats(scores, gt_classes) 185 | loss_cls = cross_entropy(scores, gt_classes, reduction="mean") 186 | loss_box_reg = self.box_reg_loss(proposal_boxes, gt_boxes, proposal_deltas, gt_classes, reduction="none") 187 | loss_box_reg = (loss_box_reg).sum() / normalize_factor 188 | 189 | losses = { 190 | "BoxHead/loss_cls": loss_cls, 191 | "BoxHead/loss_box_reg": loss_box_reg, 192 | } 193 | 194 | return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()} 195 | 196 | def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes, reduction='mean'): 197 | """ 198 | Args: 199 | All boxes are tensors with the same shape Rx(4 or 5). 
200 | gt_classes is a long tensor of shape R, the gt class label of each proposal. 201 | R shall be the number of proposals. 202 | """ 203 | box_dim = proposal_boxes.shape[1] # 4 or 5 204 | 205 | # Regression loss is only computed for foreground proposals (those matched to a GT) 206 | fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0] 207 | if pred_deltas.shape[1] == box_dim: # cls-agnostic regression 208 | fg_pred_deltas = pred_deltas[fg_inds] 209 | else: 210 | fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[ 211 | fg_inds, gt_classes[fg_inds] 212 | ] 213 | 214 | if reduction == 'mean': 215 | if self.box_reg_loss_type == "smooth_l1": 216 | gt_pred_deltas = self.box2box_transform.get_deltas( 217 | proposal_boxes[fg_inds], 218 | gt_boxes[fg_inds], 219 | ) 220 | loss_box_reg = smooth_l1_loss( 221 | fg_pred_deltas, gt_pred_deltas, self.smooth_l1_beta, reduction="sum" 222 | ) 223 | elif self.box_reg_loss_type == "giou": 224 | fg_pred_boxes = self.box2box_transform.apply_deltas( 225 | fg_pred_deltas, proposal_boxes[fg_inds] 226 | ) 227 | loss_box_reg = giou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum") 228 | else: 229 | raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'") 230 | 231 | # The reg loss is normalized using the total number of regions (R), not the number 232 | # of foreground regions even though the box regression loss is only defined on 233 | # foreground regions. Why? Because doing so gives equal training influence to 234 | # each foreground example. To see how, consider two different minibatches: 235 | # (1) Contains a single foreground region 236 | # (2) Contains 100 foreground regions 237 | # If we normalize by the number of foreground regions, the single example in 238 | # minibatch (1) will be given 100 times as much influence as each foreground 239 | # example in minibatch (2). Normalizing by the total number of regions, R, 240 | # means that the single example in minibatch (1) and each of the 100 examples 241 | # in minibatch (2) are given equal influence. 242 | return loss_box_reg / max(gt_classes.numel(), 1.0) # return 0 if empty 243 | 244 | elif reduction == 'none': 245 | if self.box_reg_loss_type == "smooth_l1": 246 | gt_pred_deltas = self.box2box_transform.get_deltas( 247 | proposal_boxes[fg_inds], 248 | gt_boxes[fg_inds], 249 | ) 250 | loss_box_reg = smooth_l1_loss( 251 | fg_pred_deltas, gt_pred_deltas, self.smooth_l1_beta, reduction="none" 252 | ) 253 | else: 254 | raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'") 255 | 256 | # return non-reduced type 257 | return loss_box_reg 258 | 259 | else: 260 | raise ValueError(f"Invalid bbox reg reduction type '{reduction}'") 261 | 262 | -------------------------------------------------------------------------------- /cubercnn/modeling/roi_heads/roi_heads_gdino.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | import sys 3 | sys.path.append('./GroundingDino/') 4 | from cubercnn.modeling.roi_heads.roi_heads import * 5 | 6 | from torchvision.ops import nms 7 | 8 | # GroundingDINO imports 9 | from groundingdino.models import build_model 10 | from groundingdino.util.slconfig import SLConfig 11 | from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap 12 | from groundingdino.util.vl_utils import create_positive_map_from_span 13 | from transformers import AutoTokenizer 14 | 15 | 16 | def load_model(model_config_path, model_checkpoint_path, cpu_only=False): 17 | args = SLConfig.fromfile(model_config_path) 18 | args.device = "cuda" if not cpu_only else "cpu" 19 | model = build_model(args) 20 | checkpoint = torch.load(model_checkpoint_path, map_location="cpu", weights_only=True) 21 | load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False) 22 | _ = model.eval() 23 | return model 24 | 25 | 26 | @ROI_HEADS_REGISTRY.register() 27 | class ROIHeads3DGDINO(ROIHeads3D): 28 | 29 | @configurable 30 | def __init__( 31 | self, 32 | *, 33 | ignore_thresh: float, 34 | cube_head: nn.Module, 35 | cube_pooler: nn.Module, 36 | loss_w_3d: float, 37 | loss_w_xy: float, 38 | loss_w_z: float, 39 | loss_w_dims: float, 40 | loss_w_pose: float, 41 | loss_w_joint: float, 42 | use_confidence: float, 43 | inverse_z_weight: bool, 44 | z_type: str, 45 | pose_type: str, 46 | cluster_bins: int, 47 | priors = None, 48 | dims_priors_enabled = None, 49 | dims_priors_func = None, 50 | disentangled_loss=None, 51 | virtual_depth=None, 52 | virtual_focal=None, 53 | test_scale=None, 54 | allocentric_pose=None, 55 | chamfer_pose=None, 56 | scale_roi_boxes=None, 57 | **kwargs, 58 | ): 59 | super().__init__( 60 | ignore_thresh=ignore_thresh, 61 | cube_head=cube_head, 62 | cube_pooler=cube_pooler, 63 | loss_w_3d=loss_w_3d, 64 | loss_w_xy=loss_w_xy, 65 | loss_w_z=loss_w_z, 66 | loss_w_dims=loss_w_dims, 67 | loss_w_pose=loss_w_pose, 68 | loss_w_joint=loss_w_joint, 69 | use_confidence=use_confidence, 70 | inverse_z_weight=inverse_z_weight, 71 | z_type=z_type, 72 | pose_type=pose_type, 73 | cluster_bins=cluster_bins, 74 | priors=priors, 75 | dims_priors_enabled=dims_priors_enabled, 76 | dims_priors_func=dims_priors_func, 77 | disentangled_loss=disentangled_loss, 78 | virtual_depth=virtual_depth, 79 | virtual_focal=virtual_focal, 80 | test_scale=test_scale, 81 | allocentric_pose=allocentric_pose, 82 | chamfer_pose=chamfer_pose, 83 | scale_roi_boxes=scale_roi_boxes, 84 | **kwargs 85 | ) 86 | 87 | self.groundingdino_model = load_model( 88 | "./configs/GroundingDINO_SwinB_cfg.py", 89 | "./checkpoints/groundingdino_swinb_cogcoor.pth", 90 | cpu_only=False 91 | ) 92 | 93 | def forward(self, images, features, proposals, Ks, im_scales_ratio, targets=None, category_list=None): 94 | 95 | im_dims = [image.shape[1:] for image in images] 96 | 97 | # del images 98 | 99 | if self.training: 100 | proposals = self.label_and_sample_proposals(proposals, targets) 101 | 102 | del targets 103 | 104 | if self.training: 105 | 106 | losses = self._forward_box(features, proposals) 107 | if self.loss_w_3d > 0: 108 | instances_3d, losses_cube = self._forward_cube(features, proposals, Ks, im_dims, im_scales_ratio) 109 | losses.update(losses_cube) 110 | 111 | return instances_3d, losses 112 | 113 | else: 114 | 115 | # when oracle is available, by pass the box forward. 116 | # simulate the predicted instances by creating a new 117 | # instance for each passed in image. 
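            # In this oracle path each proposal is expected to be a plain dict rather
            # than an Instances object, e.g. (hypothetical values):
            #   {'gt_bbox2D': tensor([[x1, y1, x2, y2]]), 'gt_classes': tensor([3])}
            # Each dict is wrapped into an Instances with a constant confidence of 1.0 below.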
118 | if isinstance(proposals, list) and ~np.any([isinstance(p, Instances) for p in proposals]): 119 | pred_instances = [] 120 | for proposal, im_dim in zip(proposals, im_dims): 121 | 122 | pred_instances_i = Instances(im_dim) 123 | pred_instances_i.pred_boxes = Boxes(proposal['gt_bbox2D']) 124 | pred_instances_i.pred_classes = proposal['gt_classes'] 125 | pred_instances_i.scores = torch.ones_like(proposal['gt_classes']).float() 126 | pred_instances.append(pred_instances_i) 127 | else: 128 | pred_instances = self._forward_box(features, proposals) 129 | 130 | if category_list: 131 | filtered_texts = [ [cat] for cat in category_list] 132 | 133 | # Return empty Instances object if no valid text is found 134 | if not filtered_texts: 135 | target = Instances(pred_instances[0].image_size) 136 | target.pred_classes = torch.tensor([], dtype=torch.int64) # Empty class tensor 137 | target.pred_boxes = Boxes(torch.tensor([], dtype=torch.float32).view(-1, 4)) # Empty boxes tensor 138 | target.scores = torch.tensor([], dtype=torch.float32) # Empty scores tensor 139 | target = target.to(device=pred_instances[0].scores.device) 140 | 141 | else: 142 | 143 | # use grounding dino prediction 144 | configs = { 145 | "groundingdino_model": self.groundingdino_model, 146 | "image": images[0][[2, 1, 0], :, :], 147 | "text_prompt": filtered_texts, 148 | "box_threshold": 0.001, 149 | "text_threshold": 0.25, 150 | "token_spans": None, 151 | "cpu_only": False 152 | } 153 | 154 | ov_pred_instances = grounding_dino_inference_detector(configs) 155 | 156 | # init target 157 | target = Instances(pred_instances[0].image_size) 158 | 159 | # add classes, 2D boxes, scores 160 | class_names = ov_pred_instances["labels"] 161 | # h, w = pred_instances[0].image_size 162 | target.pred_classes = torch.tensor([filtered_texts.index([class_name]) for class_name in class_names]) 163 | target.pred_boxes = Boxes( ov_pred_instances["bboxes"]) 164 | # max_scores = [torch.max(score_tensor).item() for score_tensor in ov_pred_instances["scores"]] 165 | # target.scores = torch.tensor(max_scores).float() 166 | target.scores = ov_pred_instances["scores"] 167 | target = target.to(device=pred_instances[0].scores.device) 168 | 169 | if self.loss_w_3d > 0: 170 | pred_instances = self._forward_cube(features, [target,], Ks, im_dims, im_scales_ratio) 171 | return pred_instances, {} 172 | 173 | 174 | def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None): 175 | assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!" 176 | cap_list = [cat[0] for cat in caption ] 177 | caption = " . ".join(cap_list) 178 | caption = caption.lower() 179 | caption = caption.strip() 180 | if not caption.endswith("."): 181 | caption = caption + " ." 
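    # For example (hypothetical categories), caption=[['chair'], ['dining table']] is
    # flattened into the single GroundingDINO text prompt
    #   "chair . dining table ."
    # i.e. category names joined by " . ", lower-cased, and terminated with " .".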
182 | device = "cuda" if not cpu_only else "cpu" 183 | model = model.to(device) 184 | image = image.to(device) 185 | with torch.no_grad(): 186 | outputs = model(image[None], captions=[caption]) 187 | logits = outputs["pred_logits"].sigmoid()[0] # (nq, 256) 188 | boxes = outputs["pred_boxes"][0] # (nq, 4) 189 | 190 | all_logits = [] 191 | 192 | # filter output 193 | if token_spans is None: 194 | tokenlizer = model.tokenizer 195 | tokenized = tokenlizer(caption) 196 | phrases_logits = get_phrase_logits_from_token_logits(logits, tokenized, tokenlizer, cap_list) 197 | filt_mask = phrases_logits.max(dim=1)[0] > box_threshold 198 | im_logits_filt = phrases_logits[filt_mask] 199 | boxes_filt = boxes[filt_mask].cpu() 200 | 201 | im_pred_scores, im_pred_classes = im_logits_filt.max(dim = -1) 202 | all_logits = im_pred_scores.cpu() 203 | pred_phrases = [cap_list[idx] for idx in im_pred_classes] 204 | 205 | else: 206 | # given-phrase mode 207 | positive_maps = create_positive_map_from_span( 208 | model.tokenizer(caption), 209 | token_span=token_spans 210 | ).to(image.device) # n_phrase, 256 211 | 212 | logits_for_phrases = positive_maps @ logits.T # n_phrase, nq 213 | all_logits = [] 214 | all_phrases = [] 215 | all_boxes = [] 216 | for (token_span, logit_phr) in zip(token_spans, logits_for_phrases): 217 | # get phrase 218 | phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span]) 219 | # get mask 220 | filt_mask = logit_phr > box_threshold 221 | # filt box 222 | all_boxes.append(boxes[filt_mask]) 223 | # filt logits 224 | all_logits.append(logit_phr[filt_mask]) 225 | if with_logits: 226 | logit_phr_num = logit_phr[filt_mask] 227 | all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num]) 228 | else: 229 | all_phrases.extend([phrase for _ in range(len(filt_mask))]) 230 | boxes_filt = torch.cat(all_boxes, dim=0).cpu() 231 | pred_phrases = all_phrases 232 | 233 | return boxes_filt, pred_phrases, all_logits 234 | 235 | 236 | def grounding_dino_inference_detector(config): 237 | image = config["image"] 238 | text_prompt = config["text_prompt"] 239 | box_threshold = config["box_threshold"] 240 | text_threshold = config["text_threshold"] 241 | token_spans = config["token_spans"] 242 | cpu_only = config["cpu_only"] 243 | groundingdino_model = config["groundingdino_model"] 244 | 245 | if token_spans is not None: 246 | text_threshold = None 247 | print("Using token_spans. 
Set the text_threshold to None.") 248 | 249 | boxes_filt, pred_phrases, all_logits = get_grounding_output( 250 | groundingdino_model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans=eval(f"{token_spans}") 251 | ) 252 | h, w = image.shape[1:] 253 | boxes_filt = box_cxcywh_to_xyxy(boxes_filt * torch.tensor([w, h, w, h])) 254 | nms_idx = nms(boxes_filt, all_logits, 0.5) 255 | all_logits = all_logits[nms_idx] 256 | boxes_filt = boxes_filt[nms_idx] 257 | pred_phrases = [pred_phrases[idx] for idx in nms_idx] 258 | ov_pred_instances = {} 259 | ov_pred_instances["scores"] = all_logits 260 | ov_pred_instances["bboxes"] = boxes_filt 261 | ov_pred_instances["labels"] = pred_phrases 262 | 263 | return ov_pred_instances 264 | 265 | 266 | def box_cxcywh_to_xyxy(x): 267 | x_c, y_c, w, h = x.unbind(-1) 268 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 269 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 270 | return torch.stack(b, dim=-1) 271 | 272 | 273 | def get_phrase_logits_from_token_logits( 274 | token_logits: torch.Tensor, tokenized: Dict, tokenizer: AutoTokenizer, cap_list: List 275 | ): 276 | if token_logits.dim() == 2: # (num of query, 256) 277 | tokenized_phrases = tokenizer(cap_list, add_special_tokens=False)['input_ids'] 278 | begin_id = 1 279 | phrase_logits = [] 280 | ids = list(range(len(tokenized['input_ids']))) 281 | phrases_ids = [] 282 | for phrase_tokens in tokenized_phrases: 283 | end_id = begin_id + len(phrase_tokens) 284 | assert phrase_tokens == tokenized['input_ids'][begin_id : end_id], "assert error!!!" 285 | phrases_ids.append(ids[begin_id : end_id]) 286 | begin_id = end_id + 1 287 | for phrase_ids in phrases_ids: 288 | # import pdb;pdb.set_trace() 289 | phrase_logit = token_logits[:, phrase_ids].sum(dim=-1) 290 | phrase_logits.append(phrase_logit) 291 | phrase_logits = torch.stack(phrase_logits, dim=1) 292 | return phrase_logits 293 | else: 294 | raise NotImplementedError("token_logits must be 1-dim") -------------------------------------------------------------------------------- /cubercnn/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import * 2 | from .checkpoint import * -------------------------------------------------------------------------------- /cubercnn/solver/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | import torch 3 | from typing import Any, Dict, List, Set 4 | from detectron2.solver.build import maybe_add_gradient_clipping 5 | 6 | def build_optimizer(cfg, model): 7 | norm_module_types = ( 8 | torch.nn.BatchNorm1d, 9 | torch.nn.BatchNorm2d, 10 | torch.nn.BatchNorm3d, 11 | torch.nn.SyncBatchNorm, 12 | torch.nn.GroupNorm, 13 | torch.nn.InstanceNorm1d, 14 | torch.nn.InstanceNorm2d, 15 | torch.nn.InstanceNorm3d, 16 | torch.nn.LayerNorm, 17 | torch.nn.LocalResponseNorm, 18 | ) 19 | params: List[Dict[str, Any]] = [] 20 | memo: Set[torch.nn.parameter.Parameter] = set() 21 | for module in model.modules(): 22 | for key, value in module.named_parameters(recurse=False): 23 | if not value.requires_grad: 24 | continue 25 | # Avoid duplicating parameters 26 | if value in memo: 27 | continue 28 | memo.add(value) 29 | 30 | lr = cfg.SOLVER.BASE_LR 31 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 32 | 33 | if isinstance(module, norm_module_types) and (cfg.SOLVER.WEIGHT_DECAY_NORM is not None): 34 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM 35 | 36 | elif key == "bias": 37 | if (cfg.SOLVER.BIAS_LR_FACTOR is not None): 38 | lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR 39 | if (cfg.SOLVER.WEIGHT_DECAY_BIAS is not None): 40 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS 41 | 42 | # these params do not need weight decay at all 43 | # TODO parameterize these in configs instead. 44 | if key in ['priors_dims_per_cat', 'priors_z_scales', 'priors_z_stats']: 45 | weight_decay = 0.0 46 | 47 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 48 | 49 | if cfg.SOLVER.TYPE == 'sgd': 50 | optimizer = torch.optim.SGD( 51 | params, 52 | cfg.SOLVER.BASE_LR, 53 | momentum=cfg.SOLVER.MOMENTUM, 54 | nesterov=cfg.SOLVER.NESTEROV, 55 | weight_decay=cfg.SOLVER.WEIGHT_DECAY 56 | ) 57 | elif cfg.SOLVER.TYPE == 'adam': 58 | optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR, eps=1e-02) 59 | elif cfg.SOLVER.TYPE == 'adam+amsgrad': 60 | optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR, amsgrad=True, eps=1e-02) 61 | elif cfg.SOLVER.TYPE == 'adamw': 62 | optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR, eps=1e-02) 63 | elif cfg.SOLVER.TYPE == 'adamw+amsgrad': 64 | optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR, amsgrad=True, eps=1e-02) 65 | else: 66 | raise ValueError('{} is not supported as an optimizer.'.format(cfg.SOLVER.TYPE)) 67 | 68 | optimizer = maybe_add_gradient_clipping(cfg, optimizer) 69 | return optimizer 70 | 71 | def freeze_bn(network): 72 | 73 | for _, module in network.named_modules(): 74 | if isinstance(module, torch.nn.BatchNorm2d): 75 | module.eval() 76 | module.track_running_stats = False 77 | -------------------------------------------------------------------------------- /cubercnn/solver/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.checkpoint import PeriodicCheckpointer 3 | from typing import Any 4 | 5 | class PeriodicCheckpointerOnlyOne(PeriodicCheckpointer): 6 | def step(self, iteration: int, **kwargs: Any) -> None: 7 | """ 8 | Perform the appropriate action at the given iteration. 9 | 10 | Args: 11 | iteration (int): the current iteration, ranged in [0, max_iter-1]. 12 | kwargs (Any): extra data to save, same as in 13 | :meth:`Checkpointer.save`. 
14 | """ 15 | iteration = int(iteration) 16 | additional_state = {"iteration": iteration} 17 | additional_state.update(kwargs) 18 | 19 | if (iteration + 1) % self.period == 0: 20 | 21 | # simply save a single recent model 22 | self.checkpointer.save( 23 | "{}_recent".format(self.file_prefix), **additional_state 24 | ) 25 | 26 | if self.max_iter is not None: 27 | if iteration >= self.max_iter - 1: 28 | self.checkpointer.save(f"{self.file_prefix}_final", **additional_state) -------------------------------------------------------------------------------- /cubercnn/util/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | from .model_zoo import * 3 | from .math_util import * -------------------------------------------------------------------------------- /cubercnn/util/model_zoo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.utils.file_io import PathHandler, PathManager 3 | 4 | __all__ = ["CubeRCNNHandler"] 5 | 6 | class CubeRCNNHandler(PathHandler): 7 | """ 8 | Resolves CubeRCNN's model zoo files. 9 | """ 10 | 11 | PREFIX = "cubercnn://" 12 | CUBERCNN_PREFIX = "https://dl.fbaipublicfiles.com/cubercnn/" 13 | 14 | def _get_supported_prefixes(self): 15 | return [self.PREFIX] 16 | 17 | def _get_local_path(self, path): 18 | name = path[len(self.PREFIX) :] 19 | return PathManager.get_local_path(self.CUBERCNN_PREFIX + name) 20 | 21 | def _open(self, path, mode="r", **kwargs): 22 | return PathManager.open(self._get_local_path(path), mode, **kwargs) 23 | 24 | 25 | PathManager.register_handler(CubeRCNNHandler()) -------------------------------------------------------------------------------- /cubercnn/util/util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | import json 3 | import pickle 4 | import cv2 5 | from time import time 6 | import numpy as np 7 | import os 8 | import shutil 9 | import scipy.io 10 | from PIL import Image 11 | from glob import glob 12 | from difflib import SequenceMatcher 13 | import matplotlib.colors as mplc 14 | 15 | def file_parts(file_path): 16 | 17 | base_path, tail = os.path.split(file_path) 18 | name, ext = os.path.splitext(tail) 19 | 20 | return base_path, name, ext 21 | 22 | def save_json(path, data): 23 | 24 | with open(path, 'w') as fp: 25 | json.dump(data, fp) 26 | 27 | def load_json(path): 28 | 29 | with open(path, 'r') as fp: 30 | data = json.load(fp) 31 | 32 | return data 33 | 34 | def load_mat(path): 35 | 36 | data = scipy.io.loadmat(path, struct_as_record=False, squeeze_me=True) 37 | 38 | return data 39 | 40 | def pickle_write(file_path, obj): 41 | 42 | with open(file_path, 'wb') as file: 43 | pickle.dump(obj, file) 44 | 45 | 46 | def pickle_read(file_path, latin=False, iso8859=False, bytes=False): 47 | 48 | 49 | with open(file_path, 'rb') as file: 50 | if bytes: 51 | obj = pickle.load(file, encoding='bytes') 52 | elif latin: 53 | obj = pickle.load(file, encoding='latin1') 54 | elif iso8859: 55 | obj = pickle.load(file, encoding='iso-8859-1') 56 | 57 | # default encoding 58 | else: 59 | obj = pickle.load(file) 60 | 61 | 62 | return obj 63 | 64 | def imread(path): 65 | return cv2.imread(path) 66 | 67 | # much faster than reading the entire image, just to get the width, height 68 | def imreadstats(path): 69 | 70 | im = Image.open(path) 71 | width, height = im.size 72 | 73 | return width, height 74 | 75 | def imwrite(im, path): 76 | cv2.imwrite(path, im) 77 | 78 | def compute_eta(start_time, idx, total): 79 | """ 80 | Computes estimated time left for an iterative function to finish. 81 | Args: 82 | start_time (int): the time the function started at (e.g from time()) 83 | idx (int): the index the function is currently on, or has completed. 84 | total (int): the total amount that needs to pass for completion. 85 | Returns: 86 | time_str (str): convenient string to display the time remaining 87 | in seconds, minutes or hours depending on magnitude. 88 | dt (float): the average change in seconds per iteration. 89 | """ 90 | 91 | # cannot be less than 1 92 | idx = max(idx, 1) 93 | 94 | dt = (time() - start_time)/idx 95 | timeleft = np.max([dt * (total - idx), 0]) 96 | if timeleft > 3600: time_str = '{:.1f}h'.format(timeleft / 3600); 97 | elif timeleft > 60: time_str = '{:.1f}m'.format(timeleft / 60); 98 | else: time_str = '{:.1f}s'.format(timeleft); 99 | 100 | return time_str, dt 101 | 102 | def list_files(base_dir, file_pattern): 103 | """ 104 | Returns a list of files given a directory and pattern 105 | The results are sorted alphabetically 106 | Example: 107 | files = list_files('path/to/images/', '*.jpg') 108 | """ 109 | return sorted(glob(os.path.join(base_dir) + file_pattern)) 110 | 111 | def list_subdirectories(path, include_files=False): 112 | 113 | # this lists everything. 114 | if include_files: 115 | return sorted(glob(os.path.join(path, '*'))) 116 | 117 | # only subdirectories. 
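    # e.g. (hypothetical layout) list_subdirectories('datasets') -> ['datasets/dirA', 'datasets/dirB'],
    # skipping any loose files directly under 'datasets'; with include_files=True both
    # files and directories are returned, sorted alphabetically.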
118 | else: 119 | return [fpath for fpath in glob(os.path.join(path, '*')) if os.path.isdir(fpath)] 120 | 121 | def mkdir_if_missing(directory, delete_if_exist=False): 122 | 123 | if delete_if_exist and os.path.exists(directory): shutil.rmtree(directory) 124 | 125 | # check if not exist, then make 126 | if not os.path.exists(directory): 127 | os.makedirs(directory) 128 | 129 | # All coco categories, together with their nice-looking visualization colors 130 | # It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json 131 | COCO_CATEGORIES = [ 132 | {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, 133 | {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, 134 | {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, 135 | {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, 136 | {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, 137 | {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, 138 | {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, 139 | {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, 140 | {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, 141 | {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, 142 | {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, 143 | {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, 144 | {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, 145 | {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, 146 | {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, 147 | {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, 148 | {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, 149 | {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, 150 | {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, 151 | {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, 152 | {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, 153 | {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, 154 | {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, 155 | {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, 156 | {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, 157 | {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, 158 | {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, 159 | {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, 160 | {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, 161 | {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, 162 | {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, 163 | {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, 164 | {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, 165 | {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, 166 | {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, 167 | {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, 168 | {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, 169 | {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, 170 | {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, 171 | {"color": [197, 226, 255], "isthing": 
1, "id": 44, "name": "bottle"}, 172 | {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, 173 | {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, 174 | {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, 175 | {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, 176 | {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, 177 | {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, 178 | {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, 179 | {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, 180 | {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, 181 | {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, 182 | {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, 183 | {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, 184 | {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, 185 | {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, 186 | {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, 187 | {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, 188 | {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, 189 | {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, 190 | {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, 191 | {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, 192 | {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, 193 | {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, 194 | {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, 195 | {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, 196 | {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, 197 | {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, 198 | {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, 199 | {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, 200 | {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, 201 | {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, 202 | {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, 203 | {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, 204 | {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, 205 | {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, 206 | {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, 207 | {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, 208 | {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, 209 | {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, 210 | {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, 211 | {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, 212 | {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"}, 213 | {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"}, 214 | {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"}, 215 | {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"}, 216 | {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"}, 217 | {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"}, 218 | {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": 
"door-stuff"}, 219 | {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"}, 220 | {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"}, 221 | {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"}, 222 | {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"}, 223 | {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"}, 224 | {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"}, 225 | {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"}, 226 | {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"}, 227 | {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"}, 228 | {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"}, 229 | {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"}, 230 | {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"}, 231 | {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"}, 232 | {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"}, 233 | {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"}, 234 | {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"}, 235 | {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"}, 236 | {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"}, 237 | {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"}, 238 | {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"}, 239 | {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"}, 240 | {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"}, 241 | {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"}, 242 | {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"}, 243 | {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"}, 244 | {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"}, 245 | {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"}, 246 | {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"}, 247 | {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"}, 248 | {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"}, 249 | {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"}, 250 | {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"}, 251 | {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"}, 252 | {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"}, 253 | {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"}, 254 | {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"}, 255 | {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"}, 256 | {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"}, 257 | {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"}, 258 | {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"}, 259 | {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"}, 260 | {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"}, 261 | {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"}, 262 | {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"}, 263 | {"color": [102, 102, 156], "isthing": 0, 
"id": 199, "name": "wall-other-merged"}, 264 | {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"},] 265 | 266 | 267 | _colors = [cat['color'] for cat in COCO_CATEGORIES] 268 | 269 | def _jitter(color): 270 | """ 271 | Randomly modifies given color to produce a slightly different color than the color given. 272 | Args: 273 | color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color 274 | picked. The values in the list are in the [0.0, 1.0] range. 275 | Returns: 276 | jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the 277 | color after being jittered. The values in the list are in the [0.0, 1.0] range. 278 | """ 279 | color = [c/255.0 for c in color] 280 | color = mplc.to_rgb(color) 281 | vec = np.random.rand(3) 282 | 283 | # better to do it in another color space 284 | vec = vec / np.linalg.norm(vec) * 0.5 285 | res = np.clip(vec + color, 0, 1) 286 | return [c*255.0 for c in res] 287 | 288 | 289 | def get_color(ind=None, hex=False): 290 | 291 | if ind is None: 292 | ind = np.random.randint(len(_colors)) 293 | 294 | color = _jitter(_colors[ind % len(_colors)]) 295 | 296 | if hex: 297 | return '#%02x%02x%02x' % (color[0], color[1], color[2]) 298 | 299 | else: 300 | return color 301 | 302 | def string_similarity(text1, text2): 303 | return SequenceMatcher(None, text1, text2).ratio() -------------------------------------------------------------------------------- /cubercnn/vis/__init__.py: -------------------------------------------------------------------------------- 1 | from .vis import * -------------------------------------------------------------------------------- /cubercnn/vis/logperf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from termcolor import colored 3 | import itertools 4 | from tabulate import tabulate 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def print_ap_category_histogram(dataset, results): 10 | """ 11 | Prints AP and AR performance for each category. 12 | Args: 13 | results: dictionary; each entry contains information for a dataset 14 | """ 15 | num_classes = len(results) 16 | N_COLS = 10 17 | data = list( 18 | itertools.chain( 19 | *[ 20 | [ 21 | cat, 22 | out["AP2D"], 23 | out["AP3D"], 24 | out.get("AR2D", "-"), 25 | out.get("AR3D", "-") 26 | ] 27 | for cat, out in results.items() 28 | ] 29 | ) 30 | ) 31 | data.extend([None] * (N_COLS - (len(data) % N_COLS))) 32 | data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) 33 | table = tabulate( 34 | data, 35 | headers=["category", "AP2D", "AP3D", "AR2D", "AR3D"] * (N_COLS // 5), 36 | tablefmt="pipe", 37 | numalign="left", 38 | stralign="center", 39 | ) 40 | logger.info( 41 | "Performance for each of {} categories on {}:\n".format(num_classes, dataset) 42 | + colored(table, "cyan") 43 | ) 44 | 45 | 46 | def print_ap_analysis_histogram(results): 47 | """ 48 | Prints AP performance for various IoU thresholds and (near, medium, far) objects. 49 | Args: 50 | results: dictionary. 
Each entry in results contains outputs for a dataset 51 | """ 52 | metric_names = ["AP2D", "AP3D", "AP3D@15", "AP3D@25", "AP3D@50", "AP3D-N", "AP3D-M", "AP3D-F", "AR2D", "AR3D"] 53 | N_COLS = 10 54 | data = [] 55 | for name, metrics in results.items(): 56 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"], metrics["AP3D@15"], metrics["AP3D@25"], metrics["AP3D@50"], metrics["AP3D-N"], metrics["AP3D-M"], metrics["AP3D-F"], 57 | metrics["AR2D"], metrics["AR3D"]] 58 | data.append(data_item) 59 | table = tabulate( 60 | data, 61 | headers=["Dataset", "#iters", "AP2D", "AP3D", "AP3D@15", "AP3D@25", "AP3D@50", "AP3D-N", "AP3D-M", "AP3D-F", "AR2D", "AR3D"], 62 | tablefmt="grid", 63 | numalign="left", 64 | stralign="center", 65 | ) 66 | logger.info( 67 | "Per-dataset performance analysis on test set:\n" 68 | + colored(table, "cyan") 69 | ) 70 | 71 | 72 | def print_ap_dataset_histogram(results): 73 | """ 74 | Prints AP performance for each dataset. 75 | Args: 76 | results: list of dicts. Each entry in results contains outputs for a dataset 77 | """ 78 | metric_names = ["AP2D", "AP3D"] 79 | N_COLS = 4 80 | data = [] 81 | for name, metrics in results.items(): 82 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"]] 83 | data.append(data_item) 84 | table = tabulate( 85 | data, 86 | headers=["Dataset", "#iters", "AP2D", "AP3D"], 87 | tablefmt="grid", 88 | numalign="left", 89 | stralign="center", 90 | ) 91 | logger.info( 92 | "Per-dataset performance on test set:\n" 93 | + colored(table, "cyan") 94 | ) 95 | 96 | 97 | def print_ap_omni_histogram(results): 98 | """ 99 | Prints AP and AR performance for Omni3D dataset. 100 | Args: 101 | results: list of dicts. Each entry in results contains outputs for a dataset 102 | """ 103 | metric_names = ["AP2D", "AP3D", "AR2D", "AR3D"] 104 | N_COLS = 4 105 | data = [] 106 | for name, metrics in results.items(): 107 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"], metrics["AR2D"], metrics["AR3D"]] 108 | data.append(data_item) 109 | table = tabulate( 110 | data, 111 | headers=["Dataset", "#iters", "AP2D", "AP3D", "AR2D", "AR3D"], 112 | tablefmt="grid", 113 | numalign="left", 114 | stralign="center", 115 | ) 116 | logger.info( 117 | "Performance on Omni3D:\n" 118 | + colored(table, "magenta") 119 | ) 120 | 121 | def print_ap_hard_easy_for_novel(easy_metrics_formatted, hard_metrics_formatted): 122 | table_data = [ 123 | ["Easy Novel", easy_metrics_formatted['AP2D'], easy_metrics_formatted['AP3D'], 124 | easy_metrics_formatted['AR2D'], easy_metrics_formatted['AR3D']], 125 | ["Hard Novel", hard_metrics_formatted['AP2D'], hard_metrics_formatted['AP3D'], 126 | hard_metrics_formatted['AR2D'], hard_metrics_formatted['AR3D']] 127 | ] 128 | 129 | table = tabulate( 130 | table_data, 131 | headers=["Subset", "AP2D", "AP3D", "AR2D", "AR3D"], 132 | tablefmt="grid" 133 | ) 134 | 135 | logger.info("Novel Categories Evaluation Results on Easy and Hard subsets:\n" + table) 136 | -------------------------------------------------------------------------------- /datasets/ARKitScenes/download_arkitscenes_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Meta, Inc. and its affiliates. 
All Rights Reserved 4 | 5 | wget https://dl.fbaipublicfiles.com/omni3d_data/ARKitScenes_images.zip 6 | unzip ARKitScenes_images.zip -------------------------------------------------------------------------------- /datasets/Omni3D/download_omni3d_json.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Meta, Inc. and its affiliates. All Rights Reserved 4 | 5 | wget https://dl.fbaipublicfiles.com/omni3d_data/Omni3D_json.zip 6 | unzip Omni3D_json.zip -------------------------------------------------------------------------------- /datasets/coco_examples/000000044260.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000044260.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000088432.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000088432.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000101762.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000101762.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000120584.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000120584.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000128148.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000128148.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000162543.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000162543.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000164115.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000164115.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000311950.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000311950.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000429011.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000429011.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/labels.json: -------------------------------------------------------------------------------- 1 | { 2 | "000000044260": [ 3 | "apple" 4 | ], 5 | "000000088432": [ 6 | "truck", 7 | "traffic light", 8 | "fire hydrant" 9 | ], 10 | "000000101762": [ 11 | "bicycle", 12 | "cat" 13 | ], 14 | "000000120584": [ 15 | "clock" 16 | ], 17 | "000000128148": [ 18 | "book", 19 | "chair", 20 | "potted plant", 21 | "couch", 22 | "dining table" 23 | ], 24 | "000000162543": [ 25 | "elephant" 26 | ], 27 | "000000164115": [ 28 | "surfboard" 29 | ], 30 | "000000311950": [ 31 | "hot dog" 32 | ], 33 | "000000429011": [ 34 | "truck", 35 | "car" 36 | ] 37 | } -------------------------------------------------------------------------------- /datasets/objectron/download_objectron_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Meta, Inc. and its affiliates. All Rights Reserved 4 | 5 | wget https://dl.fbaipublicfiles.com/omni3d_data/objectron_images.zip 6 | unzip objectron_images.zip -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | import logging 3 | import os 4 | import argparse 5 | import sys 6 | import numpy as np 7 | from collections import OrderedDict 8 | import torch 9 | 10 | from detectron2.checkpoint import DetectionCheckpointer 11 | from detectron2.config import get_cfg 12 | from detectron2.engine import default_argument_parser, default_setup, launch 13 | from detectron2.data import transforms as T 14 | 15 | logger = logging.getLogger("detectron2") 16 | 17 | sys.dont_write_bytecode = True 18 | sys.path.append(os.getcwd()) 19 | np.set_printoptions(suppress=True) 20 | 21 | from cubercnn.config import get_cfg_defaults 22 | from cubercnn.modeling.proposal_generator import RPNWithIgnore 23 | from cubercnn.modeling.roi_heads import ROIHeads3D 24 | from cubercnn.modeling.meta_arch import RCNN3D, build_model 25 | from cubercnn.modeling.backbone import build_dla_from_vision_fpn_backbone 26 | from cubercnn import util, vis 27 | from pycocotools.coco import COCO 28 | from tqdm import tqdm 29 | 30 | 31 | def do_test(args, cfg, model): 32 | 33 | list_of_ims = util.list_files(os.path.join(args.input_folder, ''), '*') 34 | list_of_ims = [ im for im in list_of_ims if not im.endswith('.json')] 35 | list_of_cats_per_img = util.load_json(args.labels_file) 36 | 37 | model.eval() 38 | 39 | focal_length = args.focal_length 40 | principal_point = args.principal_point 41 | thres = args.threshold 42 | 43 | output_dir = cfg.OUTPUT_DIR 44 | min_size = cfg.INPUT.MIN_SIZE_TEST 45 | max_size = cfg.INPUT.MAX_SIZE_TEST 46 | augmentations = T.AugmentationList([T.ResizeShortestEdge(min_size, max_size, "choice")]) 47 | 48 | util.mkdir_if_missing(output_dir) 49 | 50 | for path in tqdm(list_of_ims): 51 | im_name = util.file_parts(path)[1] 52 | im = util.imread(path) 53 | cats = list_of_cats_per_img[im_name] 54 | if cats == []: 55 | continue 56 | if im is None: 57 | continue 58 | 59 | image_shape = im.shape[:2] # h, w 60 | 61 | 
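        # If --focal-length / --principal-point were not provided (their argparse defaults),
        # the intrinsics K below are built from a fallback pinhole model: a heuristic
        # focal length of 4.0 * h / 2 (NDC-style) and a principal point at the image
        # center (w/2, h/2).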
h, w = image_shape 62 | 63 | if focal_length == 0: 64 | focal_length_ndc = 4.0 65 | focal_length = focal_length_ndc * h / 2 66 | 67 | if len(principal_point) == 0: 68 | px, py = w/2, h/2 69 | else: 70 | px, py = principal_point 71 | 72 | K = np.array([ 73 | [focal_length, 0.0, px], 74 | [0.0, focal_length, py], 75 | [0.0, 0.0, 1.0] 76 | ]) 77 | 78 | aug_input = T.AugInput(im) 79 | _ = augmentations(aug_input) 80 | image = aug_input.image 81 | 82 | batched = [{ 83 | 'image': torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))).cuda(), 84 | 'height': image_shape[0], 'width': image_shape[1], 'K': K, 'category_list': cats 85 | }] 86 | dets = model(batched)[0]['instances'] 87 | n_det = len(dets) 88 | 89 | meshes = [] 90 | meshes_text = [] 91 | 92 | if n_det > 0: 93 | for idx, (corners3D, center_cam, center_2D, dimensions, pose, score, cat_idx) in enumerate(zip( 94 | dets.pred_bbox3D, dets.pred_center_cam, dets.pred_center_2D, dets.pred_dimensions, 95 | dets.pred_pose, dets.scores, dets.pred_classes 96 | )): 97 | 98 | # skip 99 | if score < thres: 100 | continue 101 | 102 | cat = cats[cat_idx] 103 | 104 | bbox3D = center_cam.tolist() + dimensions.tolist() 105 | meshes_text.append('{} {:.2f}'.format(cat, score)) 106 | color = [c/255.0 for c in util.get_color(idx)] 107 | box_mesh = util.mesh_cuboid(bbox3D, pose.tolist(), color=color) 108 | meshes.append(box_mesh) 109 | 110 | print('File: {} with {} dets'.format(im_name, len(meshes))) 111 | 112 | if len(meshes) > 0: 113 | im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(im, K, meshes, text=meshes_text, scale=im.shape[0], blend_weight=0.5, blend_weight_overlay=0.85) 114 | im_concat = np.concatenate((im_drawn_rgb, im_topdown), axis=1) 115 | if args.display: 116 | vis.imshow(im_concat) 117 | 118 | util.imwrite(im_concat, os.path.join(output_dir, im_name+'_combine.jpg')) 119 | # util.imwrite(im_drawn_rgb, os.path.join(output_dir, im_name+'_boxes.jpg')) 120 | # util.imwrite(im_topdown, os.path.join(output_dir, im_name+'_novel.jpg')) 121 | else: 122 | util.imwrite(im, os.path.join(output_dir, im_name+'_boxes.jpg')) 123 | 124 | def setup(args): 125 | """ 126 | Create configs and perform basic setups. 
127 | """ 128 | cfg = get_cfg() 129 | get_cfg_defaults(cfg) 130 | 131 | config_file = args.config_file 132 | 133 | # store locally if needed 134 | if config_file.startswith(util.CubeRCNNHandler.PREFIX): 135 | config_file = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, config_file) 136 | 137 | cfg.merge_from_file(config_file) 138 | cfg.merge_from_list(args.opts) 139 | cfg.freeze() 140 | default_setup(cfg, args) 141 | return cfg 142 | 143 | def main(args): 144 | cfg = setup(args) 145 | model = build_model(cfg) 146 | 147 | logger.info("Model:\n{}".format(model)) 148 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 149 | cfg.MODEL.WEIGHTS, resume=True 150 | ) 151 | 152 | with torch.no_grad(): 153 | do_test(args, cfg, model) 154 | 155 | if __name__ == "__main__": 156 | 157 | parser = argparse.ArgumentParser( 158 | epilog=None, formatter_class=argparse.RawDescriptionHelpFormatter, 159 | ) 160 | parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") 161 | parser.add_argument('--input-folder', type=str, help='list of image folders to process', required=True) 162 | parser.add_argument('--labels-file', type=str, help='path to labels file', required=True) 163 | parser.add_argument("--focal-length", type=float, default=0, help="focal length for image inputs (in px)") 164 | parser.add_argument("--principal-point", type=float, default=[], nargs=2, help="principal point for image inputs (in px)") 165 | parser.add_argument("--threshold", type=float, default=0.25, help="threshold on score for visualizing") 166 | parser.add_argument("--display", default=False, action="store_true", help="Whether to show the images in matplotlib",) 167 | 168 | parser.add_argument("--eval-only", default=True, action="store_true", help="perform evaluation only") 169 | parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") 170 | parser.add_argument("--num-machines", type=int, default=1, help="total number of machines") 171 | parser.add_argument( 172 | "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)" 173 | ) 174 | port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14 175 | parser.add_argument( 176 | "--dist-url", 177 | default="tcp://127.0.0.1:{}".format(port), 178 | help="initialization URL for pytorch distributed backend. See " 179 | "https://pytorch.org/docs/stable/distributed.html for details.", 180 | ) 181 | parser.add_argument( 182 | "opts", 183 | help="Modify config options by adding 'KEY VALUE' pairs at the end of the command. 
" 184 | "See config references at " 185 | "https://detectron2.readthedocs.io/modules/config.html#config-references", 186 | default=None, 187 | nargs=argparse.REMAINDER, 188 | ) 189 | 190 | args = parser.parse_args() 191 | 192 | print("Command Line Args:", args) 193 | launch( 194 | main, 195 | args.num_gpus, 196 | num_machines=args.num_machines, 197 | machine_rank=args.machine_rank, 198 | dist_url=args.dist_url, 199 | args=(args,), 200 | ) -------------------------------------------------------------------------------- /download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # -*- coding: utf-8 -*- 3 | 4 | wget -P datasets https://huggingface.co/datasets/uva-cv-lab/ovmono3d_data/resolve/main/ovmono3d_data.zip 5 | unzip datasets/ovmono3d_data.zip -d datasets/Omni3D 6 | 7 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | pip install git+https://github.com/facebookresearch/pytorch3d.git@055ab3a 2 | pip install git+https://github.com/yaojin17/detectron2.git # slightly modified detectron2 for OVMono3D 3 | pip install cython opencv-python scipy pandas einops open_clip_torch open3d 4 | 5 | pip install git+https://github.com/apple/ml-depth-pro.git@b2cd0d5 6 | pip install git+https://github.com/facebookresearch/segment-anything.git@dca509f 7 | pip install git+https://github.com/IDEA-Research/GroundingDINO.git@856dde2 8 | 9 | mkdir -p checkpoints 10 | wget -P ./checkpoints/ https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth 11 | wget -P checkpoints https://ml-site.cdn-apple.com/models/depth-pro/depth_pro.pt 12 | wget -P checkpoints https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth 13 | huggingface-cli download uva-cv-lab/ovmono3d_lift ovmono3d_lift.pth --local-dir checkpoints 14 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/tools/__init__.py -------------------------------------------------------------------------------- /tools/eval_ovmono3d_geo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import torch 5 | import numpy as np 6 | from collections import OrderedDict 7 | from detectron2.data import MetadataCatalog, DatasetCatalog 8 | from detectron2.utils.file_io import PathManager 9 | from detectron2.utils.logger import setup_logger 10 | import detectron2.utils.comm as comm 11 | 12 | sys.dont_write_bytecode = True 13 | sys.path.append(os.getcwd()) 14 | np.set_printoptions(suppress=True) 15 | from cubercnn.data import ( 16 | get_filter_settings_from_cfg, 17 | simple_register, 18 | get_omni3d_categories 19 | ) 20 | from cubercnn.evaluation import Omni3DEvaluationHelper 21 | from cubercnn import util 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | def setup_categories(category_path): 26 | """Setup category mapping""" 27 | metadata = util.load_json(category_path) 28 | thing_classes = metadata['thing_classes'] 29 | id_map = {int(key):val for key, val in metadata['thing_dataset_id_to_contiguous_id'].items()} 30 | MetadataCatalog.get('omni3d_model').thing_classes = thing_classes 31 | 
MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id = id_map 32 | 33 | def evaluate_predictions( 34 | dataset_names, 35 | prediction_paths, 36 | filter_settings, 37 | output_dir, 38 | category_path, 39 | eval_mode="novel", 40 | iter_label='final' 41 | ): 42 | """ 43 | Evaluate predictions from pre-computed prediction files. 44 | 45 | Args: 46 | dataset_names (list): List of dataset names to evaluate 47 | prediction_paths (dict): Dictionary mapping dataset names to prediction file paths 48 | filter_settings (dict): Filter settings for evaluation 49 | output_dir (str): Output directory for evaluation results 50 | category_path (str): Path to category metadata json file 51 | eval_mode (str): Evaluation mode, either "novel" or "base" 52 | iter_label (str): Label for the iteration being evaluated 53 | """ 54 | # Setup logging 55 | os.makedirs(output_dir, exist_ok=True) 56 | setup_logger(output=output_dir, name="cubercnn") 57 | 58 | # Setup categories 59 | setup_categories(category_path) 60 | 61 | # Initialize evaluation helper 62 | thing_classes = ['monitor', 'bag', 'dresser', 'board', 'printer', 'keyboard', 'painting', 'drawers', 'microwave', 'computer', 'kitchen pan', 'potted plant', 'tissues', 'rack', 'tray', 'toys', 'phone', 'podium', 'cart', 'soundsystem', 'fireplace', 'tram'] 63 | filter_settings['category_names'] = thing_classes 64 | eval_helper = Omni3DEvaluationHelper( 65 | dataset_names=dataset_names, 66 | filter_settings=filter_settings, 67 | output_folder=output_dir, 68 | iter_label=iter_label, 69 | only_2d=False, 70 | eval_categories=thing_classes 71 | ) 72 | 73 | # Load and evaluate predictions for each dataset 74 | for dataset_name in dataset_names: 75 | logger.info(f"Evaluating predictions for {dataset_name}") 76 | # to get the thing_classes and thing_dataset_id_to_contiguous_id for the MetadataCatalog.get(dataset_name) 77 | DatasetCatalog.get(dataset_name) 78 | # Load predictions 79 | pred_path = prediction_paths[dataset_name] 80 | if not os.path.exists(pred_path): 81 | raise FileNotFoundError(f"Prediction file not found: {pred_path}") 82 | 83 | with PathManager.open(pred_path, "rb") as f: 84 | predictions = torch.load(f) 85 | 86 | # Add predictions to evaluator 87 | eval_helper.add_predictions(dataset_name, predictions) 88 | 89 | # Run evaluation 90 | eval_helper.evaluate(dataset_name) 91 | 92 | # Save predictions if needed 93 | eval_helper.save_predictions(dataset_name) 94 | 95 | # Summarize results 96 | eval_helper.summarize_all() 97 | 98 | def main(): 99 | """Main function demonstrating how to use the evaluation script""" 100 | 101 | dataset_names = ["SUNRGBD_test_novel", "KITTI_test_novel", "ARKitScenes_test_novel"] 102 | prediction_paths = { 103 | "SUNRGBD_test_novel": "./output/ovmono3d_geo/SUNRGBD_test_novel.pth", 104 | "KITTI_test_novel": "./output/ovmono3d_geo/KITTI_test_novel.pth", 105 | "ARKitScenes_test_novel": "./output/ovmono3d_geo/ARKitScenes_test_novel.pth" 106 | } 107 | 108 | # Setup filter settings 109 | filter_settings = { 110 | 'visibility_thres': 0.33333333, 111 | 'truncation_thres': 0.33333333, 112 | 'min_height_thres': 0.0625, 113 | 'max_depth': 100000000.0, 114 | 'category_names': None, # Will be set based on category_path 115 | 'ignore_names': ['dontcare', 'ignore', 'void'], 116 | 'trunc_2D_boxes': True, 117 | 'modal_2D_boxes': False, 118 | 'max_height_thres': 1.5, 119 | } 120 | 121 | # Set paths 122 | output_dir = "./output/ovmono3d_geo" 123 | category_path = "./configs/category_meta.json" 124 | 125 | # Run evaluation 126 | 
evaluate_predictions( 127 | dataset_names=dataset_names, 128 | prediction_paths=prediction_paths, 129 | filter_settings=filter_settings, 130 | output_dir=output_dir, 131 | category_path=category_path, 132 | eval_mode="novel", 133 | iter_label='final' 134 | ) 135 | 136 | if __name__ == "__main__": 137 | main() -------------------------------------------------------------------------------- /tools/ovmono3d_geo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | import os 4 | import sys 5 | import torch 6 | import numpy as np 7 | import pdb 8 | import cv2 9 | import open3d as o3d 10 | import matplotlib.pyplot as plt 11 | import matplotlib.patches as patches 12 | from pytorch3d.transforms import ( 13 | rotation_6d_to_matrix, 14 | matrix_to_rotation_6d, 15 | ) 16 | from sklearn.decomposition import PCA 17 | from sklearn.cluster import DBSCAN 18 | from segment_anything import SamPredictor, sam_model_registry 19 | import glob 20 | from pytorch3d import _C 21 | import depth_pro 22 | import tqdm 23 | from sklearn.utils import shuffle 24 | 25 | sys.dont_write_bytecode = True 26 | sys.path.append(os.getcwd()) 27 | np.set_printoptions(suppress=True) 28 | 29 | from cubercnn.data import xywh_to_xyxy 30 | import cubercnn.util as util 31 | 32 | def project_3d_to_2d(X, Y, Z, K): 33 | 34 | fx = K[0, 0] 35 | fy = K[1, 1] 36 | cx = K[0, 2] 37 | cy = K[1, 2] 38 | 39 | x = (fx * X) / Z + cx 40 | y = (fy * Y) / Z + cy 41 | 42 | return x, y 43 | 44 | 45 | def get_dims(bbox3d): 46 | x = np.sqrt(np.sum((bbox3d[0] - bbox3d[1]) * (bbox3d[0] - bbox3d[1]))) 47 | y = np.sqrt(np.sum((bbox3d[0] - bbox3d[3]) * (bbox3d[0] - bbox3d[3]))) 48 | z = np.sqrt(np.sum((bbox3d[0] - bbox3d[4]) * (bbox3d[0] - bbox3d[4]))) 49 | return np.array([z, y, x]) 50 | 51 | def get_pose(bbox3d_a, bbox3d_b): 52 | # assume a and b share the same bbox center and have same dimension 53 | center = np.mean(bbox3d_a, axis=0) 54 | dim_a = get_dims(bbox3d_a) 55 | dim_b = get_dims(bbox3d_b) 56 | bbox3d_a -= center 57 | bbox3d_b -= center 58 | U, _, Vt = np.linalg.svd(bbox3d_a.T @ bbox3d_b, full_matrices=True) 59 | R = U @ Vt 60 | if np.linalg.det(R) < 0: 61 | U[:, -1] *= -1 62 | R = U @ Vt 63 | return R 64 | 65 | 66 | def auto_downsample(points, max_points): 67 | """ 68 | If the number of points exceeds max_points, randomly sample down to max_points. 69 | Otherwise, return the original point cloud. 70 | 71 | Parameters: 72 | points (numpy.ndarray): Input point cloud with shape (N, D), where N is the number of points, and D is the dimension. 73 | max_points (int): The maximum number of points to retain. 74 | 75 | Returns: 76 | sampled_points (numpy.ndarray): The downsampled point cloud. 
77 | """ 78 | num_points = len(points) 79 | if num_points > max_points: 80 | # Randomly sample points 81 | sampled_points = shuffle(points, random_state=42)[:max_points] 82 | print(f"Points downsampled from {num_points} to {max_points}.") 83 | else: 84 | sampled_points = points 85 | print(f"Points remain unchanged: {num_points}.") 86 | return sampled_points 87 | 88 | # (3) for each annotation, load image, run seg anything, unproject, clustering, 3D bbox, save to new annotations 89 | def build_lineset(bbox3d, color=[1,0,0], flip=True): 90 | if flip: 91 | flip_matrix = np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]]) 92 | bbox3d_flip = bbox3d.dot(flip_matrix) 93 | else: 94 | bbox3d_flip = bbox3d.copy() 95 | lines = [[0, 1], [1, 2], [2, 3], [0, 3], 96 | [4, 5], [5, 6], [6, 7], [4, 7], 97 | [0, 4], [1, 5], [2, 6], [3, 7]] 98 | # Use the same color for all lines 99 | colors = [color for _ in range(len(lines))] 100 | line_set = o3d.geometry.LineSet() 101 | line_set.points = o3d.utility.Vector3dVector(bbox3d_flip) 102 | line_set.lines = o3d.utility.Vector2iVector(lines) 103 | line_set.colors = o3d.utility.Vector3dVector(colors) 104 | return line_set 105 | 106 | def gen_8corners(x_min, y_min, z_min, cx, cy, cz): 107 | corners_flag = [[0,0,0], [1,0,0], [1,1,0], [0,1,0], 108 | [0,0,1], [1,0,1], [1,1,1], [0,1,1]] 109 | corners = [] 110 | for flag in corners_flag: 111 | c = np.array([x_min, y_min, z_min]) + np.array(flag) * np.array([cx, cy, cz]) 112 | corners.append(c) 113 | return np.array(corners) 114 | 115 | def heading2rotmat(heading_angle): 116 | rotmat = np.zeros((3,3)) 117 | rotmat[1, 1] = 1 118 | cosval = np.cos(heading_angle) 119 | sinval = np.sin(heading_angle) 120 | rotmat[0, 0] = cosval 121 | rotmat[0, 2] = -sinval 122 | rotmat[2, 0] = sinval 123 | rotmat[2, 2] = cosval 124 | return rotmat 125 | 126 | 127 | def build_pseudo_bbox3d_from_mask2d_outlier(mask2d, depth, K): 128 | frustum = [] 129 | depth = np.array(depth) # HxW 130 | 131 | ys, xs = np.where(mask2d > 0.5) 132 | # (1) generate mask 133 | for y, x in zip(ys, xs): 134 | # (2) unproject 2d points (visualize in 3D) 135 | z = depth[y, x] 136 | x_3d = z * (x - K[0, 2]) / K[0, 0] 137 | y_3d = z * (y - K[1, 2]) / K[1, 1] 138 | frustum.append([x_3d, -y_3d, -z]) # flip 139 | frustum = np.array(frustum) 140 | 141 | # (3) fit 3D bounding boxes (visualize in 3D) 142 | xyz_offset = np.mean(frustum, axis=0) 143 | xyz = frustum - xyz_offset 144 | pca = PCA(2) 145 | pca.fit(xyz[:, [0, 2]]) # xz plane 146 | yaw_vec = pca.components_[0, :] 147 | yaw = np.arctan2(yaw_vec[1], yaw_vec[0]) 148 | xyz_tmp = xyz.copy() 149 | pose = heading2rotmat(-yaw) 150 | xyz_tmp = (pose @ xyz_tmp[:,:3].T).T 151 | xyz_tmp += xyz_offset 152 | 153 | # remove outliers 154 | eps=0.01 155 | min_samples=100 156 | trial_time = 0 157 | # print(len(xyz_tmp)) 158 | max_points = 40000 159 | xyz_tmp = auto_downsample(xyz_tmp, max_points) 160 | while True: 161 | trial_time += 1 162 | if trial_time > 4: 163 | xyz_clean = xyz_tmp.copy() 164 | break 165 | db = DBSCAN(eps=eps, min_samples=min_samples).fit(xyz_tmp) 166 | xyz_clean = [] 167 | count_points = 0 168 | for cluster in np.unique(db.labels_): 169 | if cluster < 0: 170 | continue 171 | cluster_ind = np.where(db.labels_ == cluster)[0] 172 | if cluster_ind.shape[0] / xyz_tmp.shape[0] < 0.1 or cluster_ind.shape[0] <=100: 173 | continue 174 | xyz_clean.append(xyz_tmp[cluster_ind, :]) 175 | count_points += len(cluster_ind) 176 | if count_points > 0.5 * len(xyz_tmp): 177 | xyz_clean = np.concatenate(xyz_clean, axis=0) 178 | print("%d 
--> %d" % (len(xyz_tmp), len(xyz_clean))) 179 | break 180 | else: 181 | eps = 2 * eps 182 | print("try once more: eps = %f" % eps) 183 | # xyz_clean = xyz_tmp 184 | 185 | x_min = xyz_tmp[:,0].min() 186 | x_max = xyz_tmp[:,0].max() 187 | y_max = xyz_tmp[:,1].min() 188 | y_min = xyz_tmp[:,1].max() 189 | z_max = xyz_tmp[:,2].min() 190 | z_min = xyz_tmp[:,2].max() 191 | dx_orig = x_max-x_min 192 | dy_orig = y_max-y_min 193 | dz_orig = z_max-z_min 194 | 195 | x_min = xyz_clean[:,0].min() 196 | x_max = xyz_clean[:,0].max() 197 | y_max = xyz_clean[:,1].min() 198 | y_min = xyz_clean[:,1].max() 199 | z_max = xyz_clean[:,2].min() 200 | z_min = xyz_clean[:,2].max() 201 | dx = x_max-x_min 202 | dy = y_max-y_min 203 | dz = z_max-z_min 204 | # 8 corners 205 | bbox3d_pseudo = gen_8corners(x_min, y_min, z_min, dx, dy, dz) 206 | bbox3d_pseudo -= xyz_offset 207 | bbox = heading2rotmat(yaw) @ bbox3d_pseudo.T 208 | bbox = bbox.T + xyz_offset 209 | lineset = build_lineset(bbox, color=[0,0,1], flip=False) 210 | return bbox, lineset, (dx, dy, dz), yaw 211 | 212 | 213 | def run_seg_anything(model, im, bbox2D): 214 | model.set_image(im, image_format="BGR") 215 | bbox = np.array(bbox2D) # XYXY 216 | masks, _, _ = model.predict(box=bbox) 217 | return masks 218 | 219 | 220 | def run_one_2dbox_to_3d(depth_o3d, mask2d, rgb_o3d, K): 221 | 222 | rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth( 223 | color=rgb_o3d, 224 | depth=depth_o3d, 225 | depth_scale=1.0, 226 | depth_trunc=1000.0, 227 | convert_rgb_to_intensity=False 228 | ) 229 | # try: 230 | if True: 231 | print("start build pseudo bbox3d") 232 | bbox3d_pseudo, _, _, yaw = build_pseudo_bbox3d_from_mask2d_outlier( 233 | mask2d, rgbd_image.depth, K 234 | ) 235 | print("end build pseudo bbox3d") 236 | 237 | flip_matrix = np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]]) 238 | bbox3d_pseudo = bbox3d_pseudo.dot(flip_matrix) 239 | 240 | # center, dimension, then get the pose 241 | # such that conver from (center, dimension, pose) to 8 corners 242 | # aligning with the pseudo label 243 | cube_dims = torch.from_numpy(get_dims(bbox3d_pseudo)).unsqueeze(0) 244 | cube_3d = torch.from_numpy(np.mean(bbox3d_pseudo, axis=0)).unsqueeze(0) 245 | cube_pose = torch.eye(3).unsqueeze(0) 246 | bbox3d_infer = util.get_cuboid_verts_faces( 247 | torch.cat((cube_3d, cube_dims), dim=1), 248 | cube_pose, 249 | )[0] 250 | bbox3d_infer = bbox3d_infer.squeeze().numpy() 251 | 252 | cube_pose_new = get_pose(bbox3d_pseudo, bbox3d_infer) 253 | bbox3d_infer2 = util.get_cuboid_verts_faces( 254 | torch.cat((cube_3d, cube_dims), dim=1), 255 | cube_pose_new, 256 | )[0] 257 | bbox3d_infer2 = bbox3d_infer2.squeeze().numpy() 258 | return cube_3d.tolist(), cube_dims.tolist(), cube_pose_new.tolist(), bbox3d_infer2.tolist() 259 | 260 | 261 | dataset_list = { 262 | 'KITTI_test_novel': './datasets/Omni3D/gdino_kitti_novel_oracle_2d.json', 263 | 'ARKitScenes_test_novel': './datasets/Omni3D/gdino_arkitscenes_novel_oracle_2d.json', 264 | 'SUNRGBD_test_novel': './datasets/Omni3D/gdino_sunrgbd_novel_oracle_2d.json',} 265 | 266 | # Load model and preprocessing transform 267 | depthpro_model, depthpro_transform = depth_pro.create_model_and_transforms(device=torch.device("cuda"),precision=torch.float16) 268 | depthpro_model.eval() 269 | 270 | ckpt = "./checkpoints/sam_vit_h_4b8939.pth" 271 | sam = sam_model_registry["default"](checkpoint=ckpt).to(device="cuda") 272 | seg_predictor = SamPredictor(sam) 273 | 274 | threshold = 0.30 275 | 276 | for dataset_name, dataset_pth in dataset_list.items(): 277 | with 
open(dataset_pth, 'r') as f: 278 | dataset = json.load(f) 279 | root = "./datasets/" 280 | with open(os.path.join(root, "Omni3D", f"{dataset_name}.json"), "r") as file: 281 | gt_anns = json.load(file) 282 | imgid2path = {} 283 | for img in gt_anns["images"]: 284 | imgid2path[img['id']] = img['file_path'] 285 | new_dataset = [] 286 | for img in tqdm.tqdm(dataset): 287 | im_path = os.path.join(root, imgid2path[img['image_id']]) 288 | 289 | # Load and preprocess an image. 290 | image, _, f_px = depth_pro.load_rgb(im_path) 291 | image = depthpro_transform(image) 292 | 293 | # Run inference. 294 | prediction = depthpro_model.infer(image, f_px=f_px) 295 | depth = prediction["depth"] # Depth in [m]. 296 | 297 | depth_numpy = depth.cpu().numpy().astype(np.float32) 298 | 299 | depth_o3d = o3d.geometry.Image(depth_numpy) 300 | new_instances = [] 301 | rgb = cv2.imread(im_path) 302 | rgb_o3d = o3d.io.read_image(im_path) 303 | K = np.array(img['K']) 304 | for ins in img["instances"]: 305 | if ins['score'] < threshold: 306 | continue 307 | bbox2D = xywh_to_xyxy(ins["bbox"]) 308 | mask2D = run_seg_anything(seg_predictor, rgb, bbox2D) 309 | mask2d = mask2D[2, :, :] # largest mask 310 | cube_3d, cube_dims, cube_pose_new, bbox3d_infer2 = run_one_2dbox_to_3d(depth_o3d, mask2d, rgb_o3d, K) 311 | 312 | new_instance = {key: value for key, value in ins.items() if key in ['category_id', 'bbox', 'score', 'category_name']} 313 | new_instance["image_id"] = img['image_id'] 314 | new_instance["bbox3D"] = bbox3d_infer2 315 | new_instance["depth"] = cube_3d[0][-1] 316 | 317 | new_instance["center_cam"] = cube_3d[0] 318 | new_instance["dimensions"] = cube_dims[0] 319 | new_instance["pose"] = cube_pose_new 320 | x, y = project_3d_to_2d(cube_3d[0][0], cube_3d[0][1], cube_3d[0][2], K) 321 | new_instance["center_2D"] = [x, y] 322 | new_instances.append(new_instance) 323 | 324 | new_img = {key: value for key, value in img.items()} 325 | new_img["instances"] = new_instances 326 | new_dataset.append(new_img) 327 | # Create output directory if it doesn't exist 328 | output_dir = "./output/ovmono3d_geo" 329 | os.makedirs(output_dir, exist_ok=True) 330 | 331 | torch.save(new_dataset, f"{output_dir}/{dataset_name}.pth") 332 | --------------------------------------------------------------------------------
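The pieces above are wired together by relative paths from the repository root (./datasets, ./checkpoints, ./output), so the OVMono3D-GEO pipeline can be reproduced with a handful of commands. The sketch below is illustrative rather than prescriptive: the config/weights pairing passed to demo/demo.py (OVMono3D_dinov2_SFP.yaml with checkpoints/ovmono3d_lift.pth) is an assumption, and any config under configs/ with matching weights can be substituted.

# dependencies (incl. the modified detectron2) plus GroundingDINO / Depth Pro / SAM / OVMono3D checkpoints
bash setup.sh
# annotations and images (OVMono3D data, Omni3D jsons, ARKitScenes and Objectron images)
bash download_data.sh
bash datasets/Omni3D/download_omni3d_json.sh
bash datasets/ARKitScenes/download_arkitscenes_images.sh
bash datasets/objectron/download_objectron_images.sh
# OVMono3D-GEO: lift GroundingDINO 2D boxes to 3D with Depth Pro depth and SAM masks,
# writing ./output/ovmono3d_geo/<dataset>.pth for each novel-category test split
python tools/ovmono3d_geo.py
# evaluate the saved predictions with the Omni3D evaluation helper
python tools/eval_ovmono3d_geo.py
# qualitative demo on the bundled COCO examples (config and weights here are assumed)
python demo/demo.py --config-file configs/OVMono3D_dinov2_SFP.yaml \
    --input-folder datasets/coco_examples --labels-file datasets/coco_examples/labels.json \
    --threshold 0.25 MODEL.WEIGHTS checkpoints/ovmono3d_lift.pth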