├── .github └── coco.png ├── .gitignore ├── LICENSE ├── README.md ├── configs ├── Base.yaml ├── GroundingDINO_SwinB_cfg.py ├── OVMono3D_clip_SFP.yaml ├── OVMono3D_dinov2_SFP.yaml ├── OVMono3D_mae_SFP.yaml ├── OVMono3D_midas_SFP.yaml ├── OVMono3D_sam_SFP.yaml ├── category_meta.json └── category_meta50.json ├── cubercnn ├── config │ ├── __init__.py │ └── config.py ├── data │ ├── __init__.py │ ├── build.py │ ├── builtin.py │ ├── dataset_mapper.py │ └── datasets.py ├── evaluation │ ├── __init__.py │ └── omni3d_evaluation.py ├── modeling │ ├── backbone │ │ ├── __init__.py │ │ ├── clip.py │ │ ├── densenet.py │ │ ├── dino.py │ │ ├── dla.py │ │ ├── mae.py │ │ ├── midas_final.py │ │ ├── mnasnet.py │ │ ├── resnet.py │ │ ├── sam.py │ │ └── shufflenet.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── rcnn3d.py │ ├── proposal_generator │ │ ├── __init__.py │ │ └── rpn.py │ └── roi_heads │ │ ├── __init__.py │ │ ├── cube_head.py │ │ ├── fast_rcnn.py │ │ ├── roi_heads.py │ │ └── roi_heads_gdino.py ├── solver │ ├── __init__.py │ ├── build.py │ └── checkpoint.py ├── util │ ├── __init__.py │ ├── math_util.py │ ├── model_zoo.py │ └── util.py └── vis │ ├── __init__.py │ ├── logperf.py │ └── vis.py ├── datasets ├── ARKitScenes │ └── download_arkitscenes_images.sh ├── Omni3D │ └── download_omni3d_json.sh ├── coco_examples │ ├── 000000044260.jpg │ ├── 000000088432.jpg │ ├── 000000101762.jpg │ ├── 000000120584.jpg │ ├── 000000128148.jpg │ ├── 000000162543.jpg │ ├── 000000164115.jpg │ ├── 000000311950.jpg │ ├── 000000429011.jpg │ └── labels.json └── objectron │ └── download_objectron_images.sh ├── demo └── demo.py ├── download_data.sh ├── setup.sh └── tools ├── __init__.py ├── eval_ovmono3d_geo.py ├── ovmono3d_geo.py └── train_net.py /.github/coco.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/.github/coco.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # folders or files 2 | datasets/* 3 | cubercnn/modeling/backbone/checkpoint_weights/* 4 | .vscode/ 5 | .ipynb_checkpoints/ 6 | .idea/ 7 | output/ 8 | cubercnn/external/ 9 | slurm/ 10 | datasets 11 | unused/ 12 | checkpoints/ 13 | ovmono3d_data/ 14 | ovmono3d 15 | # filetypes 16 | *.pyc 17 | *.mexa64 18 | */output/* 19 | */output*/* 20 | *~ 21 | *.so 22 | *.ipynb 23 | *.pth 24 | *.zip -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | # Open Vocabulary Monocular 3D Object Detection 6 | 7 | [Jin Yao][jy], [Hao Gu][hg], [Xuweiyi Chen][xc], [Jiayun Wang][jw], [Zezhou Cheng][zc] 8 | 9 | 10 | [![Website](https://img.shields.io/badge/Project-Page-b361ff 11 | )](https://uva-computer-vision-lab.github.io/ovmono3d/) 12 | [![Paper](https://img.shields.io/badge/arXiv-PDF-b31b1b)](https://arxiv.org/pdf/2411.16833) 13 | 14 | 15 |
16 | <!-- Figure: "Zero-shot predictions on COCO" (COCO demo image) --> 32 |
33 | 34 | 35 | ## Installation 36 | We use CUDA version 12.1.1. 37 | Run 38 | ```bash 39 | conda create -n ovmono3d python=3.8.20 40 | conda activate ovmono3d 41 | 42 | pip install torch==2.4.1 torchvision==0.19.1 --index-url https://download.pytorch.org/whl/cu121 43 | ``` 44 | to create the environment and install PyTorch. 45 | 46 | Run 47 | ```bash 48 | sh setup.sh 49 | ``` 50 | to install the additional dependencies and download the model checkpoints for OVMono3D-LIFT and the other foundation models. 51 | 52 | ## Demo 53 | Run 54 | ```bash 55 | python demo/demo.py --config-file configs/OVMono3D_dinov2_SFP.yaml \ 56 | --input-folder datasets/coco_examples \ 57 | --labels-file datasets/coco_examples/labels.json \ 58 | --threshold 0.45 \ 59 | MODEL.ROI_HEADS.NAME ROIHeads3DGDINO \ 60 | MODEL.WEIGHTS checkpoints/ovmono3d_lift.pth \ 61 | OUTPUT_DIR output/coco_examples 62 | ``` 63 | to get the results for the example COCO images. 64 | 65 | You can also try your own images and prompted category labels. See the format of the label file in [`labels.json`](datasets/coco_examples/labels.json). If you know the camera intrinsics, you can pass them as arguments via `--focal-length ` and `--principal-point `. Check [`demo.py`](demo/demo.py) for more details. 66 | 67 | 68 | ## Data 69 | Please follow the instructions in [Omni3D](https://github.com/facebookresearch/omni3d/blob/main/DATA.md) to set up the datasets. 70 | Run 71 | ```bash 72 | sh ./download_data.sh 73 | ``` 74 | to download our pre-processed OVMono3D 2D predictions (12 GB after unzipping). 75 | 76 | 77 | ## Evaluation 78 | 79 | 80 | To run inference and evaluation of OVMono3D-LIFT, use the following command: 81 | ```bash 82 | python tools/train_net.py --eval-only --config-file configs/OVMono3D_dinov2_SFP.yaml --num-gpus 2 \ 83 | OUTPUT_DIR output/ovmono3d_lift \ 84 | MODEL.WEIGHTS checkpoints/ovmono3d_lift.pth \ 85 | TEST.CAT_MODE "novel" \ 86 | DATASETS.ORACLE2D_FILES.EVAL_MODE "target_aware" 87 | ``` 88 | `TEST.CAT_MODE` denotes the category set to be evaluated: `novel`, `base`, or `all`. 89 | 90 | `DATASETS.ORACLE2D_FILES.EVAL_MODE` denotes the evaluation protocol: `target_aware` or `previous_metric`. 91 | 92 | To run inference and evaluation of OVMono3D-GEO, use the following commands: 93 | ```bash 94 | python tools/ovmono3d_geo.py 95 | python tools/eval_ovmono3d_geo.py 96 | ``` 97 | 98 | 99 | ## Training 100 | 101 | To run training of OVMono3D-LIFT, use the following command: 102 | ```bash 103 | python tools/train_net.py --config-file configs/OVMono3D_dinov2_SFP.yaml --num-gpus 8 \ 104 | OUTPUT_DIR output/ovmono3d_lift \ 105 | VIS_PERIOD 500 TEST.EVAL_PERIOD 2000 \ 106 | MODEL.STABILIZE 0.03 \ 107 | SOLVER.BASE_LR 0.012 \ 108 | SOLVER.CHECKPOINT_PERIOD 1000 \ 109 | SOLVER.IMS_PER_BATCH 64 110 | ``` 111 | 112 | The hyperparameters above are the ones used in our experiments. You can customize them to suit your requirements, but note that performance may vary across configurations.
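The `KEY VALUE` pairs passed after the flags in the commands above are plain detectron2 config overrides; every key is declared either in detectron2's defaults or in [`cubercnn/config/config.py`](cubercnn/config/config.py). As an illustration only (not a script shipped with this repo), the snippet below sketches how those overrides could be applied programmatically, assuming detectron2 is installed and you run it from the repo root so that `cubercnn` is importable:

```python
# Sketch only: compose the OVMono3D config the same way the commands above do,
# i.e. detectron2 defaults, then the project defaults, then YAML, then overrides.
from detectron2.config import get_cfg
from cubercnn.config import get_cfg_defaults

cfg = get_cfg()                                          # detectron2 defaults
get_cfg_defaults(cfg)                                    # OVMono3D / Cube R-CNN additions (cubercnn/config/config.py)
cfg.merge_from_file("configs/OVMono3D_dinov2_SFP.yaml")  # resolves _BASE_: Base.yaml as well
# the same KEY VALUE pairs shown in the evaluation command above:
cfg.merge_from_list([
    "TEST.CAT_MODE", "novel",
    "DATASETS.ORACLE2D_FILES.EVAL_MODE", "target_aware",
    "MODEL.WEIGHTS", "checkpoints/ovmono3d_lift.pth",
])
print(cfg.DATASETS.CATEGORY_NAMES_NOVEL)                 # category set used when TEST.CAT_MODE is "novel"
```

This follows the standard detectron2 pattern of defaults, then YAML, then command-line overrides, which is why the same `TEST.CAT_MODE` and `DATASETS.ORACLE2D_FILES.EVAL_MODE` strings work for both the evaluation and training runs above.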
113 | 114 | 115 | ## Citing 116 | If you find this work useful for your research, please kindly cite: 117 | 118 | ```BibTeX 119 | @article{yao2024open, 120 | title={Open Vocabulary Monocular 3D Object Detection}, 121 | author={Yao, Jin and Gu, Hao and Chen, Xuweiyi and Wang, Jiayun and Cheng, Zezhou}, 122 | journal={arXiv preprint arXiv:2411.16833}, 123 | year={2024} 124 | } 125 | ``` 126 | Please also consider citing the awesome work of [Omni3D](https://github.com/facebookresearch/omni3d) and the datasets used in Omni3D. 127 |
BibTex 128 | 129 | ```BibTeX 130 | @inproceedings{brazil2023omni3d, 131 | author = {Garrick Brazil and Abhinav Kumar and Julian Straub and Nikhila Ravi and Justin Johnson and Georgia Gkioxari}, 132 | title = {{Omni3D}: A Large Benchmark and Model for {3D} Object Detection in the Wild}, 133 | booktitle = {CVPR}, 134 | address = {Vancouver, Canada}, 135 | month = {June}, 136 | year = {2023}, 137 | organization = {IEEE}, 138 | } 139 | ``` 140 | 141 | ```BibTex 142 | @inproceedings{Geiger2012CVPR, 143 | author = {Andreas Geiger and Philip Lenz and Raquel Urtasun}, 144 | title = {Are we ready for Autonomous Driving? The KITTI Vision Benchmark Suite}, 145 | booktitle = {CVPR}, 146 | year = {2012} 147 | } 148 | ``` 149 | 150 | ```BibTex 151 | @inproceedings{caesar2020nuscenes, 152 | title={nuscenes: A multimodal dataset for autonomous driving}, 153 | author={Caesar, Holger and Bankiti, Varun and Lang, Alex H and Vora, Sourabh and Liong, Venice Erin and Xu, Qiang and Krishnan, Anush and Pan, Yu and Baldan, Giancarlo and Beijbom, Oscar}, 154 | booktitle={CVPR}, 155 | year={2020} 156 | } 157 | ``` 158 | 159 | ```BibTex 160 | @inproceedings{song2015sun, 161 | title={Sun rgb-d: A rgb-d scene understanding benchmark suite}, 162 | author={Song, Shuran and Lichtenberg, Samuel P and Xiao, Jianxiong}, 163 | booktitle={CVPR}, 164 | year={2015} 165 | } 166 | ``` 167 | 168 | ```BibTex 169 | @inproceedings{dehghan2021arkitscenes, 170 | title={{ARK}itScenes - A Diverse Real-World Dataset for 3D Indoor Scene Understanding Using Mobile {RGB}-D Data}, 171 | author={Gilad Baruch and Zhuoyuan Chen and Afshin Dehghan and Tal Dimry and Yuri Feigin and Peter Fu and Thomas Gebauer and Brandon Joffe and Daniel Kurz and Arik Schwartz and Elad Shulman}, 172 | booktitle={NeurIPS Datasets and Benchmarks Track (Round 1)}, 173 | year={2021}, 174 | } 175 | ``` 176 | 177 | ```BibTex 178 | @inproceedings{hypersim, 179 | author = {Mike Roberts AND Jason Ramapuram AND Anurag Ranjan AND Atulit Kumar AND 180 | Miguel Angel Bautista AND Nathan Paczan AND Russ Webb AND Joshua M. Susskind}, 181 | title = {{Hypersim}: {A} Photorealistic Synthetic Dataset for Holistic Indoor Scene Understanding}, 182 | booktitle = {ICCV}, 183 | year = {2021}, 184 | } 185 | ``` 186 | 187 | ```BibTex 188 | @article{objectron2021, 189 | title={Objectron: A Large Scale Dataset of Object-Centric Videos in the Wild with Pose Annotations}, 190 | author={Ahmadyan, Adel and Zhang, Liangkai and Ablavatski, Artsiom and Wei, Jianing and Grundmann, Matthias}, 191 | journal={CVPR}, 192 | year={2021}, 193 | } 194 | ``` 195 | 196 |
197 | 198 | 199 | [jy]: https://yaojin17.github.io 200 | [hg]: https://www.linkedin.com/in/hao--gu/ 201 | [xc]: https://xuweiyichen.github.io/ 202 | [jw]: https://pwang.pw/ 203 | [zc]: https://sites.google.com/site/zezhoucheng/ 204 | 205 | -------------------------------------------------------------------------------- /configs/Base.yaml: -------------------------------------------------------------------------------- 1 | SOLVER: 2 | TYPE: "sgd" 3 | IMS_PER_BATCH: 32 4 | BASE_LR: 0.02 5 | STEPS: (19200, 25600) 6 | MAX_ITER: 32000 7 | WEIGHT_DECAY: 0.0001 8 | LR_SCHEDULER_NAME: "WarmupMultiStepLR" 9 | INPUT: 10 | MIN_SIZE_TRAIN: (256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640,) 11 | MIN_SIZE_TEST: 512 12 | MAX_SIZE_TRAIN: 4096 13 | MAX_SIZE_TEST: 4096 14 | TEST: 15 | VISIBILITY_THRES: 0.33333333 16 | TRUNCATION_THRES: 0.33333333 17 | EVAL_PERIOD: 16000 18 | DATASETS: 19 | TRAIN: ('KITTI_train', 'KITTI_val') 20 | TEST: ('KITTI_test',) 21 | CATEGORY_NAMES: ('pedestrian', 'car', 'cyclist', 'van', 'truck', 'tram', 'person') 22 | IGNORE_NAMES: "['dontcare', 'ignore', 'void']" 23 | MIN_HEIGHT_THRES: 0.05 24 | TRUNCATION_THRES: 0.75 25 | VISIBILITY_THRES: 0.25 26 | TRUNC_2D_BOXES: True 27 | VIS_PERIOD: 640 28 | DATALOADER: 29 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 30 | REPEAT_THRESHOLD: 0.1 31 | MODEL: 32 | PIXEL_MEAN: [103.530, 116.280, 123.675] 33 | PIXEL_STD: [57.375, 57.120, 58.395] 34 | META_ARCHITECTURE: "RCNN3D" 35 | MASK_ON: False 36 | STABILIZE: 0.02 37 | USE_BN: True 38 | BACKBONE: 39 | FREEZE_AT: 0 40 | NAME: 'build_dla_from_vision_fpn_backbone' 41 | DLA: 42 | TYPE: 'dla34' 43 | FPN: 44 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6'] 45 | ANCHOR_GENERATOR: 46 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 47 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 48 | RPN: 49 | HEAD_NAME: "StandardRPNHead" 50 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6'] 51 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 52 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 53 | POST_NMS_TOPK_TRAIN: 1000 54 | POST_NMS_TOPK_TEST: 1000 55 | BOUNDARY_THRESH: -1 56 | OBJECTNESS_UNCERTAINTY: "IoUness" 57 | IOU_THRESHOLDS: [0.05, 0.05] 58 | POSITIVE_FRACTION: 1.0 59 | PROPOSAL_GENERATOR: 60 | NAME: "RPNWithIgnore" 61 | ROI_HEADS: 62 | NAME: "ROIHeads3D" 63 | IN_FEATURES: ["p2", "p3", "p4", "p5", 'p6'] 64 | BATCH_SIZE_PER_IMAGE: 512 65 | SCORE_THRESH_TEST: 0.01 66 | NUM_CLASSES: 43 67 | ROI_BOX_HEAD: 68 | NAME: "FastRCNNConvFCHead" 69 | NUM_FC: 2 70 | POOLER_RESOLUTION: 7 71 | ROI_CUBE_HEAD: 72 | NAME: 'CubeHead' 73 | Z_TYPE: 'direct' 74 | POSE_TYPE: '6d' 75 | NUM_FC: 2 76 | SHARED_FC: True 77 | USE_CONFIDENCE: 1.0 78 | LOSS_W_3D: 1.0 79 | POOLER_TYPE: 'ROIAlignV2' 80 | POOLER_RESOLUTION: 7 81 | DIMS_PRIORS_ENABLED: False 82 | DISENTANGLED_LOSS: True 83 | ALLOCENTRIC_POSE: True 84 | VIRTUAL_FOCAL: 512.0 85 | VIRTUAL_DEPTH: True 86 | CHAMFER_POSE: True 87 | VERSION: 2 -------------------------------------------------------------------------------- /configs/GroundingDINO_SwinB_cfg.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_B_384_22k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 
2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = "bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 | use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /configs/OVMono3D_clip_SFP.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | INPUT: 10 | MIN_SIZE_TRAIN: (320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024,) 11 | MIN_SIZE_TEST: 608 12 | MAX_SIZE_TRAIN: 1024 13 | MAX_SIZE_TEST: 1024 14 | FORMAT: "RGB" 15 | TEST: 16 | EVAL_PERIOD: 29000 17 | VIS_PERIOD: 2320 18 | DATASETS: 19 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 20 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 21 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 22 | MODEL: 23 | PIXEL_MEAN: [123.675, 116.280, 103.530] 24 | PIXEL_STD: [58.395, 57.120, 57.375] # changed to rgb order 25 | BACKBONE: 26 | FREEZE_AT: 0 27 | NAME: 'build_clip_backbone' 28 | CLIP: 29 | ARCH: 'ViT-B-16' 30 | CHECKPOINT: 'openai' 31 | OUTPUT: 'dense' 32 | LAYER: -1 33 | RETURN_MULTILAYER: False 34 | FPN: 35 | IN_FEATURE: 'last_feat' 36 | NORM: 'LN' 37 | SQUARE_PAD: 1024 38 | ANCHOR_GENERATOR: 39 | SIZES: [[64], [128], [256], [512]] 40 | RPN: 41 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] 42 | ROI_HEADS: 43 | NUM_CLASSES: 50 44 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] -------------------------------------------------------------------------------- /configs/OVMono3D_dinov2_SFP.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | INPUT: 10 | MIN_SIZE_TRAIN: (280, 308, 336, 364, 392, 420, 448, 476, 504, 532, 560, 588, 616, 644, 672, 700, 728, 756, 784, 812, 840, 868, 896, 924, 952, 980, 1008,) 11 
| MIN_SIZE_TEST: 532 12 | MAX_SIZE_TRAIN: 896 13 | MAX_SIZE_TEST: 896 14 | FORMAT: "RGB" 15 | TEST: 16 | EVAL_PERIOD: 29000 17 | VIS_PERIOD: 2320 18 | DATASETS: 19 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 20 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 21 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 22 | MODEL: 23 | PIXEL_MEAN: [123.675, 116.280, 103.530] 24 | PIXEL_STD: [58.395, 57.120, 57.375] # changed to rgb order 25 | BACKBONE: 26 | FREEZE_AT: 0 27 | NAME: 'build_dino_backbone' 28 | DINO: 29 | NAME: 'dinov2' 30 | MODEL_NAME: 'vitb14' 31 | OUTPUT: 'dense' 32 | LAYER: -1 33 | RETURN_MULTILAYER: False 34 | FPN: 35 | IN_FEATURE: 'last_feat' 36 | NORM: 'LN' 37 | SQUARE_PAD: 896 38 | ANCHOR_GENERATOR: 39 | SIZES: [[64], [256], [512]] 40 | RPN: 41 | IN_FEATURES: ['p2', 'p3', 'p4'] 42 | ROI_HEADS: 43 | NUM_CLASSES: 50 44 | IN_FEATURES: ['p2', 'p3', 'p4'] -------------------------------------------------------------------------------- /configs/OVMono3D_mae_SFP.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | INPUT: 10 | MIN_SIZE_TRAIN: (320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024,) 11 | MIN_SIZE_TEST: 608 12 | MAX_SIZE_TRAIN: 1024 13 | MAX_SIZE_TEST: 1024 14 | FORMAT: "RGB" 15 | TEST: 16 | EVAL_PERIOD: 29000 17 | VIS_PERIOD: 2320 18 | DATASETS: 19 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 20 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 21 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 22 | MODEL: 23 | PIXEL_MEAN: [123.675, 116.280, 103.530] 24 | PIXEL_STD: [58.395, 57.120, 57.375] # changed to rgb order 25 | BACKBONE: 26 | FREEZE_AT: 0 27 | NAME: 'build_mae_backbone' 28 | MAE: 29 | CHECKPOINT: 'facebook/vit-mae-base' 30 | OUTPUT: 'dense' 31 | LAYER: -1 32 | RETURN_MULTILAYER: False 33 | FPN: 34 | IN_FEATURE: 'last_feat' 35 | NORM: 'LN' 36 | SQUARE_PAD: 1024 37 | ANCHOR_GENERATOR: 38 | SIZES: [[64], 
[128], [256], [512]] 39 | RPN: 40 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] 41 | ROI_HEADS: 42 | NUM_CLASSES: 50 43 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] -------------------------------------------------------------------------------- /configs/OVMono3D_midas_SFP.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | INPUT: 10 | MIN_SIZE_TRAIN: (320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024,) 11 | MIN_SIZE_TEST: 608 12 | MAX_SIZE_TRAIN: 1024 13 | MAX_SIZE_TEST: 1024 14 | FORMAT: "RGB" 15 | TEST: 16 | EVAL_PERIOD: 29000 17 | VIS_PERIOD: 2320 18 | DATASETS: 19 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 20 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 21 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 22 | MODEL: 23 | PIXEL_MEAN: [123.675, 116.280, 103.530] 24 | PIXEL_STD: [58.395, 57.120, 57.375] # changed to rgb order 25 | BACKBONE: 26 | FREEZE_AT: 0 27 | NAME: 'build_midas_backbone' 28 | MIDAS: 29 | OUTPUT: 'dense' 30 | LAYER: -1 31 | RETURN_MULTILAYER: False 32 | FPN: 33 | IN_FEATURE: 'last_feat' 34 | NORM: 'LN' 35 | SQUARE_PAD: 1024 36 | ANCHOR_GENERATOR: 37 | SIZES: [[64], [128], [256], [512]] 38 | RPN: 39 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] 40 | ROI_HEADS: 41 | NUM_CLASSES: 50 42 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] -------------------------------------------------------------------------------- /configs/OVMono3D_sam_SFP.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | SOLVER: 3 | TYPE: "sgd" 4 | IMS_PER_BATCH: 192 5 | BASE_LR: 0.12 6 | STEPS: (69600, 92800) 7 | MAX_ITER: 116000 8 | WARMUP_ITERS: 3625 9 | INPUT: 10 | MIN_SIZE_TRAIN: (320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024,) 11 | MIN_SIZE_TEST: 608 12 | MAX_SIZE_TRAIN: 1024 13 | MAX_SIZE_TEST: 1024 14 | FORMAT: "RGB" 15 | TEST: 16 | EVAL_PERIOD: 29000 17 | VIS_PERIOD: 2320 18 | DATASETS: 19 | TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') 20 | TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 21 | CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 
'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 22 | MODEL: 23 | PIXEL_MEAN: [123.675, 116.280, 103.530] 24 | PIXEL_STD: [58.395, 57.120, 57.375] # changed to rgb order 25 | BACKBONE: 26 | FREEZE_AT: 0 27 | NAME: 'build_sam_backbone' 28 | MIDAS: 29 | OUTPUT: 'dense' 30 | LAYER: -1 31 | RETURN_MULTILAYER: False 32 | FPN: 33 | IN_FEATURE: 'last_feat' 34 | NORM: 'LN' 35 | SQUARE_PAD: 1024 36 | ANCHOR_GENERATOR: 37 | SIZES: [[64], [128], [256], [512]] 38 | RPN: 39 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] 40 | ROI_HEADS: 41 | NUM_CLASSES: 50 42 | IN_FEATURES: ['p2', 'p3', 'p4', 'p5'] -------------------------------------------------------------------------------- /configs/category_meta.json: -------------------------------------------------------------------------------- 1 | {"_comment": "97 classes", "thing_classes": ["pedestrian", "car", "dontcare", "cyclist", "van", "truck", "tram", "person", "traffic cone", "barrier", "motorcycle", "bicycle", "bus", "trailer", "books", "bottle", "camera", "cereal box", "chair", "cup", "laptop", "shoes", "towel", "blinds", "window", "lamp", "shelves", "mirror", "sink", "cabinet", "bathtub", "door", "toilet", "desk", "box", "bookcase", "picture", "table", "counter", "bed", "night stand", "dresser", "pillow", "sofa", "television", "floor mat", "curtain", "clothes", "stationery", "refrigerator", "board", "kitchen pan", "bin", "stove", "microwave", "plates", "bowl", "oven", "vase", "faucet", "tissues", "machine", "printer", "monitor", "podium", "cart", "projector", "electronics", "computer", "air conditioner", "drawers", "coffee maker", "toaster", "potted plant", "painting", "bag", "tray", "keyboard", "blanket", "rack", "phone", "mouse", "fire extinguisher", "toys", "ladder", "fan", "glass", "clock", "toilet paper", "closet", "fume hood", "utensils", "soundsystem", "shower curtain", "remote", "pen", "fireplace"], "thing_dataset_id_to_contiguous_id": {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "12": 12, "13": 13, "14": 14, "15": 15, "16": 16, "17": 17, "18": 18, "19": 19, "20": 20, "21": 21, "22": 22, "23": 23, "24": 24, "25": 25, "26": 26, "27": 27, "28": 28, "29": 29, "30": 30, "31": 31, "32": 32, "33": 33, "34": 34, "35": 35, "36": 36, "37": 37, "38": 38, "39": 39, "40": 40, "41": 41, "42": 42, "43": 43, "44": 44, "45": 45, "46": 46, "47": 47, "48": 48, "49": 49, "50": 50, "51": 51, "52": 52, "53": 53, "54": 54, "55": 55, "56": 56, "57": 57, "58": 58, "59": 59, "60": 60, "61": 61, "62": 62, "63": 63, "64": 64, "65": 65, "66": 66, "67": 67, "68": 68, "69": 69, "70": 70, "71": 71, "72": 72, "73": 73, "74": 74, "75": 75, "76": 76, "77": 77, "78": 78, "79": 79, "80": 80, "81": 81, "82": 82, "83": 83, "84": 84, "85": 85, "86": 86, "87": 87, "88": 88, "89": 89, "90": 90, "91": 91, "92": 92, "94": 93, "95": 94, "96": 95, "97": 96}} -------------------------------------------------------------------------------- /configs/category_meta50.json: -------------------------------------------------------------------------------- 1 | {"_comment": "50 classes", "thing_classes": ["pedestrian", "car", "cyclist", "van", "truck", "traffic cone", "barrier", "motorcycle", "bicycle", "bus", "trailer", "books", "bottle", "camera", "cereal box", "chair", "cup", "laptop", "shoes", "towel", "blinds", "window", "lamp", "shelves", "mirror", "sink", 
"cabinet", "bathtub", "door", "toilet", "desk", "box", "bookcase", "picture", "table", "counter", "bed", "night stand", "pillow", "sofa", "television", "floor mat", "curtain", "clothes", "stationery", "refrigerator", "bin", "stove", "oven", "machine"], "thing_dataset_id_to_contiguous_id": {"0": 0, "1": 1, "3": 2, "4": 3, "5": 4, "8": 5, "9": 6, "10": 7, "11": 8, "12": 9, "13": 10, "14": 11, "15": 12, "16": 13, "17": 14, "18": 15, "19": 16, "20": 17, "21": 18, "22": 19, "23": 20, "24": 21, "25": 22, "26": 23, "27": 24, "28": 25, "29": 26, "30": 27, "31": 28, "32": 29, "33": 30, "34": 31, "35": 32, "36": 33, "37": 34, "38": 35, "39": 36, "40": 37, "42": 38, "43": 39, "44": 40, "45": 41, "46": 42, "47": 43, "48": 44, "49": 45, "52": 46, "53": 47, "57": 48, "61": 49}} -------------------------------------------------------------------------------- /cubercnn/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import * -------------------------------------------------------------------------------- /cubercnn/config/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.config import CfgNode as CN 3 | 4 | def get_cfg_defaults(cfg): 5 | 6 | # A list of category names which will be used 7 | cfg.DATASETS.CATEGORY_NAMES = [] 8 | 9 | # The category names which will be treated as ignore 10 | # e.g., not counting as background during training 11 | # or as false positives during evaluation. 12 | cfg.DATASETS.IGNORE_NAMES = [] 13 | 14 | # Should the datasets appear with the same probabilty 15 | # in batches (e.g., the imbalance from small and large 16 | # datasets will be accounted for during sampling) 17 | cfg.DATALOADER.BALANCE_DATASETS = False 18 | 19 | # The thresholds for when to treat a known box 20 | # as ignore based on too heavy of truncation or 21 | # too low of visibility in the image. This affects 22 | # both training and evaluation ignores. 23 | cfg.DATASETS.TRUNCATION_THRES = 0.99 24 | cfg.DATASETS.VISIBILITY_THRES = 0.01 25 | cfg.DATASETS.MIN_HEIGHT_THRES = 0.00 26 | cfg.DATASETS.MAX_DEPTH = 1e8 27 | 28 | # Whether modal 2D boxes should be loaded, 29 | # or if the full 3D projected boxes should be used. 30 | cfg.DATASETS.MODAL_2D_BOXES = False 31 | 32 | # Whether truncated 2D boxes should be loaded, 33 | # or if the 3D full projected boxes should be used. 
34 | cfg.DATASETS.TRUNC_2D_BOXES = True 35 | 36 | cfg.DATASETS.TEST_BASE = ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') 37 | cfg.DATASETS.TEST_NOVEL = ('SUNRGBD_test_novel','ARKitScenes_test_novel', 'KITTI_test_novel') 38 | cfg.DATASETS.CATEGORY_NAMES_BASE = ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') 39 | cfg.DATASETS.CATEGORY_NAMES_NOVEL = ('monitor', 'bag', 'dresser', 'board', 'printer', 'keyboard', 'painting', 'drawers', 'microwave', 'computer', 'kitchen pan', 'potted plant', 'tissues', 'rack', 'tray', 'toys', 'phone', 'podium', 'cart', 'soundsystem', 'fireplace', 'tram') 40 | 41 | # Oracle 2D files for evaluation 42 | cfg.DATASETS.ORACLE2D_FILES = CN() 43 | cfg.DATASETS.ORACLE2D_FILES.EVAL_MODE = 'target_aware' # 'target_aware' or 'previous_metric' 44 | 45 | # Create a configuration for each evaluation mode 46 | for mode in ['target_aware', 'previous_metric']: 47 | cfg.DATASETS.ORACLE2D_FILES[mode] = CN() 48 | cfg.DATASETS.ORACLE2D_FILES[mode].novel = CN() 49 | cfg.DATASETS.ORACLE2D_FILES[mode].base = CN() 50 | 51 | # Oracle 2D file for the Novel class dataset 52 | novel_datasets = { 53 | 'SUNRGBD_test_novel': 'sunrgbd', 54 | 'ARKitScenes_test_novel': 'arkitscenes', 55 | 'KITTI_test_novel': 'kitti' 56 | } 57 | 58 | # Oracle 2D file for the Base class dataset 59 | base_datasets = { 60 | 'SUNRGBD_test': 'sunrgbd', 61 | 'Hypersim_test': 'hypersim', 62 | 'ARKitScenes_test': 'arkitscenes', 63 | 'Objectron_test': 'objectron', 64 | 'KITTI_test': 'kitti', 65 | 'nuScenes_test': 'nuscenes' 66 | } 67 | 68 | # Set the file path for the novel class 69 | for dataset, dataset_name in novel_datasets.items(): 70 | prefix = 'gdino_novel_previous_metric' if mode == 'previous_metric' else 'gdino' 71 | cfg.DATASETS.ORACLE2D_FILES[mode].novel[dataset] = f'datasets/Omni3D/{prefix}_{dataset_name}_novel_oracle_2d.json' 72 | 73 | # Set the file path for the base class 74 | for dataset, dataset_name in base_datasets.items(): 75 | prefix = 'gdino_previous_eval' if mode == 'previous_metric' else 'gdino' 76 | cfg.DATASETS.ORACLE2D_FILES[mode].base[dataset] = f'datasets/Omni3D/{prefix}_{dataset_name}_base_oracle_2d.json' 77 | 78 | cfg.MODEL.FPN.IN_FEATURE = None 79 | cfg.MODEL.FPN.SQUARE_PAD = 0 80 | # Threshold used for matching and filtering boxes 81 | # inside of ignore regions, within the RPN and ROIHeads 82 | cfg.MODEL.RPN.IGNORE_THRESHOLD = 0.5 83 | 84 | cfg.MODEL.DINO = CN() 85 | cfg.MODEL.DINO.NAME = 'dinov2' 86 | cfg.MODEL.DINO.MODEL_NAME = 'vitb14' 87 | cfg.MODEL.DINO.OUTPUT = 'dense' 88 | cfg.MODEL.DINO.LAYER = -1 89 | cfg.MODEL.DINO.RETURN_MULTILAYER = False 90 | 91 | cfg.MODEL.MAE = CN() 92 | cfg.MODEL.MAE.CHECKPOINT = 'facebook/vit-mae-base' 93 | cfg.MODEL.MAE.OUTPUT = 'dense' 94 | cfg.MODEL.MAE.LAYER = -1 95 | cfg.MODEL.MAE.RETURN_MULTILAYER = False 96 | 97 | cfg.MODEL.CLIP = CN() 98 | cfg.MODEL.CLIP.ARCH = 'ViT-B-16' 99 | cfg.MODEL.CLIP.CHECKPOINT = 'openai' 100 | cfg.MODEL.CLIP.OUTPUT = 'dense' 101 | cfg.MODEL.CLIP.LAYER = -1 102 | cfg.MODEL.CLIP.RETURN_MULTILAYER = False 103 | 
104 | cfg.MODEL.MIDAS = CN() 105 | cfg.MODEL.MIDAS.OUTPUT = 'dense' 106 | cfg.MODEL.MIDAS.LAYER = -1 107 | cfg.MODEL.MIDAS.RETURN_MULTILAYER = False 108 | 109 | cfg.MODEL.SAM = CN() 110 | cfg.MODEL.SAM.OUTPUT = 'dense' 111 | cfg.MODEL.SAM.LAYER = -1 112 | cfg.MODEL.SAM.RETURN_MULTILAYER = False 113 | 114 | # Configuration for cube head 115 | cfg.MODEL.ROI_CUBE_HEAD = CN() 116 | cfg.MODEL.ROI_CUBE_HEAD.NAME = "CubeHead" 117 | cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION = 7 118 | cfg.MODEL.ROI_CUBE_HEAD.POOLER_SAMPLING_RATIO = 0 119 | cfg.MODEL.ROI_CUBE_HEAD.POOLER_TYPE = "ROIAlignV2" 120 | 121 | # Settings for the cube head features 122 | cfg.MODEL.ROI_CUBE_HEAD.NUM_CONV = 0 123 | cfg.MODEL.ROI_CUBE_HEAD.CONV_DIM = 256 124 | cfg.MODEL.ROI_CUBE_HEAD.NUM_FC = 2 125 | cfg.MODEL.ROI_CUBE_HEAD.FC_DIM = 1024 126 | cfg.MODEL.ROI_CUBE_HEAD.USE_TRANSFORMER = False 127 | 128 | # the style to predict Z with currently supported 129 | # options --> ['direct', 'sigmoid', 'log', 'clusters'] 130 | cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE = "direct" 131 | 132 | # the style to predict pose with currently supported 133 | # options --> ['6d', 'euler', 'quaternion'] 134 | cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE = "6d" 135 | 136 | # Whether to scale all 3D losses by inverse depth 137 | cfg.MODEL.ROI_CUBE_HEAD.INVERSE_Z_WEIGHT = False 138 | 139 | # Virtual depth puts all predictions of depth into 140 | # a shared virtual space with a shared focal length. 141 | cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH = True 142 | cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL = 512.0 143 | 144 | # If true, then all losses are computed using the 8 corners 145 | # such that they are all in a shared scale space. 146 | # E.g., their scale correlates with their impact on 3D IoU. 147 | # This way no manual weights need to be set. 148 | cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS = True 149 | 150 | # When > 1, the outputs of the 3D head will be based on 151 | # a 2D scale clustering, based on 2D proposal height/width. 152 | # This parameter describes the number of bins to cluster. 153 | cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS = 1 154 | 155 | # Whether batch norm is enabled during training. 156 | # If false, all BN weights will be frozen. 157 | cfg.MODEL.USE_BN = True 158 | 159 | # Whether to predict the pose in allocentric space. 160 | # The allocentric space may correlate better with 2D 161 | # images compared to egocentric poses. 162 | cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE = True 163 | 164 | # Whether to use chamfer distance for disentangled losses 165 | # of pose. This avoids periodic issues of rotation but 166 | # may prevent the pose "direction" from being interpretable. 167 | cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE = True 168 | 169 | # Should the prediction heads share FC features or not. 170 | # These include groups of uv, z, whl, pose. 171 | cfg.MODEL.ROI_CUBE_HEAD.SHARED_FC = True 172 | 173 | # Check for stable gradients. When inf is detected, skip the update. 174 | # This prevents an occasional bad sample from exploding the model. 175 | # The threshold below is the allows percent of bad samples. 176 | # 0.0 is off, and 0.01 is recommended for minor robustness to exploding. 177 | cfg.MODEL.STABILIZE = 0.01 178 | 179 | # Whether or not to use the dimension priors 180 | cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED = True 181 | 182 | # How prior dimensions should be computed? 183 | # The supported modes are ["exp", "sigmoid"] 184 | # where exp is unbounded and sigmoid is bounded 185 | # between +- 3 standard deviations from the mean. 
186 | cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC = 'exp' 187 | 188 | # weight for confidence loss. 0 is off. 189 | cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE = 1.0 190 | 191 | # Loss weights for XY, Z, Dims, Pose 192 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D = 1.0 193 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_XY = 1.0 194 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z = 1.0 195 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS = 1.0 196 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE = 1.0 197 | 198 | cfg.MODEL.DLA = CN() 199 | 200 | # Supported types for DLA backbones are... 201 | # dla34, dla46_c, dla46x_c, dla60x_c, dla60, dla60x, dla102x, dla102x2, dla169 202 | cfg.MODEL.DLA.TYPE = 'dla34' 203 | 204 | # Only available for dla34, dla60, dla102 205 | cfg.MODEL.DLA.TRICKS = False 206 | 207 | # A joint loss for the disentangled loss. 208 | # All predictions are computed using a corner 209 | # or chamfers loss depending on chamfer_pose! 210 | # Recommended to keep this weight small: [0.05, 0.5] 211 | cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_JOINT = 1.0 212 | 213 | # sgd, adam, adam+amsgrad, adamw, adamw+amsgrad 214 | cfg.SOLVER.TYPE = 'sgd' 215 | 216 | cfg.MODEL.RESNETS.TORCHVISION = True 217 | cfg.TEST.DETECTIONS_PER_IMAGE = 100 218 | 219 | cfg.TEST.VISIBILITY_THRES = 1/2.0 220 | cfg.TEST.TRUNCATION_THRES = 1/2.0 221 | 222 | # If ORACLE2D is True, the oracle 2D bboxes and categories will be loaded during evaluation. 223 | cfg.TEST.ORACLE2D = True 224 | cfg.TEST.CAT_MODE = "base" # "base" or "novel" or "all" 225 | 226 | cfg.INPUT.RANDOM_FLIP = "horizontal" 227 | cfg.INPUT.TRAIN_SET_PERCENTAGE = 1.0 228 | # When True, we will use localization uncertainty 229 | # as the new IoUness score in the RPN. 230 | cfg.MODEL.RPN.OBJECTNESS_UNCERTAINTY = 'IoUness' 231 | 232 | # If > 0.0 this is the scaling factor that will be applied to 233 | # an RoI 2D box before doing any pooling to give more context. 234 | # Ex. 1.5 makes width and height 50% larger. 235 | cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES = 0.0 236 | 237 | # weight path specifically for pretraining (no checkpointables will be loaded) 238 | cfg.MODEL.WEIGHTS_PRETRAIN = '' -------------------------------------------------------------------------------- /cubercnn/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .dataset_mapper import * 3 | from .build import * 4 | from .builtin import * -------------------------------------------------------------------------------- /cubercnn/data/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc.
and affiliates 2 | import itertools 3 | import logging 4 | import numpy as np 5 | import math 6 | import json 7 | from collections import defaultdict 8 | import torch 9 | import torch.utils.data 10 | 11 | from detectron2.config import configurable 12 | from detectron2.utils.logger import _log_api_usage 13 | 14 | from detectron2.data.catalog import DatasetCatalog 15 | from detectron2.data.common import DatasetFromList, MapDataset 16 | from detectron2.data.dataset_mapper import DatasetMapper 17 | from detectron2.data.samplers import ( 18 | InferenceSampler, 19 | RepeatFactorTrainingSampler, 20 | TrainingSampler 21 | ) 22 | from detectron2.data.build import ( 23 | filter_images_with_only_crowd_annotations, 24 | build_batch_data_loader, 25 | trivial_batch_collator 26 | ) 27 | import random 28 | 29 | 30 | def sample_by_percentage(data_list, percentage, seed=None): 31 | if seed is not None: 32 | random.seed(seed) 33 | sample_size = int(len(data_list) * percentage) 34 | return random.sample(data_list, sample_size) 35 | 36 | def xywh_to_xyxy(bbox): 37 | x, y, w, h = bbox 38 | x_min = x 39 | y_min = y 40 | x_max = x + w 41 | y_max = y + h 42 | return [x_min, y_min, x_max, y_max] 43 | 44 | 45 | def merge_oracle2d_to_detection_dicts(dataset_dicts, oracle2d): 46 | for dataset, oracle in zip(dataset_dicts, oracle2d): 47 | with open(oracle, 'r') as file: 48 | oracle_data = json.load(file) 49 | for data_dict, oracle_dict in zip(dataset,oracle_data): 50 | assert data_dict['image_id'] == oracle_dict['image_id'] 51 | data_dict["oracle2D"] = {"gt_bbox2D": torch.tensor([xywh_to_xyxy(instance["bbox"]) for instance in oracle_dict["instances"]]), 52 | "gt_classes": torch.tensor([instance["category_id"] for instance in oracle_dict["instances"]]), 53 | "gt_scores": torch.tensor([instance["score"] for instance in oracle_dict["instances"]]), 54 | } 55 | 56 | 57 | def get_detection_dataset_dicts(names, filter_empty=True, oracle2d=None, **kwargs): 58 | 59 | if isinstance(names, str): 60 | names = [names] 61 | 62 | assert len(names), names 63 | dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names] 64 | if oracle2d: 65 | merge_oracle2d_to_detection_dicts(dataset_dicts, oracle2d) 66 | for dataset_name, dicts in zip(names, dataset_dicts): 67 | assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) 68 | 69 | dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) 70 | 71 | has_instances = "annotations" in dataset_dicts[0] 72 | 73 | if filter_empty and has_instances: 74 | dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts) 75 | 76 | assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names)) 77 | return dataset_dicts 78 | 79 | 80 | def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None, dataset_id_to_src=None): 81 | if dataset is None: 82 | dataset = get_detection_dataset_dicts( 83 | cfg.DATASETS.TRAIN, 84 | filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, 85 | min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE 86 | if cfg.MODEL.KEYPOINT_ON 87 | else 0, 88 | proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, 89 | ) 90 | _log_api_usage("dataset." 
+ cfg.DATASETS.TRAIN[0]) 91 | 92 | if cfg.INPUT.TRAIN_SET_PERCENTAGE != 1.0: 93 | dataset = sample_by_percentage(dataset, cfg.INPUT.TRAIN_SET_PERCENTAGE, seed=42) 94 | logger = logging.getLogger(__name__) 95 | logger.info("Using {} training images".format(len(dataset))) 96 | if mapper is None: 97 | mapper = DatasetMapper(cfg, True) 98 | 99 | if sampler is None: 100 | sampler_name = cfg.DATALOADER.SAMPLER_TRAIN 101 | balance_datasets = cfg.DATALOADER.BALANCE_DATASETS 102 | logger = logging.getLogger(__name__) 103 | logger.info("Using training sampler {}".format(sampler_name)) 104 | 105 | if balance_datasets: 106 | assert dataset_id_to_src is not None, 'Need dataset sources.' 107 | 108 | dataset_source_to_int = {val:i for i, val in enumerate(set(dataset_id_to_src.values()))} 109 | dataset_ids_per_img = [dataset_source_to_int[dataset_id_to_src[img['dataset_id']]] for img in dataset] 110 | dataset_ids = np.unique(dataset_ids_per_img) 111 | 112 | # only one source? don't re-weight then. 113 | if len(dataset_ids) == 1: 114 | weights_per_img = torch.ones(len(dataset_ids_per_img)).float() 115 | 116 | # compute per-dataset weights. 117 | else: 118 | counts = np.bincount(dataset_ids_per_img) 119 | counts = [counts[id] for id in dataset_ids] 120 | weights = [1 - count/np.sum(counts) for count in counts] 121 | weights = [weight/np.min(weights) for weight in weights] 122 | 123 | weights_per_img = torch.zeros(len(dataset_ids_per_img)).float() 124 | dataset_ids_per_img = torch.FloatTensor(dataset_ids_per_img).long() 125 | 126 | # copy weights 127 | for dataset_id, weight in zip(dataset_ids, weights): 128 | weights_per_img[dataset_ids_per_img == dataset_id] = weight 129 | 130 | # no special sampling whatsoever 131 | if sampler_name == "TrainingSampler" and not balance_datasets: 132 | sampler = TrainingSampler(len(dataset)) 133 | 134 | # balance the weight sampling by datasets 135 | elif sampler_name == "TrainingSampler" and balance_datasets: 136 | sampler = RepeatFactorTrainingSampler(weights_per_img) 137 | 138 | # balance the weight sampling by categories 139 | elif sampler_name == "RepeatFactorTrainingSampler" and not balance_datasets: 140 | repeat_factors = repeat_factors_from_category_frequency( 141 | dataset, cfg.DATALOADER.REPEAT_THRESHOLD 142 | ) 143 | sampler = RepeatFactorTrainingSampler(repeat_factors) 144 | 145 | # balance the weight sampling by categories AND by dataset frequency 146 | elif sampler_name == "RepeatFactorTrainingSampler" and balance_datasets: 147 | repeat_factors = repeat_factors_from_category_frequency( 148 | dataset, cfg.DATALOADER.REPEAT_THRESHOLD 149 | ) 150 | repeat_factors *= weights_per_img 151 | repeat_factors /= repeat_factors.min().item() 152 | sampler = RepeatFactorTrainingSampler(repeat_factors) 153 | else: 154 | raise ValueError("Unknown training sampler: {}".format(sampler_name)) 155 | 156 | return { 157 | "dataset": dataset, 158 | "sampler": sampler, 159 | "mapper": mapper, 160 | "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, 161 | "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, 162 | "num_workers": cfg.DATALOADER.NUM_WORKERS, 163 | } 164 | 165 | 166 | def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh): 167 | """ 168 | Compute (fractional) per-image repeat factors based on category frequency. 169 | The repeat factor for an image is a function of the frequency of the rarest 170 | category labeled in that image. 
The "frequency of category c" in [0, 1] is defined 171 | as the fraction of images in the training set (without repeats) in which category c 172 | appears. 173 | See :paper:`lvis` (>= v2) Appendix B.2. 174 | 175 | Args: 176 | dataset_dicts (list[dict]): annotations in Detectron2 dataset format. 177 | repeat_thresh (float): frequency threshold below which data is repeated. 178 | If the frequency is half of `repeat_thresh`, the image will be 179 | repeated twice. 180 | 181 | Returns: 182 | torch.Tensor: 183 | the i-th element is the repeat factor for the dataset image at index i. 184 | """ 185 | # 1. For each category c, compute the fraction of images that contain it: f(c) 186 | category_freq = defaultdict(int) 187 | for dataset_dict in dataset_dicts: # For each image (without repeats) 188 | cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} 189 | for cat_id in cat_ids: 190 | if cat_id < 0: continue 191 | category_freq[cat_id] += 1 192 | num_images = len(dataset_dicts) 193 | for k, v in category_freq.items(): 194 | category_freq[k] = v / num_images 195 | 196 | # 2. For each category c, compute the category-level repeat factor: 197 | # r(c) = max(1, sqrt(t / f(c))) 198 | category_rep = { 199 | cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) 200 | for cat_id, cat_freq in category_freq.items() 201 | } 202 | 203 | # 3. For each image I, compute the image-level repeat factor: 204 | # r(I) = max_{c in I} r(c) 205 | rep_factors = [] 206 | for dataset_dict in dataset_dicts: 207 | cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} 208 | rep_factor = max({category_rep[cat_id] for cat_id in cat_ids if cat_id >= 0}, default=1.0) 209 | rep_factors.append(rep_factor) 210 | 211 | return torch.tensor(rep_factors, dtype=torch.float32) 212 | 213 | @configurable(from_config=_train_loader_from_config) 214 | def build_detection_train_loader(dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0): 215 | if isinstance(dataset, list): 216 | dataset = DatasetFromList(dataset, copy=False) 217 | if mapper is not None: 218 | dataset = MapDataset(dataset, mapper) 219 | if sampler is None: 220 | sampler = TrainingSampler(len(dataset)) 221 | assert isinstance(sampler, torch.utils.data.sampler.Sampler) 222 | return build_batch_data_loader( 223 | dataset, 224 | sampler, 225 | total_batch_size, 226 | aspect_ratio_grouping=aspect_ratio_grouping, 227 | num_workers=num_workers 228 | ) 229 | 230 | def _test_loader_from_config(cfg, dataset_name, mode, mapper=None): 231 | if isinstance(dataset_name, str): 232 | dataset_name = [dataset_name] 233 | dataset = get_detection_dataset_dicts( 234 | dataset_name, 235 | filter_empty=False, 236 | oracle2d=[ 237 | getattr(getattr(cfg.DATASETS.ORACLE2D_FILES[cfg.DATASETS.ORACLE2D_FILES.EVAL_MODE], mode), x) for x in dataset_name 238 | ] 239 | if cfg.TEST.ORACLE2D 240 | else None, 241 | proposal_files=[ 242 | cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name 243 | ] 244 | if cfg.MODEL.LOAD_PROPOSALS 245 | else None, 246 | ) 247 | if mapper is None: 248 | mapper = DatasetMapper(cfg, False) 249 | 250 | return {"dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS} 251 | 252 | @configurable(from_config=_test_loader_from_config) 253 | def build_detection_test_loader(dataset, *, mapper, sampler=None, num_workers=0): 254 | 255 | if isinstance(dataset, list): 256 | dataset = DatasetFromList(dataset, copy=False) 257 | if mapper is not None: 258 | dataset = 
MapDataset(dataset, mapper) 259 | if sampler is None: 260 | sampler = InferenceSampler(len(dataset)) 261 | 262 | # Always use 1 image per worker during inference since this is the 263 | # standard when reporting inference time in papers. 264 | batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False) 265 | data_loader = torch.utils.data.DataLoader( 266 | dataset, 267 | num_workers=num_workers, 268 | batch_sampler=batch_sampler, 269 | collate_fn=trivial_batch_collator, 270 | ) 271 | return data_loader 272 | 273 | -------------------------------------------------------------------------------- /cubercnn/data/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | 3 | def get_omni3d_categories(dataset="omni3d"): 4 | """ 5 | Returns the Omni3D categories for dataset 6 | Args: 7 | dataset: str 8 | Returns: 9 | cats: set of strings with category names 10 | """ 11 | 12 | if dataset == "omni3d": 13 | cats = set({'chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin'}) 14 | assert len(cats) == 50 15 | elif dataset == "omni3d_in": 16 | cats = set({'stationery', 'sink', 'table', 'floor mat', 'bottle', 'bookcase', 'bin', 'blinds', 'pillow', 'bicycle', 'refrigerator', 'night stand', 'chair', 'sofa', 'books', 'oven', 'towel', 'cabinet', 'window', 'curtain', 'bathtub', 'laptop', 'desk', 'television', 'clothes', 'stove', 'cup', 'shelves', 'box', 'shoes', 'mirror', 'door', 'picture', 'lamp', 'machine', 'counter', 'bed', 'toilet'}) 17 | assert len(cats) == 38 18 | elif dataset == "omni3d_out": 19 | cats = set({'cyclist', 'pedestrian', 'trailer', 'bus', 'motorcycle', 'car', 'barrier', 'truck', 'van', 'traffic cone', 'bicycle'}) 20 | assert len(cats) == 11 21 | elif dataset in ["SUNRGBD_train", "SUNRGBD_val", "SUNRGBD_test"]: 22 | cats = set({'bicycle', 'books', 'bottle', 'chair', 'cup', 'laptop', 'shoes', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator', 'bin', 'stove', 'oven', 'machine'}) 23 | assert len(cats) == 38 24 | elif dataset in ["Hypersim_train", "Hypersim_val"]: 25 | cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'}) 26 | assert len(cats) == 29 27 | elif dataset == "Hypersim_test": 28 | # Hypersim test annotation does not contain toilet 29 | cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'}) 30 
| assert len(cats) == 28 31 | elif dataset in ["ARKitScenes_train", "ARKitScenes_val", "ARKitScenes_test"]: 32 | cats = set({'table', 'bed', 'sofa', 'television', 'refrigerator', 'chair', 'oven', 'machine', 'stove', 'shelves', 'sink', 'cabinet', 'bathtub', 'toilet'}) 33 | assert len(cats) == 14 34 | elif dataset in ["Objectron_train", "Objectron_val", "Objectron_test"]: 35 | cats = set({'bicycle', 'books', 'bottle', 'camera', 'cereal box', 'chair', 'cup', 'laptop', 'shoes'}) 36 | assert len(cats) == 9 37 | elif dataset in ["KITTI_train", "KITTI_val", "KITTI_test"]: 38 | cats = set({'pedestrian', 'car', 'cyclist', 'van', 'truck'}) 39 | assert len(cats) == 5 40 | elif dataset in ["nuScenes_train", "nuScenes_val", "nuScenes_test"]: 41 | cats = set({'pedestrian', 'car', 'truck', 'traffic cone', 'barrier', 'motorcycle', 'bicycle', 'bus', 'trailer'}) 42 | assert len(cats) == 9 43 | elif dataset in [ "SUNRGBD_test_novel"]: 44 | cats = set({'monitor', 'bag', 'dresser', 'board', 'printer', 'keyboard', 'painting', 'drawers', 'microwave', 'computer', 'kitchen pan', 'potted plant', 'tissues', 'rack', 'tray', 'toys', 'phone', 'podium', 'cart', 'soundsystem'}) 45 | assert len(cats) == 20 46 | elif dataset in [ "ARKitScenes_test_novel"]: 47 | cats = set({'fireplace'}) 48 | assert len(cats) == 1 49 | elif dataset in [ "KITTI_test_novel"]: 50 | cats = set({'tram'}) 51 | assert len(cats) == 1 52 | else: 53 | raise ValueError("%s dataset is not registered." % (dataset)) 54 | 55 | return cats -------------------------------------------------------------------------------- /cubercnn/data/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | import copy 3 | import torch 4 | import numpy as np 5 | from detectron2.structures import BoxMode, Keypoints 6 | from detectron2.data import detection_utils 7 | from detectron2.data import transforms as T 8 | from detectron2.data import ( 9 | DatasetMapper 10 | ) 11 | from detectron2.structures import ( 12 | Boxes, 13 | BoxMode, 14 | Instances, 15 | ) 16 | 17 | class DatasetMapper3D(DatasetMapper): 18 | 19 | def __call__(self, dataset_dict): 20 | 21 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 22 | 23 | image = detection_utils.read_image(dataset_dict["file_name"], format=self.image_format) 24 | detection_utils.check_image_size(dataset_dict, image) 25 | 26 | aug_input = T.AugInput(image) 27 | transforms = self.augmentations(aug_input) 28 | image = aug_input.image 29 | 30 | image_shape = image.shape[:2] # h, w 31 | 32 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 33 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 34 | # Therefore it's important to use torch.Tensor. 
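        # read_image returns an HWC (height, width, channels) array; transpose(2, 0, 1)
        # reorders it to the CHW layout expected by the model, and np.ascontiguousarray
        # guarantees a contiguous buffer before it is wrapped as a torch tensor.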
35 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 36 | 37 | # no need for additional processing at inference 38 | if not self.is_train: 39 | return dataset_dict 40 | 41 | if "annotations" in dataset_dict: 42 | 43 | dataset_id = dataset_dict['dataset_id'] 44 | K = np.array(dataset_dict['K']) 45 | 46 | unknown_categories = self.dataset_id_to_unknown_cats[dataset_id] 47 | 48 | # transform and pop off annotations 49 | annos = [ 50 | transform_instance_annotations(obj, transforms, K=K) 51 | for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 52 | ] 53 | 54 | # convert to instance format 55 | instances = annotations_to_instances(annos, image_shape, unknown_categories) 56 | dataset_dict["instances"] = detection_utils.filter_empty_instances(instances) 57 | 58 | return dataset_dict 59 | 60 | ''' 61 | Cached for mirroring annotations 62 | ''' 63 | _M1 = np.array([ 64 | [1, 0, 0], 65 | [0, -1, 0], 66 | [0, 0, -1] 67 | ]) 68 | _M2 = np.array([ 69 | [-1., 0., 0.], 70 | [ 0., -1., 0.], 71 | [ 0., 0., 1.] 72 | ]) 73 | 74 | 75 | def transform_instance_annotations(annotation, transforms, *, K): 76 | 77 | if isinstance(transforms, (tuple, list)): 78 | transforms = T.TransformList(transforms) 79 | 80 | # bbox is 1d (per-instance bounding box) 81 | bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) 82 | bbox = transforms.apply_box(np.array([bbox]))[0] 83 | 84 | annotation["bbox"] = bbox 85 | annotation["bbox_mode"] = BoxMode.XYXY_ABS 86 | 87 | if annotation['center_cam'][2] != 0: 88 | 89 | # project the 3D box annotation XYZ_3D to screen 90 | point3D = annotation['center_cam'] 91 | point2D = K @ np.array(point3D) 92 | point2D[:2] = point2D[:2] / point2D[-1] 93 | annotation["center_cam_proj"] = point2D.tolist() 94 | 95 | # apply coords transforms to 2D box 96 | annotation["center_cam_proj"][0:2] = transforms.apply_coords( 97 | point2D[np.newaxis][:, :2] 98 | )[0].tolist() 99 | 100 | keypoints = (K @ np.array(annotation["bbox3D_cam"]).T).T 101 | keypoints[:, 0] /= keypoints[:, -1] 102 | keypoints[:, 1] /= keypoints[:, -1] 103 | 104 | if annotation['ignore']: 105 | # all keypoints marked as not visible 106 | # 0 - unknown, 1 - not visible, 2 visible 107 | keypoints[:, 2] = 1 108 | else: 109 | 110 | valid_keypoints = keypoints[:, 2] > 0 111 | 112 | # 0 - unknown, 1 - not visible, 2 visible 113 | keypoints[:, 2] = 2 114 | keypoints[valid_keypoints, 2] = 2 115 | 116 | # in place 117 | transforms.apply_coords(keypoints[:, :2]) 118 | annotation["keypoints"] = keypoints.tolist() 119 | 120 | # manually apply mirror for pose 121 | for transform in transforms: 122 | 123 | # horizontal flip?
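        # If the augmentations included a horizontal flip, mirror the annotated rotation with
        # the cached reflection matrices _M1 and _M2 so the pose stays consistent with the
        # flipped image.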
124 | if isinstance(transform, T.HFlipTransform): 125 | 126 | pose = _M1 @ np.array(annotation["pose"]) @ _M2 127 | annotation["pose"] = pose.tolist() 128 | annotation["R_cam"] = pose.tolist() 129 | 130 | return annotation 131 | 132 | 133 | def annotations_to_instances(annos, image_size, unknown_categories): 134 | 135 | # init 136 | target = Instances(image_size) 137 | 138 | # add classes, 2D boxes, 3D boxes and poses 139 | target.gt_classes = torch.tensor([int(obj["category_id"]) for obj in annos], dtype=torch.int64) 140 | target.gt_boxes = Boxes([BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]) 141 | target.gt_boxes3D = torch.FloatTensor([anno['center_cam_proj'] + anno['dimensions'] + anno['center_cam'] for anno in annos]) 142 | target.gt_poses = torch.FloatTensor([anno['pose'] for anno in annos]) 143 | 144 | n = len(target.gt_classes) 145 | 146 | # do keypoints? 147 | target.gt_keypoints = Keypoints(torch.FloatTensor([anno['keypoints'] for anno in annos])) 148 | 149 | gt_unknown_category_mask = torch.zeros(max(unknown_categories)+1, dtype=bool) 150 | gt_unknown_category_mask[torch.tensor(list(unknown_categories))] = True 151 | 152 | # include available category indices as tensor with GTs 153 | target.gt_unknown_category_mask = gt_unknown_category_mask.unsqueeze(0).repeat([n, 1]) 154 | 155 | return target 156 | -------------------------------------------------------------------------------- /cubercnn/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .omni3d_evaluation import * -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .densenet import * 2 | from .mnasnet import * 3 | from .resnet import * 4 | from .shufflenet import * 5 | from .dla import * 6 | from .dino import * 7 | from .mae import * 8 | from .clip import * 9 | from .midas_final import * 10 | from .sam import * -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/clip.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import ShapeSpec 2 | from detectron2.modeling.backbone import Backbone 3 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 4 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool, FPN 5 | from detectron2.modeling.backbone.vit import SimpleFeaturePyramid 6 | import torch 7 | from torch import nn 8 | from torchvision import models 9 | import torch.nn.functional as F 10 | import einops as E 11 | import unittest 12 | import open_clip 13 | from cubercnn.modeling.backbone.dino import tokens_to_output 14 | from typing import Tuple 15 | 16 | # reference: https://github.com/mbanani/probe3d/blob/c52d00b069d949b2f00c544d4991716df68d5233/evals/models/clip.py 17 | class CLIPBackbone(Backbone): 18 | def __init__(self, cfg, input_shape, arch="ViT-B-16", checkpoint="openai", output="dense", layer=-1, return_multilayer=False, out_feature="last_feat",): 19 | super().__init__() 20 | assert output in ["dense-cls", "cls", "gap", "dense"] 21 | self.output = output 22 | # Initialize a pre-trained CLIP image encoder and freeze it. 
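        # Note: open_clip.create_model_and_transforms returns the full CLIP model; only the
        # image tower (.visual) is kept below and the remaining modules are discarded.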
23 | _clip_model, _, _ = open_clip.create_model_and_transforms( 24 | arch, pretrained=checkpoint 25 | ) 26 | _clip_model = _clip_model.to(torch.float32) 27 | self.visual = _clip_model.visual 28 | del _clip_model 29 | 30 | # Extract some attributes from CLIP module for easy access. 31 | self.patch_size = self.visual.conv1.stride[0] 32 | 33 | # get feature dimension 34 | feat_dim = self.visual.transformer.width 35 | feat_dim = feat_dim * 2 if output == "dense-cls" else feat_dim 36 | feat_dims = [feat_dim, feat_dim, feat_dim, feat_dim] 37 | 38 | # get extraction targets 39 | n_layers = len(self.visual.transformer.resblocks) 40 | multilayers = [ 41 | n_layers // 4 - 1, 42 | n_layers // 2 - 1, 43 | n_layers // 4 * 3 - 1, 44 | n_layers - 1, 45 | ] 46 | 47 | if return_multilayer: 48 | self.feat_dim = feat_dims 49 | self.multilayers = multilayers 50 | else: 51 | self.feat_dim = feat_dims 52 | layer = multilayers[-1] if layer == -1 else layer 53 | self.multilayers = [layer] 54 | 55 | # define layer name (for logging) 56 | self.layer = "-".join(str(_x) for _x in self.multilayers) 57 | 58 | self._out_feature_channels = {out_feature: feat_dim} 59 | self._out_feature_strides = {out_feature: self.patch_size} 60 | self._out_features = [out_feature] 61 | 62 | def forward(self, images): 63 | img_h, img_w = images.shape[-2:] 64 | out_hw = (img_h // self.patch_size, img_w // self.patch_size) 65 | 66 | # clip stuff 67 | x = self.visual.conv1(images) 68 | x_hw = x.shape[-2:] 69 | x = E.rearrange(x, "b c h w -> b (h w) c") 70 | 71 | # concat cls token 72 | _cls_embed = E.repeat(self.visual.class_embedding, "c -> b 1 c", b=x.shape[0]) 73 | x = torch.cat([_cls_embed.to(x.dtype), x], dim=1) 74 | 75 | # add pos embed 76 | pos_embed = resize_pos_embed(self.visual.positional_embedding, x_hw) 77 | x = self.visual.ln_pre(x + pos_embed.to(x.dtype)) 78 | 79 | embeds = [] 80 | for i, blk in enumerate(self.visual.transformer.resblocks): 81 | x = blk(x) 82 | if i in self.multilayers: 83 | embeds.append(x) 84 | if len(embeds) == len(self.multilayers): 85 | break 86 | 87 | outputs = {} 88 | for i, _x in enumerate(embeds): 89 | _x = tokens_to_output(self.output, _x[:, 1:], _x[:, 0], out_hw) 90 | outputs[self._out_features[i]] = _x 91 | return outputs 92 | 93 | def resize_pos_embed( 94 | pos_embed: torch.Tensor, hw: Tuple[int, int], has_cls_token: bool = True 95 | ): 96 | """ 97 | Resize positional embedding for arbitrary image resolution. Resizing is done 98 | via bicubic interpolation. 99 | 100 | Args: 101 | pos_embed: Positional embedding tensor of shape ``(n_patches, embed_dim)``. 102 | hw: Target height and width of the tensor after interpolation. 103 | has_cls_token: Whether ``pos_embed[0]`` is for the ``[cls]`` token. 104 | 105 | Returns: 106 | Tensor of shape ``(new_n_patches, embed_dim)`` of resized embedding. 107 | ``new_n_patches`` is ``new_height * new_width`` if ``has_cls`` is False, 108 | else ``1 + new_height * new_width``. 109 | """ 110 | 111 | n_grid = pos_embed.shape[0] - 1 if has_cls_token else pos_embed.shape[0] 112 | 113 | # Do not resize if already in same shape. 114 | if n_grid == hw[0] * hw[1]: 115 | return pos_embed 116 | 117 | # Get original position embedding and extract ``[cls]`` token. 
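    # Example: a ViT-B/16 pretrained at 224x224 stores a 14*14 = 196-token position grid
    # (plus [cls]); for a 512x512 input the grid is interpolated to 32*32 = 1024 positions.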
118 | if has_cls_token: 119 | cls_embed, pos_embed = pos_embed[[0]], pos_embed[1:] 120 | 121 | orig_dim = int(pos_embed.shape[0] ** 0.5) 122 | 123 | pos_embed = E.rearrange(pos_embed, "(h w) c -> 1 c h w", h=orig_dim) 124 | pos_embed = F.interpolate( 125 | pos_embed, hw, mode="bicubic", align_corners=False, antialias=True 126 | ) 127 | pos_embed = E.rearrange(pos_embed, "1 c h w -> (h w) c") 128 | 129 | # Add embedding of ``[cls]`` token back after resizing. 130 | if has_cls_token: 131 | pos_embed = torch.cat([cls_embed, pos_embed], dim=0) 132 | 133 | return pos_embed 134 | 135 | @BACKBONE_REGISTRY.register() 136 | def build_clip_backbone(cfg, input_shape: ShapeSpec, priors=None): 137 | arch = cfg.MODEL.CLIP.ARCH 138 | checkpoint = cfg.MODEL.CLIP.CHECKPOINT 139 | output = cfg.MODEL.CLIP.OUTPUT 140 | layer = cfg.MODEL.CLIP.LAYER 141 | return_multilayer = cfg.MODEL.CLIP.RETURN_MULTILAYER 142 | 143 | bottom_up = CLIPBackbone( 144 | cfg, 145 | input_shape, 146 | arch=arch, 147 | checkpoint=checkpoint, 148 | output=output, 149 | layer=layer, 150 | return_multilayer=return_multilayer, 151 | ) 152 | 153 | in_feature = cfg.MODEL.FPN.IN_FEATURE 154 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 155 | scale_factors = (4.0, 2.0, 1.0, 0.5) 156 | backbone = SimpleFeaturePyramid( 157 | net=bottom_up, 158 | in_feature=in_feature, 159 | out_channels=out_channels, 160 | scale_factors=scale_factors, 161 | norm=cfg.MODEL.FPN.NORM, 162 | top_block=None, 163 | square_pad=cfg.MODEL.FPN.SQUARE_PAD 164 | ) 165 | return backbone 166 | 167 | class TestCLIPBackbone(unittest.TestCase): 168 | def setUp(self): 169 | # Mock configuration 170 | self.cfg = type('', (), {})() 171 | self.cfg.MODEL = type('', (), {})() 172 | self.cfg.MODEL.CLIP = type('', (), {})() 173 | self.cfg.MODEL.CLIP.ARCH = "ViT-B-16" 174 | self.cfg.MODEL.CLIP.CHECKPOINT = "openai" 175 | self.cfg.MODEL.CLIP.OUTPUT = "dense" 176 | self.cfg.MODEL.CLIP.LAYER = -1 177 | self.cfg.MODEL.CLIP.RETURN_MULTILAYER = False 178 | self.cfg.MODEL.FPN = type('', (), {})() 179 | self.cfg.MODEL.FPN.IN_FEATURE = 'last_feat' 180 | self.cfg.MODEL.FPN.OUT_CHANNELS = 256 181 | self.cfg.MODEL.FPN.NORM = "LN" 182 | self.cfg.MODEL.FPN.FUSE_TYPE = "sum" 183 | self.cfg.MODEL.FPN.SQUARE_PAD = 512 184 | self.input_shape = ShapeSpec(channels=3, height=512, width=512) 185 | 186 | def test_clip_backbone_forward(self): 187 | # Create the backbone 188 | backbone = build_clip_backbone(self.cfg, self.input_shape) 189 | # Generate a random input tensor 190 | x = torch.randn(1, 3, 512, 512) 191 | # Run forward pass 192 | outputs = backbone(x) 193 | print(backbone.net.output_shape()) 194 | for key, output in outputs.items(): 195 | print(key, output.shape) 196 | 197 | # print(backbone.net.vit) 198 | 199 | 200 | if __name__ == "__main__": 201 | unittest.main() -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/densenet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class DenseNetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.densenet121(pretrained) 15 | base = base.features 16 | 17 | self.base = base 18 | 19 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 1024, 'p6': 1024} 20 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 21 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 22 | 23 | def forward(self, x): 24 | 25 | outputs = {} 26 | 27 | db1 = self.base[0:5](x) 28 | db2 = self.base[5:7](db1) 29 | db3 = self.base[7:9](db2) 30 | p5 = self.base[9:](db3) 31 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 32 | outputs['p2'] = db1 33 | outputs['p3'] = db2 34 | outputs['p4'] = db3 35 | outputs['p5'] = p5 36 | outputs['p6'] = p6 37 | 38 | return outputs 39 | 40 | 41 | @BACKBONE_REGISTRY.register() 42 | def build_densenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 43 | """ 44 | Args: 45 | cfg: a detectron2 CfgNode 46 | 47 | Returns: 48 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 49 | """ 50 | 51 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 52 | 53 | bottom_up = DenseNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 54 | in_features = cfg.MODEL.FPN.IN_FEATURES 55 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 56 | 57 | backbone = FPN( 58 | bottom_up=bottom_up, 59 | in_features=in_features, 60 | out_channels=out_channels, 61 | norm=cfg.MODEL.FPN.NORM, 62 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE 63 | ) 64 | return backbone -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/dino.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import ShapeSpec 2 | from detectron2.modeling.backbone import Backbone 3 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 4 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool, FPN 5 | from detectron2.modeling.backbone.vit import SimpleFeaturePyramid 6 | import torch 7 | from torch import nn 8 | from torchvision import models 9 | import torch.nn.functional as F 10 | import einops as E 11 | import unittest 12 | 13 | # reference: https://github.com/mbanani/probe3d/blob/c52d00b069d949b2f00c544d4991716df68d5233/evals/models/dino.py 14 | class DINOBackbone(Backbone): 15 | def __init__(self, cfg, input_shape, dino_name="dino", model_name="vitb16", output="dense", layer=-1, return_multilayer=False, out_feature="last_feat",): 16 | super().__init__() 17 | feat_dims = { 18 | "vitb8": 768, 19 | "vitb16": 768, 20 | "vitb14": 768, 21 | "vitb14_reg": 768, 22 | "vitl14": 1024, 23 | "vitg14": 1536, 24 | } 25 | 26 | # get model 27 | self.model_name = dino_name 28 | self.checkpoint_name = f"{dino_name}_{model_name}" 29 | dino_vit = torch.hub.load(f"facebookresearch/{dino_name}", self.checkpoint_name) 30 | self.vit = dino_vit 31 | self.has_registers = "_reg" in model_name 32 | 33 | assert output in ["cls", "gap", "dense", "dense-cls"] 34 | self.output = output 35 | self.patch_size = self.vit.patch_embed.proj.kernel_size[0] 36 | 37 | feat_dim = feat_dims[model_name] 38 
| feat_dim = feat_dim * 2 if output == "dense-cls" else feat_dim 39 | 40 | num_layers = len(self.vit.blocks) 41 | multilayers = [ 42 | num_layers // 4 - 1, 43 | num_layers // 2 - 1, 44 | num_layers // 4 * 3 - 1, 45 | num_layers - 1, 46 | ] 47 | 48 | if return_multilayer: 49 | self.feat_dim = [feat_dim, feat_dim, feat_dim, feat_dim] 50 | self.multilayers = multilayers 51 | else: 52 | self.feat_dim = feat_dim 53 | layer = multilayers[-1] if layer == -1 else layer 54 | self.multilayers = [layer] 55 | 56 | # define layer name (for logging) 57 | self.layer = "-".join(str(_x) for _x in self.multilayers) 58 | 59 | self._out_feature_channels = {out_feature: feat_dim} 60 | self._out_feature_strides = {out_feature: self.patch_size} 61 | self._out_features = [out_feature] 62 | 63 | def forward(self, images): 64 | h, w = images.shape[-2:] 65 | h, w = h // self.patch_size, w // self.patch_size 66 | 67 | if self.model_name == "dinov2": 68 | x = self.vit.prepare_tokens_with_masks(images, None) 69 | else: 70 | x = self.vit.prepare_tokens(images) 71 | 72 | embeds = [] 73 | for i, blk in enumerate(self.vit.blocks): 74 | x = blk(x) 75 | if i in self.multilayers: 76 | embeds.append(x) 77 | if len(embeds) == len(self.multilayers): 78 | break 79 | 80 | num_spatial = h * w 81 | outputs = {} 82 | for idx, x_i in enumerate(embeds): 83 | cls_tok = x_i[:, 0] 84 | spatial = x_i[:, -1 * num_spatial:] 85 | x_i = tokens_to_output(self.output, spatial, cls_tok, (h, w)) 86 | outputs[self._out_features[idx]] = x_i 87 | 88 | return outputs 89 | 90 | 91 | @BACKBONE_REGISTRY.register() 92 | def build_dino_backbone(cfg, input_shape: ShapeSpec, priors=None): 93 | dino_name = cfg.MODEL.DINO.NAME 94 | model_name = cfg.MODEL.DINO.MODEL_NAME 95 | output = cfg.MODEL.DINO.OUTPUT 96 | layer = cfg.MODEL.DINO.LAYER 97 | return_multilayer = cfg.MODEL.DINO.RETURN_MULTILAYER 98 | 99 | bottom_up = DINOBackbone( 100 | cfg, 101 | input_shape, 102 | dino_name=dino_name, 103 | model_name=model_name, 104 | output=output, 105 | layer=layer, 106 | return_multilayer=return_multilayer, 107 | ) 108 | 109 | in_feature = cfg.MODEL.FPN.IN_FEATURE 110 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 111 | scale_factors = (2.0, 1.0, 0.5) 112 | backbone = SimpleFeaturePyramid( 113 | net=bottom_up, 114 | in_feature=in_feature, 115 | out_channels=out_channels, 116 | scale_factors=scale_factors, 117 | norm=cfg.MODEL.FPN.NORM, 118 | top_block=None, 119 | square_pad=cfg.MODEL.FPN.SQUARE_PAD 120 | ) 121 | return backbone 122 | 123 | def tokens_to_output(output_type, dense_tokens, cls_token, feat_hw): 124 | if output_type == "cls": 125 | assert cls_token is not None 126 | output = cls_token 127 | elif output_type == "gap": 128 | output = dense_tokens.mean(dim=1) 129 | elif output_type == "dense": 130 | h, w = feat_hw 131 | dense_tokens = E.rearrange(dense_tokens, "b (h w) c -> b c h w", h=h, w=w) 132 | output = dense_tokens.contiguous() 133 | elif output_type == "dense-cls": 134 | assert cls_token is not None 135 | h, w = feat_hw 136 | dense_tokens = E.rearrange(dense_tokens, "b (h w) c -> b c h w", h=h, w=w) 137 | cls_token = cls_token[:, :, None, None].repeat(1, 1, h, w) 138 | output = torch.cat((dense_tokens, cls_token), dim=1).contiguous() 139 | else: 140 | raise ValueError() 141 | 142 | return output 143 | 144 | class TestDINOBackbone(unittest.TestCase): 145 | def setUp(self): 146 | # Mock configuration 147 | self.cfg = type('', (), {})() 148 | self.cfg.MODEL = type('', (), {})() 149 | self.cfg.MODEL.DINO = type('', (), {})() 150 | self.cfg.MODEL.DINO.NAME = 
"dino" 151 | self.cfg.MODEL.DINO.MODEL_NAME = "vitb16" 152 | self.cfg.MODEL.DINO.OUTPUT = "dense" 153 | self.cfg.MODEL.DINO.LAYER = -1 154 | self.cfg.MODEL.DINO.RETURN_MULTILAYER = False 155 | self.cfg.MODEL.FPN = type('', (), {})() 156 | self.cfg.MODEL.FPN.IN_FEATURE = 'last_feat' 157 | self.cfg.MODEL.FPN.OUT_CHANNELS = 256 158 | self.cfg.MODEL.FPN.NORM = "LN" 159 | self.cfg.MODEL.FPN.FUSE_TYPE = "sum" 160 | self.input_shape = ShapeSpec(channels=3, height=512, width=512) 161 | 162 | def test_dino_backbone_forward(self): 163 | # Create the backbone 164 | backbone = build_dino_backbone(self.cfg, self.input_shape) 165 | # Generate a random input tensor 166 | x = torch.randn(1, 3, 512, 512) 167 | # Run forward pass 168 | outputs = backbone(x) 169 | print(backbone.net.output_shape()) 170 | for key, output in outputs.items(): 171 | print(key, output.shape) 172 | 173 | # print(backbone.net.vit) 174 | 175 | 176 | if __name__ == "__main__": 177 | unittest.main() -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/mae.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import ShapeSpec 2 | from detectron2.modeling.backbone import Backbone 3 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 4 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool, FPN 5 | from detectron2.modeling.backbone.vit import SimpleFeaturePyramid 6 | import torch 7 | from torch import nn 8 | from torchvision import models 9 | import torch.nn.functional as F 10 | from transformers import ViTMAEForPreTraining 11 | from transformers.models.vit_mae.modeling_vit_mae import ( 12 | get_2d_sincos_pos_embed_from_grid, 13 | ) 14 | import numpy as np 15 | import einops as E 16 | import unittest 17 | from cubercnn.modeling.backbone.dino import tokens_to_output 18 | 19 | # reference: https://github.com/mbanani/probe3d/blob/c52d00b069d949b2f00c544d4991716df68d5233/evals/models/mae.py 20 | class MAEBackbone(Backbone): 21 | def __init__(self, cfg, input_shape, checkpoint="facebook/vit-mae-base", output="dense", layer=-1, return_multilayer=False, out_feature="last_feat",): 22 | super().__init__() 23 | 24 | # get model 25 | self.checkpoint_name = checkpoint.split("/")[1] 26 | self.vit = ViTMAEForPreTraining.from_pretrained(checkpoint).vit 27 | 28 | assert output in ["cls", "gap", "dense", "dense-cls"] 29 | self.output = output 30 | self.patch_size = self.vit.config.patch_size 31 | 32 | self.image_size = self.vit.embeddings.patch_embeddings.image_size 33 | self.feat_h = self.image_size[0] // self.patch_size 34 | self.feat_w = self.image_size[1] // self.patch_size 35 | 36 | feat_dim = self.vit.config.hidden_size 37 | 38 | num_layers = len(self.vit.encoder.layer) 39 | multilayers = [ 40 | num_layers // 4 - 1, 41 | num_layers // 2 - 1, 42 | num_layers // 4 * 3 - 1, 43 | num_layers - 1, 44 | ] 45 | 46 | if return_multilayer: 47 | self.feat_dim = [feat_dim, feat_dim, feat_dim, feat_dim] 48 | self.multilayers = multilayers 49 | else: 50 | self.feat_dim = feat_dim 51 | layer = multilayers[-1] if layer == -1 else layer 52 | self.multilayers = [layer] 53 | 54 | # define layer name (for logging) 55 | self.layer = "-".join(str(_x) for _x in self.multilayers) 56 | 57 | self._out_feature_channels = {out_feature: feat_dim} 58 | self._out_feature_strides = {out_feature: self.patch_size} 59 | self._out_features = [out_feature] 60 | 61 | def resize_pos_embed(self, image_size): 62 | assert image_size[0] % self.patch_size 
== 0 63 | assert image_size[1] % self.patch_size == 0 64 | self.feat_h = image_size[0] // self.patch_size 65 | self.feat_w = image_size[1] // self.patch_size 66 | embed_dim = self.vit.config.hidden_size 67 | self.vit.embeddings.patch_embeddings.image_size = image_size 68 | pos_embed = get_2d_sincos_pos_embed( 69 | embed_dim, (self.feat_h, self.feat_w), add_cls_token=True 70 | ) 71 | # there should be an easier way ... TODO 72 | device = self.vit.embeddings.patch_embeddings.projection.weight.device 73 | self.vit.embeddings.position_embeddings = nn.Parameter( 74 | torch.from_numpy(pos_embed).float().unsqueeze(0).to(device=device), 75 | requires_grad=False, 76 | ) 77 | 78 | def embed_forward(self, embedder, pixel_values): 79 | # No masking here ... 80 | batch_size, num_channels, height, width = pixel_values.shape 81 | embeddings = embedder.patch_embeddings(pixel_values) 82 | 83 | # add position embeddings w/o cls token 84 | embeddings = embeddings + embedder.position_embeddings[:, 1:, :] 85 | 86 | # append cls token 87 | cls_token = embedder.cls_token + embedder.position_embeddings[:, :1, :] 88 | cls_tokens = cls_token.expand(embeddings.shape[0], -1, -1) 89 | embeddings = torch.cat((cls_tokens, embeddings), dim=1) 90 | 91 | return embeddings 92 | 93 | def forward(self, images): 94 | # check if positional embeddings are correct 95 | if self.image_size != images.shape[-2:]: 96 | self.resize_pos_embed(images.shape[-2:]) 97 | 98 | # from MAE implementation 99 | head_mask = self.vit.get_head_mask(None, self.vit.config.num_hidden_layers) 100 | 101 | # ---- hidden ---- 102 | embedding_output = self.embed_forward(self.vit.embeddings, images) 103 | encoder_outputs = self.vit.encoder( 104 | embedding_output, 105 | head_mask=head_mask, 106 | output_attentions=self.vit.config.output_attentions, 107 | output_hidden_states=True, 108 | return_dict=self.vit.config.return_dict, 109 | ) 110 | 111 | outputs = {} 112 | for idx, layer_i in enumerate(self.multilayers): 113 | x_i = encoder_outputs.hidden_states[layer_i] 114 | x_i = tokens_to_output( 115 | self.output, x_i[:, 1:], x_i[:, 0], (self.feat_h, self.feat_w) 116 | ) 117 | outputs[self._out_features[idx]] = x_i 118 | 119 | return outputs 120 | 121 | 122 | @BACKBONE_REGISTRY.register() 123 | def build_mae_backbone(cfg, input_shape: ShapeSpec, priors=None): 124 | checkpoint = cfg.MODEL.MAE.CHECKPOINT 125 | output = cfg.MODEL.MAE.OUTPUT 126 | layer = cfg.MODEL.MAE.LAYER 127 | return_multilayer = cfg.MODEL.MAE.RETURN_MULTILAYER 128 | 129 | bottom_up = MAEBackbone( 130 | cfg, 131 | input_shape, 132 | checkpoint=checkpoint, 133 | output=output, 134 | layer=layer, 135 | return_multilayer=return_multilayer, 136 | ) 137 | 138 | in_feature = cfg.MODEL.FPN.IN_FEATURE 139 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 140 | scale_factors = (4.0, 2.0, 1.0, 0.5) 141 | backbone = SimpleFeaturePyramid( 142 | net=bottom_up, 143 | in_feature=in_feature, 144 | out_channels=out_channels, 145 | scale_factors=scale_factors, 146 | norm=cfg.MODEL.FPN.NORM, 147 | top_block=None, 148 | square_pad=cfg.MODEL.FPN.SQUARE_PAD 149 | ) 150 | return backbone 151 | 152 | def get_2d_sincos_pos_embed(embed_dim, grid_size, add_cls_token=False): 153 | """ 154 | COPIED FROM TRANSFORMERS PACKAGE AND EDITED TO ALLOW FOR DIFFERENT WIDTH-HEIGHT 155 | Create 2D sin/cos positional embeddings. 156 | 157 | Args: 158 | embed_dim (`int`): 159 | Embedding dimension. 160 | grid_size (`int`): 161 | The grid height and width. 
162 | add_cls_token (`bool`, *optional*, defaults to `False`): 163 | Whether or not to add a classification (CLS) token. 164 | 165 | Returns: 166 | (`torch.FloatTensor` of shape (grid_size*grid_size, embed_dim) or 167 | (1+grid_size*grid_size, embed_dim): the 168 | position embeddings (with or without classification token) 169 | """ 170 | grid_h = np.arange(grid_size[0], dtype=np.float32) 171 | grid_w = np.arange(grid_size[1], dtype=np.float32) 172 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 173 | grid = np.stack(grid, axis=0) 174 | 175 | grid = grid.reshape([2, 1, grid_size[0], grid_size[1]]) 176 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 177 | if add_cls_token: 178 | pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) 179 | return pos_embed 180 | 181 | class TestMAEBackbone(unittest.TestCase): 182 | def setUp(self): 183 | # Mock configuration 184 | self.cfg = type('', (), {})() 185 | self.cfg.MODEL = type('', (), {})() 186 | self.cfg.MODEL.MAE = type('', (), {})() 187 | self.cfg.MODEL.MAE.CHECKPOINT = "facebook/vit-mae-base" 188 | self.cfg.MODEL.MAE.OUTPUT = "dense" 189 | self.cfg.MODEL.MAE.LAYER = -1 190 | self.cfg.MODEL.MAE.RETURN_MULTILAYER = False 191 | self.cfg.MODEL.FPN = type('', (), {})() 192 | self.cfg.MODEL.FPN.IN_FEATURE = 'last_feat' 193 | self.cfg.MODEL.FPN.OUT_CHANNELS = 256 194 | self.cfg.MODEL.FPN.NORM = "LN" 195 | self.cfg.MODEL.FPN.FUSE_TYPE = "sum" 196 | self.cfg.MODEL.FPN.SQUARE_PAD = 1024 197 | self.input_shape = ShapeSpec(channels=3, height=1024, width=1024) 198 | 199 | def test_mae_backbone_forward(self): 200 | # Create the backbone 201 | backbone = build_mae_backbone(self.cfg, self.input_shape) 202 | # Generate a random input tensor 203 | x = torch.randn(2, 3, 1024, 1024) 204 | # Run forward pass 205 | outputs = backbone(x) 206 | print(backbone.net.output_shape()) 207 | for key, output in outputs.items(): 208 | print(key, output.shape) 209 | 210 | # print(backbone.net.vit) 211 | 212 | 213 | if __name__ == "__main__": 214 | unittest.main() -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/midas_final.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import ShapeSpec 2 | from detectron2.modeling.backbone import Backbone 3 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 4 | from detectron2.modeling.backbone.vit import SimpleFeaturePyramid 5 | import torch 6 | from torch import nn 7 | from torchvision import models 8 | import torch.nn.functional as F 9 | import numpy as np 10 | import einops as E 11 | import unittest 12 | from cubercnn.modeling.backbone.dino import tokens_to_output 13 | from cubercnn.modeling.backbone.clip import resize_pos_embed 14 | # from dino import tokens_to_output 15 | # from clip import resize_pos_embed 16 | 17 | # reference: https://github.com/mbanani/probe3d/blob/c52d00b069d949b2f00c544d4991716df68d5233/evals/models/midas_final.py 18 | class MIDASBackbone(Backbone): 19 | def __init__(self, cfg, input_shape, output="dense", layer=-1, return_multilayer=False, out_feature="last_feat",): 20 | super().__init__() 21 | 22 | # get model 23 | midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large") 24 | self.vit = midas.pretrained.model 25 | 26 | # set parameters for feature extraction 27 | self.image_size = (384, 384) 28 | self.patch_size = 16 29 | self.output = output 30 | feat_dim = 1024 31 | self.feat_dim = 1024 32 | 33 | num_layers = len(self.vit.blocks) 
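        # As in the other ViT-style backbones in this repo, candidate feature taps sit at the
        # 1/4, 1/2, 3/4 and final blocks (0-indexed), e.g. blocks 5, 11, 17 and 23 for the
        # 24-block ViT-L encoder used by DPT_Large.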
34 | multilayers = [ 35 | num_layers // 4 - 1, 36 | num_layers // 2 - 1, 37 | num_layers // 4 * 3 - 1, 38 | num_layers - 1, 39 | ] 40 | 41 | if return_multilayer: 42 | self.feat_dim = [feat_dim, feat_dim, feat_dim, feat_dim] 43 | self.multilayers = multilayers 44 | else: 45 | self.feat_dim = feat_dim 46 | layer = multilayers[-1] if layer == -1 else layer 47 | self.multilayers = [layer] 48 | 49 | # define layer name (for logging) 50 | self.layer = "-".join(str(_x) for _x in self.multilayers) 51 | 52 | self._out_feature_channels = {out_feature: feat_dim} 53 | self._out_feature_strides = {out_feature: self.patch_size} 54 | self._out_features = [out_feature] 55 | 56 | 57 | def forward(self, x): 58 | # update shapes 59 | h, w = x.shape[2:] 60 | emb_hw = (h // self.patch_size, w // self.patch_size) 61 | # assert h == w, f"BeIT can only handle square images, not ({h}, {w})." 62 | if (h, w) != self.image_size: 63 | self.image_size = (h, w) 64 | self.vit.patch_embed.img_size = (h, w) 65 | # import pdb;pdb.set_trace() 66 | self.vit.pos_embed.data = resize_pos_embed(self.vit.pos_embed[0], emb_hw, True)[None] 67 | 68 | # actual forward from beit 69 | x = self.vit.patch_embed(x) 70 | x = torch.cat((self.vit.cls_token.expand(x.shape[0], -1, -1), x), dim=1) 71 | x = x + self.vit.pos_embed 72 | 73 | x = self.vit.norm_pre(x) 74 | 75 | embeds = [] 76 | for i, blk in enumerate(self.vit.blocks): 77 | x = blk(x) 78 | if i in self.multilayers: 79 | embeds.append(x) 80 | if i == self.layer: 81 | break 82 | 83 | # map tokens to output 84 | outputs = {} 85 | for i, x_i in enumerate(embeds): 86 | x_i = tokens_to_output(self.output, x_i[:, 1:], x_i[:, 0], emb_hw) 87 | outputs[self._out_features[i]] = x_i 88 | 89 | return outputs 90 | 91 | 92 | @BACKBONE_REGISTRY.register() 93 | def build_midas_backbone(cfg, input_shape: ShapeSpec, priors=None): 94 | output = cfg.MODEL.MIDAS.OUTPUT 95 | layer = cfg.MODEL.MIDAS.LAYER 96 | return_multilayer = cfg.MODEL.MIDAS.RETURN_MULTILAYER 97 | 98 | bottom_up = MIDASBackbone( 99 | cfg, 100 | input_shape, 101 | output=output, 102 | layer=layer, 103 | return_multilayer=return_multilayer, 104 | ) 105 | 106 | in_feature = cfg.MODEL.FPN.IN_FEATURE 107 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 108 | scale_factors = (4.0, 2.0, 1.0, 0.5) 109 | backbone = SimpleFeaturePyramid( 110 | net=bottom_up, 111 | in_feature=in_feature, 112 | out_channels=out_channels, 113 | scale_factors=scale_factors, 114 | norm=cfg.MODEL.FPN.NORM, 115 | top_block=None, 116 | square_pad=cfg.MODEL.FPN.SQUARE_PAD 117 | ) 118 | return backbone 119 | 120 | 121 | class TestMIDASBackbone(unittest.TestCase): 122 | def setUp(self): 123 | # Mock configuration 124 | self.cfg = type('', (), {})() 125 | self.cfg.MODEL = type('', (), {})() 126 | self.cfg.MODEL.MIDAS = type('', (), {})() 127 | self.cfg.MODEL.MIDAS.OUTPUT = "dense" 128 | self.cfg.MODEL.MIDAS.LAYER = -1 129 | self.cfg.MODEL.MIDAS.RETURN_MULTILAYER = False 130 | self.cfg.MODEL.FPN = type('', (), {})() 131 | self.cfg.MODEL.FPN.IN_FEATURE = 'last_feat' 132 | self.cfg.MODEL.FPN.OUT_CHANNELS = 256 133 | self.cfg.MODEL.FPN.NORM = "LN" 134 | self.cfg.MODEL.FPN.FUSE_TYPE = "sum" 135 | self.cfg.MODEL.FPN.SQUARE_PAD = 1024 136 | self.input_shape = ShapeSpec(channels=3, height=1024, width=1024) 137 | 138 | def test_midas_backbone_forward(self): 139 | # Create the backbone 140 | backbone = build_midas_backbone(self.cfg, self.input_shape) 141 | # Generate a random input tensor 142 | x = torch.randn(2, 3, 1024, 1024) 143 | # Run forward pass 144 | outputs = backbone(x) 145 | 
print(backbone.net.output_shape()) 146 | for key, output in outputs.items(): 147 | print(key, output.shape) 148 | 149 | # print(backbone.net.vit) 150 | 151 | 152 | if __name__ == "__main__": 153 | unittest.main() -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/mnasnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class MNASNetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.mnasnet1_0(pretrained) 15 | base = base.layers 16 | 17 | self.base = base 18 | 19 | self._out_feature_channels = {'p2': 24, 'p3': 40, 'p4': 96, 'p5': 320, 'p6': 320} 20 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 21 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 22 | 23 | def forward(self, x): 24 | 25 | outputs = {} 26 | 27 | p2 = self.base[0:9](x) 28 | p3 = self.base[9](p2) 29 | p4 = self.base[10:12](p3) 30 | p5 = self.base[12:14](p4) 31 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 32 | outputs['p2'] = p2 33 | outputs['p3'] = p3 34 | outputs['p4'] = p4 35 | outputs['p5'] = p5 36 | outputs['p6'] = p6 37 | 38 | return outputs 39 | 40 | @BACKBONE_REGISTRY.register() 41 | def build_mnasnet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 42 | """ 43 | Args: 44 | cfg: a detectron2 CfgNode 45 | 46 | Returns: 47 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 48 | """ 49 | 50 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 51 | 52 | bottom_up = MNASNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 53 | in_features = cfg.MODEL.FPN.IN_FEATURES 54 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 55 | 56 | backbone = FPN( 57 | bottom_up=bottom_up, 58 | in_features=in_features, 59 | out_channels=out_channels, 60 | norm=cfg.MODEL.FPN.NORM, 61 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 62 | ) 63 | return backbone 64 | -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/resnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 6 | from detectron2.modeling.backbone.resnet import build_resnet_backbone 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | import torch.nn.functional as F 9 | 10 | from detectron2.modeling.backbone.fpn import FPN 11 | 12 | class ResNet(Backbone): 13 | def __init__(self, cfg, input_shape, pretrained=True): 14 | super().__init__() 15 | 16 | if cfg.MODEL.RESNETS.DEPTH == 18: 17 | base = models.resnet18(pretrained) 18 | self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} 19 | elif cfg.MODEL.RESNETS.DEPTH == 34: 20 | base = models.resnet34(pretrained) 21 | self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} 22 | elif cfg.MODEL.RESNETS.DEPTH == 50: 23 | base = models.resnet50(pretrained) 24 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048} 25 | elif cfg.MODEL.RESNETS.DEPTH == 101: 26 | base = models.resnet101(pretrained) 27 | self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048} 28 | else: 29 | raise ValueError('No configuration currently supporting depth of {}'.format(cfg.MODEL.RESNETS.DEPTH)) 30 | 31 | self.conv1 = base.conv1 32 | self.bn1 = base.bn1 33 | self.relu = base.relu 34 | self.maxpool = base.maxpool 35 | self.layer1 = base.layer1 36 | self.layer2 = base.layer2 37 | self.layer3 = base.layer3 38 | self.layer4 = base.layer4 39 | 40 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 41 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 42 | 43 | def forward(self, x): 44 | 45 | outputs = {} 46 | 47 | x = self.conv1(x) 48 | x = self.bn1(x) 49 | x = self.relu(x) 50 | x = self.maxpool(x) 51 | p2 = self.layer1(x) 52 | p3 = self.layer2(p2) 53 | p4 = self.layer3(p3) 54 | p5 = self.layer4(p4) 55 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 56 | 57 | outputs['p2'] = p2 58 | outputs['p3'] = p3 59 | outputs['p4'] = p4 60 | outputs['p5'] = p5 61 | outputs['p6'] = p6 62 | 63 | return outputs 64 | 65 | 66 | @BACKBONE_REGISTRY.register() 67 | def build_resnet_from_vision_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 68 | """ 69 | Args: 70 | cfg: a detectron2 CfgNode 71 | 72 | Returns: 73 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 74 | """ 75 | 76 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 77 | 78 | if cfg.MODEL.RESNETS.TORCHVISION: 79 | bottom_up = ResNet(cfg, input_shape, pretrained=imagenet_pretrain) 80 | 81 | else: 82 | # use the MSRA modeling logic to build the backbone. 
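        # i.e. detectron2's native ResNet implementation, configured via the cfg.MODEL.RESNETS options.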
83 | bottom_up = build_resnet_backbone(cfg, input_shape) 84 | 85 | in_features = cfg.MODEL.FPN.IN_FEATURES 86 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 87 | 88 | backbone = FPN( 89 | bottom_up=bottom_up, 90 | in_features=in_features, 91 | out_channels=out_channels, 92 | norm=cfg.MODEL.FPN.NORM, 93 | top_block=LastLevelMaxPool(), 94 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 95 | ) 96 | return backbone 97 | -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/sam.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import ShapeSpec 2 | from detectron2.modeling.backbone import Backbone 3 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 4 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool, FPN 5 | from detectron2.modeling.backbone.vit import SimpleFeaturePyramid 6 | import torch 7 | from torch import nn 8 | from torchvision import models 9 | import torch.nn.functional as F 10 | 11 | from pathlib import Path 12 | from urllib.request import urlretrieve 13 | from segment_anything import sam_model_registry 14 | import numpy as np 15 | import einops as E 16 | import unittest 17 | 18 | # reference: https://github.com/mbanani/probe3d/blob/c52d00b069d949b2f00c544d4991716df68d5233/evals/models/sam.py 19 | class SAMBackbone(Backbone): 20 | def __init__(self, cfg, input_shape, checkpoint="facebook/vit-mae-base", output="dense", layer=-1, return_multilayer=False, out_feature="last_feat",): 21 | super().__init__() 22 | 23 | assert output in ["cls", "gap", "dense", "dense-cls"] 24 | self.output = output 25 | 26 | # get model 27 | ckpt_file = "sam_vit_b_01ec64.pth" 28 | ckpt_path = Path("checkpoints") / ckpt_file 29 | 30 | ckpt_path.parent.mkdir(parents=True, exist_ok=True) 31 | 32 | if not ckpt_path.exists(): 33 | download_path = ( 34 | f"https://dl.fbaipublicfiles.com/segment_anything/{ckpt_file}" 35 | ) 36 | urlretrieve(download_path, ckpt_path) 37 | 38 | sam = sam_model_registry['vit_b'](checkpoint=ckpt_path) 39 | vit = sam.image_encoder 40 | 41 | feat_dim = vit.neck[0].in_channels 42 | emb_h, emb_w = vit.pos_embed.shape[1:3] 43 | self.patch_size = vit.patch_embed.proj.kernel_size[0] 44 | self.image_size = (emb_h * self.patch_size, emb_w * self.patch_size) 45 | assert self.patch_size == 16 46 | 47 | self.vit = vit 48 | 49 | 50 | num_layers = len(self.vit.blocks) 51 | multilayers = [ 52 | num_layers // 4 - 1, 53 | num_layers // 2 - 1, 54 | num_layers // 4 * 3 - 1, 55 | num_layers - 1, 56 | ] 57 | 58 | if return_multilayer: 59 | self.feat_dim = [feat_dim, feat_dim, feat_dim, feat_dim] 60 | self.multilayers = multilayers 61 | else: 62 | self.feat_dim = feat_dim 63 | layer = multilayers[-1] if layer == -1 else layer 64 | self.multilayers = [layer] 65 | 66 | # define layer name (for logging) 67 | self.layer = "-".join(str(_x) for _x in self.multilayers) 68 | 69 | self._out_feature_channels = {out_feature: feat_dim} 70 | self._out_feature_strides = {out_feature: self.patch_size} 71 | self._out_features = [out_feature] 72 | 73 | def resize_pos_embed(self, image_size): 74 | # get embed size 75 | h, w = image_size 76 | h = h // self.patch_size 77 | w = w // self.patch_size 78 | 79 | # resize embed 80 | pos_embed = self.vit.pos_embed.data.permute(0, 3, 1, 2) 81 | pos_embed = torch.nn.functional.interpolate( 82 | pos_embed, size=(h, w), mode="bicubic" 83 | ) 84 | pos_embed = pos_embed.permute(0, 2, 3, 1) 85 | self.vit.pos_embed.data = pos_embed 86 | self.image_size = 
image_size 87 | 88 | def forward(self, x): 89 | _, _, h, w = x.shape 90 | assert h % self.patch_size == 0 and w % self.patch_size == 0, f"{h}, {w}" 91 | 92 | if h != self.image_size[0] or w != self.image_size[1]: 93 | self.resize_pos_embed(image_size=(h, w)) 94 | 95 | # run vit 96 | x = self.vit.patch_embed(x) 97 | if self.vit.pos_embed is not None: 98 | x = x + self.vit.pos_embed 99 | 100 | embeds = [] 101 | for i, blk in enumerate(self.vit.blocks): 102 | x = blk(x) 103 | if i in self.multilayers: 104 | embeds.append(x) 105 | if len(embeds) == len(self.multilayers): 106 | break 107 | 108 | # feat shape is batch x feat_dim x height x width 109 | embeds = [_emb.permute(0, 3, 1, 2).contiguous() for _emb in embeds] 110 | outputs = {self._out_features[i]: embeds[i] for i in range(len(self.multilayers))} 111 | return outputs 112 | 113 | 114 | @BACKBONE_REGISTRY.register() 115 | def build_sam_backbone(cfg, input_shape: ShapeSpec, priors=None): 116 | output = cfg.MODEL.SAM.OUTPUT 117 | layer = cfg.MODEL.SAM.LAYER 118 | return_multilayer = cfg.MODEL.SAM.RETURN_MULTILAYER 119 | 120 | bottom_up = SAMBackbone( 121 | cfg, 122 | input_shape, 123 | output=output, 124 | layer=layer, 125 | return_multilayer=return_multilayer, 126 | ) 127 | 128 | in_feature = cfg.MODEL.FPN.IN_FEATURE 129 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 130 | scale_factors = (4.0, 2.0, 1.0, 0.5) 131 | backbone = SimpleFeaturePyramid( 132 | net=bottom_up, 133 | in_feature=in_feature, 134 | out_channels=out_channels, 135 | scale_factors=scale_factors, 136 | norm=cfg.MODEL.FPN.NORM, 137 | top_block=None, 138 | square_pad=cfg.MODEL.FPN.SQUARE_PAD 139 | ) 140 | return backbone 141 | 142 | 143 | class TestSAMBackbone(unittest.TestCase): 144 | def setUp(self): 145 | # Mock configuration 146 | self.cfg = type('', (), {})() 147 | self.cfg.MODEL = type('', (), {})() 148 | self.cfg.MODEL.SAM = type('', (), {})() 149 | self.cfg.MODEL.SAM.OUTPUT = "dense" 150 | self.cfg.MODEL.SAM.LAYER = -1 151 | self.cfg.MODEL.SAM.RETURN_MULTILAYER = False 152 | self.cfg.MODEL.FPN = type('', (), {})() 153 | self.cfg.MODEL.FPN.IN_FEATURE = 'last_feat' 154 | self.cfg.MODEL.FPN.OUT_CHANNELS = 256 155 | self.cfg.MODEL.FPN.NORM = "LN" 156 | self.cfg.MODEL.FPN.FUSE_TYPE = "sum" 157 | self.cfg.MODEL.FPN.SQUARE_PAD = 1024 158 | self.input_shape = ShapeSpec(channels=3, height=1024, width=1024) 159 | 160 | def test_sam_backbone_forward(self): 161 | # Create the backbone 162 | backbone = build_sam_backbone(self.cfg, self.input_shape) 163 | # Generate a random input tensor 164 | x = torch.randn(2, 3, 1024, 1024) 165 | # Run forward pass 166 | outputs = backbone(x) 167 | print(backbone.net.output_shape()) 168 | for key, output in outputs.items(): 169 | print(key, output.shape) 170 | 171 | # print(backbone.net.vit) 172 | 173 | 174 | if __name__ == "__main__": 175 | unittest.main() -------------------------------------------------------------------------------- /cubercnn/modeling/backbone/shufflenet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from torchvision import models 3 | from detectron2.layers import ShapeSpec 4 | from detectron2.modeling.backbone import Backbone 5 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling.backbone.fpn import FPN 9 | 10 | class ShufflenetBackbone(Backbone): 11 | def __init__(self, cfg, input_shape, pretrained=True): 12 | super().__init__() 13 | 14 | base = models.shufflenet_v2_x1_0(pretrained) 15 | self.conv1 = base.conv1 16 | self.maxpool = base.maxpool 17 | self.stage2 = base.stage2 18 | self.stage3 = base.stage3 19 | self.stage4 = base.stage4 20 | self.conv5 = base.conv5 21 | 22 | self._out_feature_channels = {'p2': 24, 'p3': 116, 'p4': 232, 'p5': 464, 'p6': 464} 23 | self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} 24 | self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] 25 | 26 | def forward(self, x): 27 | 28 | outputs = {} 29 | 30 | x = self.conv1(x) 31 | p2 = self.maxpool(x) 32 | p3 = self.stage2(p2) 33 | p4 = self.stage3(p3) 34 | p5 = self.stage4(p4) 35 | p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) 36 | 37 | outputs['p2'] = p2 38 | outputs['p3'] = p3 39 | outputs['p4'] = p4 40 | outputs['p5'] = p5 41 | outputs['p6'] = p6 42 | 43 | return outputs 44 | 45 | 46 | @BACKBONE_REGISTRY.register() 47 | def build_shufflenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): 48 | """ 49 | Args: 50 | cfg: a detectron2 CfgNode 51 | 52 | Returns: 53 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 54 | """ 55 | 56 | imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' 57 | 58 | bottom_up = ShufflenetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) 59 | in_features = cfg.MODEL.FPN.IN_FEATURES 60 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 61 | 62 | backbone = FPN( 63 | bottom_up=bottom_up, 64 | in_features=in_features, 65 | out_channels=out_channels, 66 | norm=cfg.MODEL.FPN.NORM, 67 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 68 | ) 69 | return backbone 70 | -------------------------------------------------------------------------------- /cubercnn/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | from .rcnn3d import * -------------------------------------------------------------------------------- /cubercnn/modeling/meta_arch/rcnn3d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | from typing import Dict, List, Optional 3 | import torch 4 | import numpy as np 5 | from detectron2.layers import ShapeSpec, batched_nms 6 | from detectron2.utils.visualizer import Visualizer 7 | from detectron2.data.detection_utils import convert_image_to_rgb 8 | from detectron2.structures import Instances 9 | from detectron2.utils.events import get_event_storage 10 | from detectron2.data import MetadataCatalog 11 | 12 | from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY 13 | from detectron2.modeling.proposal_generator import build_proposal_generator 14 | from detectron2.utils.logger import _log_api_usage 15 | from detectron2.modeling.meta_arch import ( 16 | META_ARCH_REGISTRY, GeneralizedRCNN 17 | ) 18 | from cubercnn.modeling.roi_heads import build_roi_heads 19 | 20 | from detectron2.data import MetadataCatalog 21 | from pytorch3d.transforms import rotation_6d_to_matrix 22 | from cubercnn.modeling.roi_heads import build_roi_heads 23 | from cubercnn import util, vis 24 | 25 | @META_ARCH_REGISTRY.register() 26 | class RCNN3D(GeneralizedRCNN): 27 | 28 | @classmethod 29 | def from_config(cls, cfg, priors=None): 30 | backbone = build_backbone(cfg, priors=priors) 31 | return { 32 | "backbone": backbone, 33 | "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), 34 | "roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors), 35 | "input_format": cfg.INPUT.FORMAT, 36 | "vis_period": cfg.VIS_PERIOD, 37 | "pixel_mean": cfg.MODEL.PIXEL_MEAN, 38 | "pixel_std": cfg.MODEL.PIXEL_STD, 39 | } 40 | 41 | def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): 42 | 43 | if not self.training: 44 | return self.inference(batched_inputs) 45 | 46 | images = self.preprocess_image(batched_inputs) 47 | 48 | # scaling factor for the sample relative to its original scale 49 | # e.g., how much has the image been upsampled by? or downsampled? 50 | im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)] 51 | 52 | # The unmodified intrinsics for the image 53 | Ks = [torch.FloatTensor(info['K']) for info in batched_inputs] 54 | 55 | if "instances" in batched_inputs[0]: 56 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 57 | else: 58 | gt_instances = None 59 | 60 | features = self.backbone(images.tensor) 61 | proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) 62 | 63 | instances, detector_losses = self.roi_heads( 64 | images, features, proposals, 65 | Ks, im_scales_ratio, 66 | gt_instances 67 | ) 68 | 69 | if self.vis_period > 0: 70 | storage = get_event_storage() 71 | if storage.iter % self.vis_period == 0 and storage.iter > 0: 72 | self.visualize_training(batched_inputs, proposals, instances) 73 | 74 | losses = {} 75 | losses.update(detector_losses) 76 | losses.update(proposal_losses) 77 | return losses 78 | 79 | def inference( 80 | self, 81 | batched_inputs: List[Dict[str, torch.Tensor]], 82 | detected_instances: Optional[List[Instances]] = None, 83 | do_postprocess: bool = True, 84 | ): 85 | assert not self.training 86 | 87 | images = self.preprocess_image(batched_inputs) 88 | 89 | # scaling factor for the sample relative to its original scale 90 | # e.g., how much has the image been upsampled by? or downsampled? 
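        # Illustrative (hypothetical) numbers for the ratio computed on the next line:
        # if an image annotated at 480 pixels tall was resized by the dataloader to
        # 800 pixels tall, then info['height'] / im.shape[1] -> 480 / 800 = 0.6.
        # This per-image ratio is passed to the ROI heads together with the
        # unmodified intrinsics Ks so predictions can be mapped back to the
        # original resolution.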
91 | im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)] 92 | 93 | # The unmodified intrinsics for the image 94 | Ks = [torch.FloatTensor(info['K']) for info in batched_inputs] 95 | 96 | features = self.backbone(images.tensor) 97 | 98 | # Pass oracle 2D boxes into the RoI heads 99 | if type(batched_inputs == list) and np.any(['oracle2D' in b for b in batched_inputs]): 100 | oracles = [b['oracle2D'] for b in batched_inputs] 101 | results, _ = self.roi_heads(images, features, oracles, Ks, im_scales_ratio, None) 102 | 103 | # normal inference 104 | else: 105 | proposals, _ = self.proposal_generator(images, features, None) 106 | if np.any(['category_list' in b for b in batched_inputs]): 107 | # Gronding DINO inference is only supported to one image at one batch 108 | results, _ = self.roi_heads(images, features, proposals, Ks, im_scales_ratio, None, category_list=batched_inputs[0]["category_list"]) 109 | else: 110 | results, _ = self.roi_heads(images, features, proposals, Ks, im_scales_ratio, None) 111 | 112 | if do_postprocess: 113 | assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess." 114 | return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes) 115 | else: 116 | return results 117 | 118 | def visualize_training(self, batched_inputs, proposals, instances): 119 | """ 120 | A function used to visualize images and proposals. It shows ground truth 121 | bounding boxes on the original image and up to 20 top-scoring predicted 122 | object proposals on the original image. Users can implement different 123 | visualization functions for different models. 124 | Args: 125 | batched_inputs (list): a list that contains input to the model. 126 | proposals (list): a list that contains predicted proposals. Both 127 | batched_inputs and proposals should have the same length. 128 | instances (list): a list that contains predicted RoIhead instances. Both 129 | batched_inputs and proposals should have the same length. 
130 | """ 131 | 132 | storage = get_event_storage() 133 | 134 | # minimum number of boxes to try to visualize per image 135 | max_vis_prop = 20 136 | 137 | if not hasattr(self, 'thing_classes'): 138 | self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes 139 | self.num_classes = len(self.thing_classes) 140 | 141 | for input, prop, instances_i in zip(batched_inputs, proposals, instances): 142 | 143 | img = input["image"] 144 | img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format) 145 | img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR 146 | img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR 147 | 148 | ''' 149 | Visualize the 2D GT and proposal predictions 150 | ''' 151 | v_gt = Visualizer(img, None) 152 | v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes) 153 | anno_img = v_gt.get_image() 154 | box_size = min(len(prop.proposal_boxes), max_vis_prop) 155 | v_pred = Visualizer(img, None) 156 | v_pred = v_pred.overlay_instances( 157 | boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy() 158 | ) 159 | prop_img = v_pred.get_image() 160 | vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1) 161 | vis_img_rpn = vis_img_rpn.transpose(2, 0, 1) 162 | storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn) 163 | 164 | ''' 165 | Visualize the 3D GT and predictions 166 | ''' 167 | K = torch.tensor(input['K'], device=self.device) 168 | scale = input['height']/img.shape[0] 169 | fx, sx = (val.item()/scale for val in K[0, [0, 2]]) 170 | fy, sy = (val.item()/scale for val in K[1, [1, 2]]) 171 | 172 | K_scaled = torch.tensor( 173 | [[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]], 174 | dtype=torch.float32, device=self.device 175 | ) @ K 176 | 177 | gts_per_image = input["instances"] 178 | 179 | gt_classes = gts_per_image.gt_classes 180 | 181 | # Filter out irrelevant groundtruth 182 | fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes) 183 | 184 | gt_classes = gt_classes[fg_selection_mask] 185 | gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes] 186 | gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes 187 | gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses 188 | 189 | # projected 2D center, depth, w, h, l, 3D center 190 | gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask] 191 | 192 | # this box may have been mirrored and scaled so 193 | # we need to recompute XYZ in 3D by backprojecting. 194 | gt_z = gt_boxes3D[:, 2] 195 | 196 | gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx 197 | gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy 198 | 199 | # put together the GT boxes 200 | gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T 201 | gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1) 202 | 203 | gt_colors = torch.tensor( 204 | [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))], 205 | device=self.device 206 | )/255.0 207 | 208 | gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors) 209 | 210 | # perform a simple NMS, which is not cls dependent. 
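            # Passing an all-zero index tensor below makes batched_nms act as plain,
            # class-agnostic NMS, since batched_nms only suppresses boxes that share
            # the same index. A rough equivalent sketch, assuming torchvision is available:
            #   from torchvision.ops import nms
            #   keep = nms(instances_i.pred_boxes.tensor, instances_i.scores,
            #              self.roi_heads.box_predictor.test_nms_thresh)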
211 | keep = batched_nms( 212 | instances_i.pred_boxes.tensor, 213 | instances_i.scores, 214 | torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device), 215 | self.roi_heads.box_predictor.test_nms_thresh 216 | ) 217 | 218 | keep = keep[:max_vis_prop] 219 | num_to_visualize = len(keep) 220 | 221 | pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1) 222 | pred_pose = instances_i.pred_pose[keep] 223 | 224 | pred_colors = torch.tensor( 225 | [util.get_color(i) for i in range(num_to_visualize)], 226 | device=self.device 227 | )/255.0 228 | 229 | pred_boxes = instances_i.pred_boxes[keep] 230 | pred_scores = instances_i.scores[keep] 231 | pred_classes = instances_i.pred_classes[keep] 232 | pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)] 233 | pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors) 234 | 235 | # convert to lists 236 | pred_meshes = [pred_meshes.__getitem__(i).detach() for i in range(len(pred_meshes))] 237 | gt_meshes = [gt_meshes.__getitem__(i) for i in range(len(gt_meshes))] 238 | 239 | img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85) 240 | img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85) 241 | 242 | # horizontal stack 3D GT and pred left/right 243 | vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1) 244 | vis_img_3d = vis_img_3d[:, :, [2, 1, 0]] # RGB 245 | vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1) 246 | 247 | storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d) 248 | 249 | break # only visualize one image in a batch 250 | 251 | def build_model(cfg, priors=None): 252 | """ 253 | Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``. 254 | Note that it does not load any weights from ``cfg``. 255 | """ 256 | meta_arch = cfg.MODEL.META_ARCHITECTURE 257 | model = META_ARCH_REGISTRY.get(meta_arch)(cfg, priors=priors) 258 | model.to(torch.device(cfg.MODEL.DEVICE)) 259 | _log_api_usage("modeling.meta_arch." + meta_arch) 260 | return model 261 | 262 | 263 | def build_backbone(cfg, input_shape=None, priors=None): 264 | """ 265 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
266 | 267 | Returns: 268 | an instance of :class:`Backbone` 269 | """ 270 | if input_shape is None: 271 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 272 | 273 | backbone_name = cfg.MODEL.BACKBONE.NAME 274 | backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape, priors) 275 | assert isinstance(backbone, Backbone) 276 | return backbone -------------------------------------------------------------------------------- /cubercnn/modeling/proposal_generator/__init__.py: -------------------------------------------------------------------------------- 1 | from .rpn import * 2 | -------------------------------------------------------------------------------- /cubercnn/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .roi_heads import * 2 | from .roi_heads_gdino import * -------------------------------------------------------------------------------- /cubercnn/modeling/roi_heads/cube_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.utils.registry import Registry 3 | from typing import Dict, List, Optional 4 | from detectron2.layers import ShapeSpec 5 | from torch import nn 6 | import torch 7 | import numpy as np 8 | import fvcore.nn.weight_init as weight_init 9 | 10 | from pytorch3d.transforms.rotation_conversions import _copysign 11 | from pytorch3d.transforms import ( 12 | rotation_6d_to_matrix, 13 | euler_angles_to_matrix, 14 | quaternion_to_matrix 15 | ) 16 | 17 | 18 | ROI_CUBE_HEAD_REGISTRY = Registry("ROI_CUBE_HEAD") 19 | 20 | @ROI_CUBE_HEAD_REGISTRY.register() 21 | class CubeHead(nn.Module): 22 | 23 | def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]): 24 | super().__init__() 25 | 26 | #------------------------------------------- 27 | # Settings 28 | #------------------------------------------- 29 | self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES 30 | self.use_conf = cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE 31 | self.z_type = cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE 32 | self.pose_type = cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE 33 | self.cluster_bins = cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS 34 | self.shared_fc = cfg.MODEL.ROI_CUBE_HEAD.SHARED_FC 35 | self.use_prior = cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED 36 | #------------------------------------------- 37 | # Feature generator 38 | #------------------------------------------- 39 | 40 | num_conv = cfg.MODEL.ROI_CUBE_HEAD.NUM_CONV 41 | conv_dim = cfg.MODEL.ROI_CUBE_HEAD.CONV_DIM 42 | num_fc = cfg.MODEL.ROI_CUBE_HEAD.NUM_FC 43 | fc_dim = cfg.MODEL.ROI_CUBE_HEAD.FC_DIM 44 | 45 | conv_dims = [conv_dim] * num_conv 46 | fc_dims = [fc_dim] * num_fc 47 | 48 | assert len(conv_dims) + len(fc_dims) > 0 49 | 50 | self._output_size = (input_shape.channels, input_shape.height, input_shape.width) 51 | 52 | if self.shared_fc: 53 | self.feature_generator = nn.Sequential() 54 | else: 55 | self.feature_generator_XY = nn.Sequential() 56 | self.feature_generator_dims = nn.Sequential() 57 | self.feature_generator_pose = nn.Sequential() 58 | self.feature_generator_Z = nn.Sequential() 59 | 60 | if self.use_conf: 61 | self.feature_generator_conf = nn.Sequential() 62 | 63 | # create fully connected layers for Cube Head 64 | for k, fc_dim in enumerate(fc_dims): 65 | 66 | fc_dim_in = int(np.prod(self._output_size)) 67 | 68 | self._output_size = fc_dim 69 | 70 | if self.shared_fc: 71 | fc = nn.Linear(fc_dim_in, fc_dim) 72 | weight_init.c2_xavier_fill(fc) 73 | 
self.feature_generator.add_module("fc{}".format(k + 1), fc) 74 | self.feature_generator.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 75 | 76 | else: 77 | 78 | fc = nn.Linear(fc_dim_in, fc_dim) 79 | weight_init.c2_xavier_fill(fc) 80 | self.feature_generator_dims.add_module("fc{}".format(k + 1), fc) 81 | self.feature_generator_dims.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 82 | 83 | fc = nn.Linear(fc_dim_in, fc_dim) 84 | weight_init.c2_xavier_fill(fc) 85 | self.feature_generator_XY.add_module("fc{}".format(k + 1), fc) 86 | self.feature_generator_XY.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 87 | 88 | fc = nn.Linear(fc_dim_in, fc_dim) 89 | weight_init.c2_xavier_fill(fc) 90 | self.feature_generator_pose.add_module("fc{}".format(k + 1), fc) 91 | self.feature_generator_pose.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 92 | 93 | fc = nn.Linear(fc_dim_in, fc_dim) 94 | weight_init.c2_xavier_fill(fc) 95 | self.feature_generator_Z.add_module("fc{}".format(k + 1), fc) 96 | self.feature_generator_Z.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 97 | 98 | if self.use_conf: 99 | fc = nn.Linear(fc_dim_in, fc_dim) 100 | weight_init.c2_xavier_fill(fc) 101 | self.feature_generator_conf.add_module("fc{}".format(k + 1), fc) 102 | self.feature_generator_conf.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 103 | 104 | #------------------------------------------- 105 | # 3D outputs 106 | #------------------------------------------- 107 | output_multiple_factor = self.num_classes if self.use_prior else 1 108 | # Dimensions in meters (width, height, length) 109 | self.bbox_3D_dims = nn.Linear(self._output_size, output_multiple_factor*3) 110 | nn.init.normal_(self.bbox_3D_dims.weight, std=0.001) 111 | nn.init.constant_(self.bbox_3D_dims.bias, 0) 112 | 113 | cluster_bins = self.cluster_bins if self.cluster_bins > 1 else 1 114 | 115 | # XY 116 | self.bbox_3D_center_deltas = nn.Linear(self._output_size, output_multiple_factor*2) 117 | nn.init.normal_(self.bbox_3D_center_deltas.weight, std=0.001) 118 | nn.init.constant_(self.bbox_3D_center_deltas.bias, 0) 119 | 120 | # Pose 121 | if self.pose_type == '6d': 122 | self.bbox_3D_pose = nn.Linear(self._output_size, output_multiple_factor*6) 123 | 124 | elif self.pose_type == 'quaternion': 125 | self.bbox_3D_pose = nn.Linear(self._output_size, output_multiple_factor*4) 126 | 127 | elif self.pose_type == 'euler': 128 | self.bbox_3D_pose = nn.Linear(self._output_size, output_multiple_factor*3) 129 | 130 | else: 131 | raise ValueError('Cuboid pose type {} is not recognized'.format(self.pose_type)) 132 | 133 | nn.init.normal_(self.bbox_3D_pose.weight, std=0.001) 134 | nn.init.constant_(self.bbox_3D_pose.bias, 0) 135 | 136 | # Z 137 | self.bbox_3D_center_depth = nn.Linear(self._output_size, output_multiple_factor*cluster_bins) 138 | nn.init.normal_(self.bbox_3D_center_depth.weight, std=0.001) 139 | nn.init.constant_(self.bbox_3D_center_depth.bias, 0) 140 | 141 | # Optionally, box confidence 142 | if self.use_conf: 143 | self.bbox_3D_uncertainty = nn.Linear(self._output_size, output_multiple_factor*1) 144 | nn.init.normal_(self.bbox_3D_uncertainty.weight, std=0.001) 145 | nn.init.constant_(self.bbox_3D_uncertainty.bias, 5) 146 | 147 | 148 | def forward(self, x, num_boxes_per_image: Optional[List[int]] = None): 149 | 150 | n = x.shape[0] 151 | 152 | box_z = None 153 | box_uncert = None 154 | box_2d_deltas = None 155 | 156 | if self.shared_fc: 157 | features = self.feature_generator(x) 158 | box_2d_deltas = self.bbox_3D_center_deltas(features) 159 | box_dims = 
self.bbox_3D_dims(features) 160 | box_pose = self.bbox_3D_pose(features) 161 | box_z = self.bbox_3D_center_depth(features) 162 | 163 | if self.use_conf: 164 | box_uncert = self.bbox_3D_uncertainty(features).clip(0.01) 165 | else: 166 | 167 | box_2d_deltas = self.bbox_3D_center_deltas(self.feature_generator_XY(x)) 168 | box_dims = self.bbox_3D_dims(self.feature_generator_dims(x)) 169 | box_pose = self.bbox_3D_pose(self.feature_generator_pose(x)) 170 | box_z = self.bbox_3D_center_depth(self.feature_generator_Z(x)) 171 | 172 | if self.use_conf: 173 | box_uncert = self.bbox_3D_uncertainty(self.feature_generator_conf(x)).clip(0.01) 174 | 175 | # Pose 176 | if self.pose_type == '6d': 177 | box_pose = rotation_6d_to_matrix(box_pose.view(-1, 6)) 178 | 179 | elif self.pose_type == 'quaternion': 180 | quats = box_pose.view(-1, 4) 181 | quats_scales = (quats * quats).sum(1) 182 | quats = quats / _copysign(torch.sqrt(quats_scales), quats[:, 0])[:, None] 183 | box_pose = quaternion_to_matrix(quats) 184 | 185 | elif self.pose_type == 'euler': 186 | box_pose = euler_angles_to_matrix(box_pose.view(-1, 3), 'XYZ') 187 | if self.use_prior: 188 | box_2d_deltas = box_2d_deltas.view(n, self.num_classes, 2) 189 | box_dims = box_dims.view(n, self.num_classes, 3) 190 | box_pose = box_pose.view(n, self.num_classes, 3, 3) 191 | 192 | if self.cluster_bins > 1: 193 | if self.use_prior: 194 | box_z = box_z.view(n, self.cluster_bins, self.num_classes, -1) 195 | else: 196 | box_z = box_z.view(n, self.cluster_bins, -1) 197 | 198 | else: 199 | if self.use_prior: 200 | box_z = box_z.view(n, self.num_classes, -1) 201 | else: 202 | box_z = box_z.view(n, -1) 203 | 204 | return box_2d_deltas, box_z, box_dims, box_pose, box_uncert 205 | 206 | 207 | def build_cube_head(cfg, input_shape: Dict[str, ShapeSpec]): 208 | name = cfg.MODEL.ROI_CUBE_HEAD.NAME 209 | return ROI_CUBE_HEAD_REGISTRY.get(name)(cfg, input_shape) -------------------------------------------------------------------------------- /cubercnn/modeling/roi_heads/fast_rcnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from re import L 3 | import torch 4 | from torch.nn import functional as F 5 | from typing import List, Tuple 6 | 7 | from fvcore.nn import giou_loss, smooth_l1_loss 8 | from detectron2.utils.events import get_event_storage 9 | from detectron2.layers import cat, cross_entropy, nonzero_tuple, batched_nms 10 | from detectron2.structures import Instances, Boxes 11 | from detectron2.modeling.roi_heads.fast_rcnn import ( 12 | FastRCNNOutputLayers, _log_classification_stats 13 | ) 14 | from cubercnn.modeling.proposal_generator.rpn import matched_pairwise_iou 15 | 16 | def fast_rcnn_inference( 17 | boxes: List[torch.Tensor], 18 | scores: List[torch.Tensor], 19 | image_shapes: List[Tuple[int, int]], 20 | score_thresh: float, 21 | nms_thresh: float, 22 | topk_per_image: int, 23 | ): 24 | """ 25 | Call `fast_rcnn_inference_single_image` for all images. 26 | 27 | Args: 28 | boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic 29 | boxes for each image. Element i has shape (Ri, K * 4) if doing 30 | class-specific regression, or (Ri, 4) if doing class-agnostic 31 | regression, where Ri is the number of predicted objects for image i. 32 | This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`. 33 | scores (list[Tensor]): A list of Tensors of predicted class scores for each image. 
34 | Element i has shape (Ri, K + 1), where Ri is the number of predicted objects 35 | for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`. 36 | image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch. 37 | score_thresh (float): Only return detections with a confidence score exceeding this 38 | threshold. 39 | nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. 40 | topk_per_image (int): The number of top scoring detections to return. Set < 0 to return 41 | all detections. 42 | 43 | Returns: 44 | instances: (list[Instances]): A list of N instances, one for each image in the batch, 45 | that stores the topk most confidence detections. 46 | kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates 47 | the corresponding boxes/scores index in [0, Ri) from the input, for image i. 48 | """ 49 | result_per_image = [ 50 | fast_rcnn_inference_single_image( 51 | boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image 52 | ) 53 | for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes) 54 | ] 55 | return [x[0] for x in result_per_image], [x[1] for x in result_per_image] 56 | 57 | def fast_rcnn_inference_single_image( 58 | boxes, 59 | scores, 60 | image_shape: Tuple[int, int], 61 | score_thresh: float, 62 | nms_thresh: float, 63 | topk_per_image: int, 64 | ): 65 | """ 66 | Single-image inference. Return bounding-box detection results by thresholding 67 | on scores and applying non-maximum suppression (NMS). 68 | 69 | Args: 70 | Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes 71 | per image. 72 | 73 | Returns: 74 | Same as `fast_rcnn_inference`, but for only one image. 75 | """ 76 | valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1) 77 | if not valid_mask.all(): 78 | boxes = boxes[valid_mask] 79 | scores = scores[valid_mask] 80 | 81 | scores = scores[:, :-1] 82 | num_bbox_reg_classes = boxes.shape[1] // 4 83 | 84 | # Convert to Boxes to use the `clip` function ... 85 | boxes = Boxes(boxes.reshape(-1, 4)) 86 | boxes.clip(image_shape) 87 | boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 88 | 89 | # 1. Filter results based on detection scores. It can make NMS more efficient 90 | # by filtering out low-confidence detections. 91 | filter_mask = scores > score_thresh # R x K 92 | 93 | # R' x 2. First column contains indices of the R predictions; 94 | # Second column contains indices of classes. 95 | filter_inds = filter_mask.nonzero() 96 | if num_bbox_reg_classes == 1: 97 | boxes = boxes[filter_inds[:, 0], 0] 98 | else: 99 | boxes = boxes[filter_mask] 100 | 101 | scores_full = scores[filter_inds[:, 0]] 102 | scores = scores[filter_mask] 103 | 104 | # 2. Apply NMS for each class independently. 
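    # filter_inds[:, 1] carries each surviving box's class index, so batched_nms only
    # suppresses overlapping boxes of the same class. Conceptually (a sketch, not the
    # exact implementation): for every class c, run plain NMS on the subset
    # boxes[filter_inds[:, 1] == c] and keep the union of the surviving indices.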
105 | keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh) 106 | if topk_per_image >= 0: 107 | keep = keep[:topk_per_image] 108 | 109 | boxes, scores, filter_inds, scores_full = boxes[keep], scores[keep], filter_inds[keep], scores_full[keep] 110 | 111 | result = Instances(image_shape) 112 | result.pred_boxes = Boxes(boxes) 113 | result.scores = scores 114 | result.scores_full = scores_full 115 | result.pred_classes = filter_inds[:, 1] 116 | return result, filter_inds[:, 0] 117 | 118 | 119 | class FastRCNNOutputs(FastRCNNOutputLayers): 120 | 121 | def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]): 122 | """ 123 | Args: 124 | predictions: return values of :meth:`forward()`. 125 | proposals (list[Instances]): proposals that match the features that were 126 | used to compute predictions. The ``proposal_boxes`` field is expected. 127 | 128 | Returns: 129 | list[Instances]: same as `fast_rcnn_inference`. 130 | list[Tensor]: same as `fast_rcnn_inference`. 131 | """ 132 | boxes = self.predict_boxes(predictions, proposals) 133 | scores = self.predict_probs(predictions, proposals) 134 | 135 | image_shapes = [x.image_size for x in proposals] 136 | return fast_rcnn_inference( 137 | boxes, 138 | scores, 139 | image_shapes, 140 | self.test_score_thresh, 141 | self.test_nms_thresh, 142 | self.test_topk_per_image, 143 | ) 144 | 145 | def losses(self, predictions, proposals): 146 | """ 147 | Args: 148 | predictions: return values of :meth:`forward()`. 149 | proposals (list[Instances]): proposals that match the features that were used 150 | to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``, 151 | ``gt_classes`` are expected. 152 | 153 | Returns: 154 | Dict[str, Tensor]: dict of losses 155 | """ 156 | scores, proposal_deltas = predictions 157 | 158 | # parse classification outputs 159 | gt_classes = ( 160 | cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0) 161 | ) 162 | 163 | # parse box regression outputs 164 | if len(proposals): 165 | proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) # Nx4 166 | assert not proposal_boxes.requires_grad, "Proposals should not require gradients!" 167 | # If "gt_boxes" does not exist, the proposals must be all negative and 168 | # should not be included in regression loss computation. 169 | # Here we just use proposal_boxes as an arbitrary placeholder because its 170 | # value won't be used in self.box_reg_loss(). 171 | gt_boxes = cat( 172 | [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals], 173 | dim=0, 174 | ) 175 | else: 176 | proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device) 177 | 178 | 179 | normalize_factor = max(gt_classes.numel(), 1.0) 180 | 181 | ''' 182 | Standard Faster R-CNN losses 183 | ''' 184 | _log_classification_stats(scores, gt_classes) 185 | loss_cls = cross_entropy(scores, gt_classes, reduction="mean") 186 | loss_box_reg = self.box_reg_loss(proposal_boxes, gt_boxes, proposal_deltas, gt_classes, reduction="none") 187 | loss_box_reg = (loss_box_reg).sum() / normalize_factor 188 | 189 | losses = { 190 | "BoxHead/loss_cls": loss_cls, 191 | "BoxHead/loss_box_reg": loss_box_reg, 192 | } 193 | 194 | return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()} 195 | 196 | def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes, reduction='mean'): 197 | """ 198 | Args: 199 | All boxes are tensors with the same shape Rx(4 or 5). 
200 | gt_classes is a long tensor of shape R, the gt class label of each proposal. 201 | R shall be the number of proposals. 202 | """ 203 | box_dim = proposal_boxes.shape[1] # 4 or 5 204 | 205 | # Regression loss is only computed for foreground proposals (those matched to a GT) 206 | fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0] 207 | if pred_deltas.shape[1] == box_dim: # cls-agnostic regression 208 | fg_pred_deltas = pred_deltas[fg_inds] 209 | else: 210 | fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[ 211 | fg_inds, gt_classes[fg_inds] 212 | ] 213 | 214 | if reduction == 'mean': 215 | if self.box_reg_loss_type == "smooth_l1": 216 | gt_pred_deltas = self.box2box_transform.get_deltas( 217 | proposal_boxes[fg_inds], 218 | gt_boxes[fg_inds], 219 | ) 220 | loss_box_reg = smooth_l1_loss( 221 | fg_pred_deltas, gt_pred_deltas, self.smooth_l1_beta, reduction="sum" 222 | ) 223 | elif self.box_reg_loss_type == "giou": 224 | fg_pred_boxes = self.box2box_transform.apply_deltas( 225 | fg_pred_deltas, proposal_boxes[fg_inds] 226 | ) 227 | loss_box_reg = giou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum") 228 | else: 229 | raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'") 230 | 231 | # The reg loss is normalized using the total number of regions (R), not the number 232 | # of foreground regions even though the box regression loss is only defined on 233 | # foreground regions. Why? Because doing so gives equal training influence to 234 | # each foreground example. To see how, consider two different minibatches: 235 | # (1) Contains a single foreground region 236 | # (2) Contains 100 foreground regions 237 | # If we normalize by the number of foreground regions, the single example in 238 | # minibatch (1) will be given 100 times as much influence as each foreground 239 | # example in minibatch (2). Normalizing by the total number of regions, R, 240 | # means that the single example in minibatch (1) and each of the 100 examples 241 | # in minibatch (2) are given equal influence. 242 | return loss_box_reg / max(gt_classes.numel(), 1.0) # return 0 if empty 243 | 244 | elif reduction == 'none': 245 | if self.box_reg_loss_type == "smooth_l1": 246 | gt_pred_deltas = self.box2box_transform.get_deltas( 247 | proposal_boxes[fg_inds], 248 | gt_boxes[fg_inds], 249 | ) 250 | loss_box_reg = smooth_l1_loss( 251 | fg_pred_deltas, gt_pred_deltas, self.smooth_l1_beta, reduction="none" 252 | ) 253 | else: 254 | raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'") 255 | 256 | # return non-reduced type 257 | return loss_box_reg 258 | 259 | else: 260 | raise ValueError(f"Invalid bbox reg reduction type '{reduction}'") 261 | 262 | -------------------------------------------------------------------------------- /cubercnn/modeling/roi_heads/roi_heads_gdino.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | import sys 3 | sys.path.append('./GroundingDino/') 4 | from cubercnn.modeling.roi_heads.roi_heads import * 5 | 6 | from torchvision.ops import nms 7 | 8 | # GroundingDINO imports 9 | from groundingdino.models import build_model 10 | from groundingdino.util.slconfig import SLConfig 11 | from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap 12 | from groundingdino.util.vl_utils import create_positive_map_from_span 13 | from transformers import AutoTokenizer 14 | 15 | 16 | def load_model(model_config_path, model_checkpoint_path, cpu_only=False): 17 | args = SLConfig.fromfile(model_config_path) 18 | args.device = "cuda" if not cpu_only else "cpu" 19 | model = build_model(args) 20 | checkpoint = torch.load(model_checkpoint_path, map_location="cpu", weights_only=True) 21 | load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False) 22 | _ = model.eval() 23 | return model 24 | 25 | 26 | @ROI_HEADS_REGISTRY.register() 27 | class ROIHeads3DGDINO(ROIHeads3D): 28 | 29 | @configurable 30 | def __init__( 31 | self, 32 | *, 33 | ignore_thresh: float, 34 | cube_head: nn.Module, 35 | cube_pooler: nn.Module, 36 | loss_w_3d: float, 37 | loss_w_xy: float, 38 | loss_w_z: float, 39 | loss_w_dims: float, 40 | loss_w_pose: float, 41 | loss_w_joint: float, 42 | use_confidence: float, 43 | inverse_z_weight: bool, 44 | z_type: str, 45 | pose_type: str, 46 | cluster_bins: int, 47 | priors = None, 48 | dims_priors_enabled = None, 49 | dims_priors_func = None, 50 | disentangled_loss=None, 51 | virtual_depth=None, 52 | virtual_focal=None, 53 | test_scale=None, 54 | allocentric_pose=None, 55 | chamfer_pose=None, 56 | scale_roi_boxes=None, 57 | **kwargs, 58 | ): 59 | super().__init__( 60 | ignore_thresh=ignore_thresh, 61 | cube_head=cube_head, 62 | cube_pooler=cube_pooler, 63 | loss_w_3d=loss_w_3d, 64 | loss_w_xy=loss_w_xy, 65 | loss_w_z=loss_w_z, 66 | loss_w_dims=loss_w_dims, 67 | loss_w_pose=loss_w_pose, 68 | loss_w_joint=loss_w_joint, 69 | use_confidence=use_confidence, 70 | inverse_z_weight=inverse_z_weight, 71 | z_type=z_type, 72 | pose_type=pose_type, 73 | cluster_bins=cluster_bins, 74 | priors=priors, 75 | dims_priors_enabled=dims_priors_enabled, 76 | dims_priors_func=dims_priors_func, 77 | disentangled_loss=disentangled_loss, 78 | virtual_depth=virtual_depth, 79 | virtual_focal=virtual_focal, 80 | test_scale=test_scale, 81 | allocentric_pose=allocentric_pose, 82 | chamfer_pose=chamfer_pose, 83 | scale_roi_boxes=scale_roi_boxes, 84 | **kwargs 85 | ) 86 | 87 | self.groundingdino_model = load_model( 88 | "./configs/GroundingDINO_SwinB_cfg.py", 89 | "./checkpoints/groundingdino_swinb_cogcoor.pth", 90 | cpu_only=False 91 | ) 92 | 93 | def forward(self, images, features, proposals, Ks, im_scales_ratio, targets=None, category_list=None): 94 | 95 | im_dims = [image.shape[1:] for image in images] 96 | 97 | # del images 98 | 99 | if self.training: 100 | proposals = self.label_and_sample_proposals(proposals, targets) 101 | 102 | del targets 103 | 104 | if self.training: 105 | 106 | losses = self._forward_box(features, proposals) 107 | if self.loss_w_3d > 0: 108 | instances_3d, losses_cube = self._forward_cube(features, proposals, Ks, im_dims, im_scales_ratio) 109 | losses.update(losses_cube) 110 | 111 | return instances_3d, losses 112 | 113 | else: 114 | 115 | # when oracle is available, by pass the box forward. 116 | # simulate the predicted instances by creating a new 117 | # instance for each passed in image. 
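            # In this oracle path each proposal is expected to be a plain dict rather
            # than an Instances object, e.g. (hypothetical values):
            #   {'gt_bbox2D': tensor([[x1, y1, x2, y2]]), 'gt_classes': tensor([3])}
            # Each dict is wrapped into an Instances with a constant confidence of 1.0 below.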
118 | if isinstance(proposals, list) and ~np.any([isinstance(p, Instances) for p in proposals]): 119 | pred_instances = [] 120 | for proposal, im_dim in zip(proposals, im_dims): 121 | 122 | pred_instances_i = Instances(im_dim) 123 | pred_instances_i.pred_boxes = Boxes(proposal['gt_bbox2D']) 124 | pred_instances_i.pred_classes = proposal['gt_classes'] 125 | pred_instances_i.scores = torch.ones_like(proposal['gt_classes']).float() 126 | pred_instances.append(pred_instances_i) 127 | else: 128 | pred_instances = self._forward_box(features, proposals) 129 | 130 | if category_list: 131 | filtered_texts = [ [cat] for cat in category_list] 132 | 133 | # Return empty Instances object if no valid text is found 134 | if not filtered_texts: 135 | target = Instances(pred_instances[0].image_size) 136 | target.pred_classes = torch.tensor([], dtype=torch.int64) # Empty class tensor 137 | target.pred_boxes = Boxes(torch.tensor([], dtype=torch.float32).view(-1, 4)) # Empty boxes tensor 138 | target.scores = torch.tensor([], dtype=torch.float32) # Empty scores tensor 139 | target = target.to(device=pred_instances[0].scores.device) 140 | 141 | else: 142 | 143 | # use grounding dino prediction 144 | configs = { 145 | "groundingdino_model": self.groundingdino_model, 146 | "image": images[0][[2, 1, 0], :, :], 147 | "text_prompt": filtered_texts, 148 | "box_threshold": 0.001, 149 | "text_threshold": 0.25, 150 | "token_spans": None, 151 | "cpu_only": False 152 | } 153 | 154 | ov_pred_instances = grounding_dino_inference_detector(configs) 155 | 156 | # init target 157 | target = Instances(pred_instances[0].image_size) 158 | 159 | # add classes, 2D boxes, scores 160 | class_names = ov_pred_instances["labels"] 161 | # h, w = pred_instances[0].image_size 162 | target.pred_classes = torch.tensor([filtered_texts.index([class_name]) for class_name in class_names]) 163 | target.pred_boxes = Boxes( ov_pred_instances["bboxes"]) 164 | # max_scores = [torch.max(score_tensor).item() for score_tensor in ov_pred_instances["scores"]] 165 | # target.scores = torch.tensor(max_scores).float() 166 | target.scores = ov_pred_instances["scores"] 167 | target = target.to(device=pred_instances[0].scores.device) 168 | 169 | if self.loss_w_3d > 0: 170 | pred_instances = self._forward_cube(features, [target,], Ks, im_dims, im_scales_ratio) 171 | return pred_instances, {} 172 | 173 | 174 | def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None): 175 | assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!" 176 | cap_list = [cat[0] for cat in caption ] 177 | caption = " . ".join(cap_list) 178 | caption = caption.lower() 179 | caption = caption.strip() 180 | if not caption.endswith("."): 181 | caption = caption + " ." 
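    # For example (hypothetical categories), caption=[['chair'], ['dining table']] is
    # flattened into the single GroundingDINO text prompt
    #   "chair . dining table ."
    # i.e. category names joined by " . ", lower-cased, and terminated with " .".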
182 | device = "cuda" if not cpu_only else "cpu" 183 | model = model.to(device) 184 | image = image.to(device) 185 | with torch.no_grad(): 186 | outputs = model(image[None], captions=[caption]) 187 | logits = outputs["pred_logits"].sigmoid()[0] # (nq, 256) 188 | boxes = outputs["pred_boxes"][0] # (nq, 4) 189 | 190 | all_logits = [] 191 | 192 | # filter output 193 | if token_spans is None: 194 | tokenlizer = model.tokenizer 195 | tokenized = tokenlizer(caption) 196 | phrases_logits = get_phrase_logits_from_token_logits(logits, tokenized, tokenlizer, cap_list) 197 | filt_mask = phrases_logits.max(dim=1)[0] > box_threshold 198 | im_logits_filt = phrases_logits[filt_mask] 199 | boxes_filt = boxes[filt_mask].cpu() 200 | 201 | im_pred_scores, im_pred_classes = im_logits_filt.max(dim = -1) 202 | all_logits = im_pred_scores.cpu() 203 | pred_phrases = [cap_list[idx] for idx in im_pred_classes] 204 | 205 | else: 206 | # given-phrase mode 207 | positive_maps = create_positive_map_from_span( 208 | model.tokenizer(caption), 209 | token_span=token_spans 210 | ).to(image.device) # n_phrase, 256 211 | 212 | logits_for_phrases = positive_maps @ logits.T # n_phrase, nq 213 | all_logits = [] 214 | all_phrases = [] 215 | all_boxes = [] 216 | for (token_span, logit_phr) in zip(token_spans, logits_for_phrases): 217 | # get phrase 218 | phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span]) 219 | # get mask 220 | filt_mask = logit_phr > box_threshold 221 | # filt box 222 | all_boxes.append(boxes[filt_mask]) 223 | # filt logits 224 | all_logits.append(logit_phr[filt_mask]) 225 | if with_logits: 226 | logit_phr_num = logit_phr[filt_mask] 227 | all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num]) 228 | else: 229 | all_phrases.extend([phrase for _ in range(len(filt_mask))]) 230 | boxes_filt = torch.cat(all_boxes, dim=0).cpu() 231 | pred_phrases = all_phrases 232 | 233 | return boxes_filt, pred_phrases, all_logits 234 | 235 | 236 | def grounding_dino_inference_detector(config): 237 | image = config["image"] 238 | text_prompt = config["text_prompt"] 239 | box_threshold = config["box_threshold"] 240 | text_threshold = config["text_threshold"] 241 | token_spans = config["token_spans"] 242 | cpu_only = config["cpu_only"] 243 | groundingdino_model = config["groundingdino_model"] 244 | 245 | if token_spans is not None: 246 | text_threshold = None 247 | print("Using token_spans. 
Set the text_threshold to None.") 248 | 249 | boxes_filt, pred_phrases, all_logits = get_grounding_output( 250 | groundingdino_model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans=eval(f"{token_spans}") 251 | ) 252 | h, w = image.shape[1:] 253 | boxes_filt = box_cxcywh_to_xyxy(boxes_filt * torch.tensor([w, h, w, h])) 254 | nms_idx = nms(boxes_filt, all_logits, 0.5) 255 | all_logits = all_logits[nms_idx] 256 | boxes_filt = boxes_filt[nms_idx] 257 | pred_phrases = [pred_phrases[idx] for idx in nms_idx] 258 | ov_pred_instances = {} 259 | ov_pred_instances["scores"] = all_logits 260 | ov_pred_instances["bboxes"] = boxes_filt 261 | ov_pred_instances["labels"] = pred_phrases 262 | 263 | return ov_pred_instances 264 | 265 | 266 | def box_cxcywh_to_xyxy(x): 267 | x_c, y_c, w, h = x.unbind(-1) 268 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 269 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 270 | return torch.stack(b, dim=-1) 271 | 272 | 273 | def get_phrase_logits_from_token_logits( 274 | token_logits: torch.Tensor, tokenized: Dict, tokenizer: AutoTokenizer, cap_list: List 275 | ): 276 | if token_logits.dim() == 2: # (num of query, 256) 277 | tokenized_phrases = tokenizer(cap_list, add_special_tokens=False)['input_ids'] 278 | begin_id = 1 279 | phrase_logits = [] 280 | ids = list(range(len(tokenized['input_ids']))) 281 | phrases_ids = [] 282 | for phrase_tokens in tokenized_phrases: 283 | end_id = begin_id + len(phrase_tokens) 284 | assert phrase_tokens == tokenized['input_ids'][begin_id : end_id], "assert error!!!" 285 | phrases_ids.append(ids[begin_id : end_id]) 286 | begin_id = end_id + 1 287 | for phrase_ids in phrases_ids: 288 | # import pdb;pdb.set_trace() 289 | phrase_logit = token_logits[:, phrase_ids].sum(dim=-1) 290 | phrase_logits.append(phrase_logit) 291 | phrase_logits = torch.stack(phrase_logits, dim=1) 292 | return phrase_logits 293 | else: 294 | raise NotImplementedError("token_logits must be 1-dim") -------------------------------------------------------------------------------- /cubercnn/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import * 2 | from .checkpoint import * -------------------------------------------------------------------------------- /cubercnn/solver/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | import torch 3 | from typing import Any, Dict, List, Set 4 | from detectron2.solver.build import maybe_add_gradient_clipping 5 | 6 | def build_optimizer(cfg, model): 7 | norm_module_types = ( 8 | torch.nn.BatchNorm1d, 9 | torch.nn.BatchNorm2d, 10 | torch.nn.BatchNorm3d, 11 | torch.nn.SyncBatchNorm, 12 | torch.nn.GroupNorm, 13 | torch.nn.InstanceNorm1d, 14 | torch.nn.InstanceNorm2d, 15 | torch.nn.InstanceNorm3d, 16 | torch.nn.LayerNorm, 17 | torch.nn.LocalResponseNorm, 18 | ) 19 | params: List[Dict[str, Any]] = [] 20 | memo: Set[torch.nn.parameter.Parameter] = set() 21 | for module in model.modules(): 22 | for key, value in module.named_parameters(recurse=False): 23 | if not value.requires_grad: 24 | continue 25 | # Avoid duplicating parameters 26 | if value in memo: 27 | continue 28 | memo.add(value) 29 | 30 | lr = cfg.SOLVER.BASE_LR 31 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 32 | 33 | if isinstance(module, norm_module_types) and (cfg.SOLVER.WEIGHT_DECAY_NORM is not None): 34 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM 35 | 36 | elif key == "bias": 37 | if (cfg.SOLVER.BIAS_LR_FACTOR is not None): 38 | lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR 39 | if (cfg.SOLVER.WEIGHT_DECAY_BIAS is not None): 40 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS 41 | 42 | # these params do not need weight decay at all 43 | # TODO parameterize these in configs instead. 44 | if key in ['priors_dims_per_cat', 'priors_z_scales', 'priors_z_stats']: 45 | weight_decay = 0.0 46 | 47 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 48 | 49 | if cfg.SOLVER.TYPE == 'sgd': 50 | optimizer = torch.optim.SGD( 51 | params, 52 | cfg.SOLVER.BASE_LR, 53 | momentum=cfg.SOLVER.MOMENTUM, 54 | nesterov=cfg.SOLVER.NESTEROV, 55 | weight_decay=cfg.SOLVER.WEIGHT_DECAY 56 | ) 57 | elif cfg.SOLVER.TYPE == 'adam': 58 | optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR, eps=1e-02) 59 | elif cfg.SOLVER.TYPE == 'adam+amsgrad': 60 | optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR, amsgrad=True, eps=1e-02) 61 | elif cfg.SOLVER.TYPE == 'adamw': 62 | optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR, eps=1e-02) 63 | elif cfg.SOLVER.TYPE == 'adamw+amsgrad': 64 | optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR, amsgrad=True, eps=1e-02) 65 | else: 66 | raise ValueError('{} is not supported as an optimizer.'.format(cfg.SOLVER.TYPE)) 67 | 68 | optimizer = maybe_add_gradient_clipping(cfg, optimizer) 69 | return optimizer 70 | 71 | def freeze_bn(network): 72 | 73 | for _, module in network.named_modules(): 74 | if isinstance(module, torch.nn.BatchNorm2d): 75 | module.eval() 76 | module.track_running_stats = False 77 | -------------------------------------------------------------------------------- /cubercnn/solver/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.checkpoint import PeriodicCheckpointer 3 | from typing import Any 4 | 5 | class PeriodicCheckpointerOnlyOne(PeriodicCheckpointer): 6 | def step(self, iteration: int, **kwargs: Any) -> None: 7 | """ 8 | Perform the appropriate action at the given iteration. 9 | 10 | Args: 11 | iteration (int): the current iteration, ranged in [0, max_iter-1]. 12 | kwargs (Any): extra data to save, same as in 13 | :meth:`Checkpointer.save`. 
14 | """ 15 | iteration = int(iteration) 16 | additional_state = {"iteration": iteration} 17 | additional_state.update(kwargs) 18 | 19 | if (iteration + 1) % self.period == 0: 20 | 21 | # simply save a single recent model 22 | self.checkpointer.save( 23 | "{}_recent".format(self.file_prefix), **additional_state 24 | ) 25 | 26 | if self.max_iter is not None: 27 | if iteration >= self.max_iter - 1: 28 | self.checkpointer.save(f"{self.file_prefix}_final", **additional_state) -------------------------------------------------------------------------------- /cubercnn/util/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | from .model_zoo import * 3 | from .math_util import * -------------------------------------------------------------------------------- /cubercnn/util/model_zoo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from detectron2.utils.file_io import PathHandler, PathManager 3 | 4 | __all__ = ["CubeRCNNHandler"] 5 | 6 | class CubeRCNNHandler(PathHandler): 7 | """ 8 | Resolves CubeRCNN's model zoo files. 9 | """ 10 | 11 | PREFIX = "cubercnn://" 12 | CUBERCNN_PREFIX = "https://dl.fbaipublicfiles.com/cubercnn/" 13 | 14 | def _get_supported_prefixes(self): 15 | return [self.PREFIX] 16 | 17 | def _get_local_path(self, path): 18 | name = path[len(self.PREFIX) :] 19 | return PathManager.get_local_path(self.CUBERCNN_PREFIX + name) 20 | 21 | def _open(self, path, mode="r", **kwargs): 22 | return PathManager.open(self._get_local_path(path), mode, **kwargs) 23 | 24 | 25 | PathManager.register_handler(CubeRCNNHandler()) -------------------------------------------------------------------------------- /cubercnn/util/util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates 2 | import json 3 | import pickle 4 | import cv2 5 | from time import time 6 | import numpy as np 7 | import os 8 | import shutil 9 | import scipy.io 10 | from PIL import Image 11 | from glob import glob 12 | from difflib import SequenceMatcher 13 | import matplotlib.colors as mplc 14 | 15 | def file_parts(file_path): 16 | 17 | base_path, tail = os.path.split(file_path) 18 | name, ext = os.path.splitext(tail) 19 | 20 | return base_path, name, ext 21 | 22 | def save_json(path, data): 23 | 24 | with open(path, 'w') as fp: 25 | json.dump(data, fp) 26 | 27 | def load_json(path): 28 | 29 | with open(path, 'r') as fp: 30 | data = json.load(fp) 31 | 32 | return data 33 | 34 | def load_mat(path): 35 | 36 | data = scipy.io.loadmat(path, struct_as_record=False, squeeze_me=True) 37 | 38 | return data 39 | 40 | def pickle_write(file_path, obj): 41 | 42 | with open(file_path, 'wb') as file: 43 | pickle.dump(obj, file) 44 | 45 | 46 | def pickle_read(file_path, latin=False, iso8859=False, bytes=False): 47 | 48 | 49 | with open(file_path, 'rb') as file: 50 | if bytes: 51 | obj = pickle.load(file, encoding='bytes') 52 | elif latin: 53 | obj = pickle.load(file, encoding='latin1') 54 | elif iso8859: 55 | obj = pickle.load(file, encoding='iso-8859-1') 56 | 57 | # default encoding 58 | else: 59 | obj = pickle.load(file) 60 | 61 | 62 | return obj 63 | 64 | def imread(path): 65 | return cv2.imread(path) 66 | 67 | # much faster than reading the entire image, just to get the width, height 68 | def imreadstats(path): 69 | 70 | im = Image.open(path) 71 | width, height = im.size 72 | 73 | return width, height 74 | 75 | def imwrite(im, path): 76 | cv2.imwrite(path, im) 77 | 78 | def compute_eta(start_time, idx, total): 79 | """ 80 | Computes estimated time left for an iterative function to finish. 81 | Args: 82 | start_time (int): the time the function started at (e.g from time()) 83 | idx (int): the index the function is currently on, or has completed. 84 | total (int): the total amount that needs to pass for completion. 85 | Returns: 86 | time_str (str): convenient string to display the time remaining 87 | in seconds, minutes or hours depending on magnitude. 88 | dt (float): the average change in seconds per iteration. 89 | """ 90 | 91 | # cannot be less than 1 92 | idx = max(idx, 1) 93 | 94 | dt = (time() - start_time)/idx 95 | timeleft = np.max([dt * (total - idx), 0]) 96 | if timeleft > 3600: time_str = '{:.1f}h'.format(timeleft / 3600); 97 | elif timeleft > 60: time_str = '{:.1f}m'.format(timeleft / 60); 98 | else: time_str = '{:.1f}s'.format(timeleft); 99 | 100 | return time_str, dt 101 | 102 | def list_files(base_dir, file_pattern): 103 | """ 104 | Returns a list of files given a directory and pattern 105 | The results are sorted alphabetically 106 | Example: 107 | files = list_files('path/to/images/', '*.jpg') 108 | """ 109 | return sorted(glob(os.path.join(base_dir) + file_pattern)) 110 | 111 | def list_subdirectories(path, include_files=False): 112 | 113 | # this lists everything. 114 | if include_files: 115 | return sorted(glob(os.path.join(path, '*'))) 116 | 117 | # only subdirectories. 
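    # e.g. (hypothetical layout) list_subdirectories('datasets') -> ['datasets/dirA', 'datasets/dirB'],
    # skipping any loose files directly under 'datasets'; with include_files=True both
    # files and directories are returned, sorted alphabetically.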
118 | else: 119 | return [fpath for fpath in glob(os.path.join(path, '*')) if os.path.isdir(fpath)] 120 | 121 | def mkdir_if_missing(directory, delete_if_exist=False): 122 | 123 | if delete_if_exist and os.path.exists(directory): shutil.rmtree(directory) 124 | 125 | # check if not exist, then make 126 | if not os.path.exists(directory): 127 | os.makedirs(directory) 128 | 129 | # All coco categories, together with their nice-looking visualization colors 130 | # It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json 131 | COCO_CATEGORIES = [ 132 | {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, 133 | {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, 134 | {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, 135 | {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, 136 | {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, 137 | {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, 138 | {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, 139 | {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, 140 | {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, 141 | {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, 142 | {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, 143 | {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, 144 | {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, 145 | {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, 146 | {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, 147 | {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, 148 | {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, 149 | {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, 150 | {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, 151 | {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, 152 | {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, 153 | {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, 154 | {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, 155 | {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, 156 | {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, 157 | {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, 158 | {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, 159 | {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, 160 | {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, 161 | {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, 162 | {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, 163 | {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, 164 | {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, 165 | {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, 166 | {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, 167 | {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, 168 | {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, 169 | {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, 170 | {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, 171 | {"color": [197, 226, 255], "isthing": 
1, "id": 44, "name": "bottle"}, 172 | {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, 173 | {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, 174 | {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, 175 | {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, 176 | {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, 177 | {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, 178 | {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, 179 | {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, 180 | {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, 181 | {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, 182 | {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, 183 | {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, 184 | {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, 185 | {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, 186 | {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, 187 | {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, 188 | {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, 189 | {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, 190 | {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, 191 | {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, 192 | {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, 193 | {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, 194 | {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, 195 | {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, 196 | {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, 197 | {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, 198 | {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, 199 | {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, 200 | {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, 201 | {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, 202 | {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, 203 | {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, 204 | {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, 205 | {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, 206 | {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, 207 | {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, 208 | {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, 209 | {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, 210 | {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, 211 | {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, 212 | {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"}, 213 | {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"}, 214 | {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"}, 215 | {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"}, 216 | {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"}, 217 | {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"}, 218 | {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": 
"door-stuff"}, 219 | {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"}, 220 | {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"}, 221 | {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"}, 222 | {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"}, 223 | {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"}, 224 | {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"}, 225 | {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"}, 226 | {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"}, 227 | {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"}, 228 | {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"}, 229 | {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"}, 230 | {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"}, 231 | {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"}, 232 | {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"}, 233 | {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"}, 234 | {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"}, 235 | {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"}, 236 | {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"}, 237 | {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"}, 238 | {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"}, 239 | {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"}, 240 | {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"}, 241 | {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"}, 242 | {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"}, 243 | {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"}, 244 | {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"}, 245 | {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"}, 246 | {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"}, 247 | {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"}, 248 | {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"}, 249 | {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"}, 250 | {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"}, 251 | {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"}, 252 | {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"}, 253 | {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"}, 254 | {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"}, 255 | {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"}, 256 | {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"}, 257 | {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"}, 258 | {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"}, 259 | {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"}, 260 | {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"}, 261 | {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"}, 262 | {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"}, 263 | {"color": [102, 102, 156], "isthing": 0, 
"id": 199, "name": "wall-other-merged"}, 264 | {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"},] 265 | 266 | 267 | _colors = [cat['color'] for cat in COCO_CATEGORIES] 268 | 269 | def _jitter(color): 270 | """ 271 | Randomly modifies given color to produce a slightly different color than the color given. 272 | Args: 273 | color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color 274 | picked. The values in the list are in the [0.0, 1.0] range. 275 | Returns: 276 | jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the 277 | color after being jittered. The values in the list are in the [0.0, 1.0] range. 278 | """ 279 | color = [c/255.0 for c in color] 280 | color = mplc.to_rgb(color) 281 | vec = np.random.rand(3) 282 | 283 | # better to do it in another color space 284 | vec = vec / np.linalg.norm(vec) * 0.5 285 | res = np.clip(vec + color, 0, 1) 286 | return [c*255.0 for c in res] 287 | 288 | 289 | def get_color(ind=None, hex=False): 290 | 291 | if ind is None: 292 | ind = np.random.randint(len(_colors)) 293 | 294 | color = _jitter(_colors[ind % len(_colors)]) 295 | 296 | if hex: 297 | return '#%02x%02x%02x' % (color[0], color[1], color[2]) 298 | 299 | else: 300 | return color 301 | 302 | def string_similarity(text1, text2): 303 | return SequenceMatcher(None, text1, text2).ratio() -------------------------------------------------------------------------------- /cubercnn/vis/__init__.py: -------------------------------------------------------------------------------- 1 | from .vis import * -------------------------------------------------------------------------------- /cubercnn/vis/logperf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | from termcolor import colored 3 | import itertools 4 | from tabulate import tabulate 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def print_ap_category_histogram(dataset, results): 10 | """ 11 | Prints AP and AR performance for each category. 12 | Args: 13 | results: dictionary; each entry contains information for a dataset 14 | """ 15 | num_classes = len(results) 16 | N_COLS = 10 17 | data = list( 18 | itertools.chain( 19 | *[ 20 | [ 21 | cat, 22 | out["AP2D"], 23 | out["AP3D"], 24 | out.get("AR2D", "-"), 25 | out.get("AR3D", "-") 26 | ] 27 | for cat, out in results.items() 28 | ] 29 | ) 30 | ) 31 | data.extend([None] * (N_COLS - (len(data) % N_COLS))) 32 | data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) 33 | table = tabulate( 34 | data, 35 | headers=["category", "AP2D", "AP3D", "AR2D", "AR3D"] * (N_COLS // 5), 36 | tablefmt="pipe", 37 | numalign="left", 38 | stralign="center", 39 | ) 40 | logger.info( 41 | "Performance for each of {} categories on {}:\n".format(num_classes, dataset) 42 | + colored(table, "cyan") 43 | ) 44 | 45 | 46 | def print_ap_analysis_histogram(results): 47 | """ 48 | Prints AP performance for various IoU thresholds and (near, medium, far) objects. 49 | Args: 50 | results: dictionary. 
Each entry in results contains outputs for a dataset 51 | """ 52 | metric_names = ["AP2D", "AP3D", "AP3D@15", "AP3D@25", "AP3D@50", "AP3D-N", "AP3D-M", "AP3D-F", "AR2D", "AR3D"] 53 | N_COLS = 10 54 | data = [] 55 | for name, metrics in results.items(): 56 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"], metrics["AP3D@15"], metrics["AP3D@25"], metrics["AP3D@50"], metrics["AP3D-N"], metrics["AP3D-M"], metrics["AP3D-F"], 57 | metrics["AR2D"], metrics["AR3D"]] 58 | data.append(data_item) 59 | table = tabulate( 60 | data, 61 | headers=["Dataset", "#iters", "AP2D", "AP3D", "AP3D@15", "AP3D@25", "AP3D@50", "AP3D-N", "AP3D-M", "AP3D-F", "AR2D", "AR3D"], 62 | tablefmt="grid", 63 | numalign="left", 64 | stralign="center", 65 | ) 66 | logger.info( 67 | "Per-dataset performance analysis on test set:\n" 68 | + colored(table, "cyan") 69 | ) 70 | 71 | 72 | def print_ap_dataset_histogram(results): 73 | """ 74 | Prints AP performance for each dataset. 75 | Args: 76 | results: list of dicts. Each entry in results contains outputs for a dataset 77 | """ 78 | metric_names = ["AP2D", "AP3D"] 79 | N_COLS = 4 80 | data = [] 81 | for name, metrics in results.items(): 82 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"]] 83 | data.append(data_item) 84 | table = tabulate( 85 | data, 86 | headers=["Dataset", "#iters", "AP2D", "AP3D"], 87 | tablefmt="grid", 88 | numalign="left", 89 | stralign="center", 90 | ) 91 | logger.info( 92 | "Per-dataset performance on test set:\n" 93 | + colored(table, "cyan") 94 | ) 95 | 96 | 97 | def print_ap_omni_histogram(results): 98 | """ 99 | Prints AP and AR performance for Omni3D dataset. 100 | Args: 101 | results: list of dicts. Each entry in results contains outputs for a dataset 102 | """ 103 | metric_names = ["AP2D", "AP3D", "AR2D", "AR3D"] 104 | N_COLS = 4 105 | data = [] 106 | for name, metrics in results.items(): 107 | data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"], metrics["AR2D"], metrics["AR3D"]] 108 | data.append(data_item) 109 | table = tabulate( 110 | data, 111 | headers=["Dataset", "#iters", "AP2D", "AP3D", "AR2D", "AR3D"], 112 | tablefmt="grid", 113 | numalign="left", 114 | stralign="center", 115 | ) 116 | logger.info( 117 | "Performance on Omni3D:\n" 118 | + colored(table, "magenta") 119 | ) 120 | 121 | def print_ap_hard_easy_for_novel(easy_metrics_formatted, hard_metrics_formatted): 122 | table_data = [ 123 | ["Easy Novel", easy_metrics_formatted['AP2D'], easy_metrics_formatted['AP3D'], 124 | easy_metrics_formatted['AR2D'], easy_metrics_formatted['AR3D']], 125 | ["Hard Novel", hard_metrics_formatted['AP2D'], hard_metrics_formatted['AP3D'], 126 | hard_metrics_formatted['AR2D'], hard_metrics_formatted['AR3D']] 127 | ] 128 | 129 | table = tabulate( 130 | table_data, 131 | headers=["Subset", "AP2D", "AP3D", "AR2D", "AR3D"], 132 | tablefmt="grid" 133 | ) 134 | 135 | logger.info("Novel Categories Evaluation Results on Easy and Hard subsets:\n" + table) 136 | -------------------------------------------------------------------------------- /datasets/ARKitScenes/download_arkitscenes_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Meta, Inc. and its affiliates. 
All Rights Reserved 4 | 5 | wget https://dl.fbaipublicfiles.com/omni3d_data/ARKitScenes_images.zip 6 | unzip ARKitScenes_images.zip -------------------------------------------------------------------------------- /datasets/Omni3D/download_omni3d_json.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Meta, Inc. and its affiliates. All Rights Reserved 4 | 5 | wget https://dl.fbaipublicfiles.com/omni3d_data/Omni3D_json.zip 6 | unzip Omni3D_json.zip -------------------------------------------------------------------------------- /datasets/coco_examples/000000044260.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000044260.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000088432.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000088432.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000101762.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000101762.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000120584.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000120584.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000128148.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000128148.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000162543.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000162543.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000164115.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000164115.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000311950.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000311950.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/000000429011.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/datasets/coco_examples/000000429011.jpg -------------------------------------------------------------------------------- /datasets/coco_examples/labels.json: -------------------------------------------------------------------------------- 1 | { 2 | "000000044260": [ 3 | "apple" 4 | ], 5 | "000000088432": [ 6 | "truck", 7 | "traffic light", 8 | "fire hydrant" 9 | ], 10 | "000000101762": [ 11 | "bicycle", 12 | "cat" 13 | ], 14 | "000000120584": [ 15 | "clock" 16 | ], 17 | "000000128148": [ 18 | "book", 19 | "chair", 20 | "potted plant", 21 | "couch", 22 | "dining table" 23 | ], 24 | "000000162543": [ 25 | "elephant" 26 | ], 27 | "000000164115": [ 28 | "surfboard" 29 | ], 30 | "000000311950": [ 31 | "hot dog" 32 | ], 33 | "000000429011": [ 34 | "truck", 35 | "car" 36 | ] 37 | } -------------------------------------------------------------------------------- /datasets/objectron/download_objectron_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Meta, Inc. and its affiliates. All Rights Reserved 4 | 5 | wget https://dl.fbaipublicfiles.com/omni3d_data/objectron_images.zip 6 | unzip objectron_images.zip -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates 2 | import logging 3 | import os 4 | import argparse 5 | import sys 6 | import numpy as np 7 | from collections import OrderedDict 8 | import torch 9 | 10 | from detectron2.checkpoint import DetectionCheckpointer 11 | from detectron2.config import get_cfg 12 | from detectron2.engine import default_argument_parser, default_setup, launch 13 | from detectron2.data import transforms as T 14 | 15 | logger = logging.getLogger("detectron2") 16 | 17 | sys.dont_write_bytecode = True 18 | sys.path.append(os.getcwd()) 19 | np.set_printoptions(suppress=True) 20 | 21 | from cubercnn.config import get_cfg_defaults 22 | from cubercnn.modeling.proposal_generator import RPNWithIgnore 23 | from cubercnn.modeling.roi_heads import ROIHeads3D 24 | from cubercnn.modeling.meta_arch import RCNN3D, build_model 25 | from cubercnn.modeling.backbone import build_dla_from_vision_fpn_backbone 26 | from cubercnn import util, vis 27 | from pycocotools.coco import COCO 28 | from tqdm import tqdm 29 | 30 | 31 | def do_test(args, cfg, model): 32 | 33 | list_of_ims = util.list_files(os.path.join(args.input_folder, ''), '*') 34 | list_of_ims = [ im for im in list_of_ims if not im.endswith('.json')] 35 | list_of_cats_per_img = util.load_json(args.labels_file) 36 | 37 | model.eval() 38 | 39 | focal_length = args.focal_length 40 | principal_point = args.principal_point 41 | thres = args.threshold 42 | 43 | output_dir = cfg.OUTPUT_DIR 44 | min_size = cfg.INPUT.MIN_SIZE_TEST 45 | max_size = cfg.INPUT.MAX_SIZE_TEST 46 | augmentations = T.AugmentationList([T.ResizeShortestEdge(min_size, max_size, "choice")]) 47 | 48 | util.mkdir_if_missing(output_dir) 49 | 50 | for path in tqdm(list_of_ims): 51 | im_name = util.file_parts(path)[1] 52 | im = util.imread(path) 53 | cats = list_of_cats_per_img[im_name] 54 | if cats == []: 55 | continue 56 | if im is None: 57 | continue 58 | 59 | image_shape = im.shape[:2] # h, w 60 | 61 | 
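        # If --focal-length / --principal-point were not provided (their argparse defaults),
        # the intrinsics K below are built from a fallback pinhole model: a heuristic
        # focal length of 4.0 * h / 2 (NDC-style) and a principal point at the image
        # center (w/2, h/2).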
h, w = image_shape 62 | 63 | if focal_length == 0: 64 | focal_length_ndc = 4.0 65 | focal_length = focal_length_ndc * h / 2 66 | 67 | if len(principal_point) == 0: 68 | px, py = w/2, h/2 69 | else: 70 | px, py = principal_point 71 | 72 | K = np.array([ 73 | [focal_length, 0.0, px], 74 | [0.0, focal_length, py], 75 | [0.0, 0.0, 1.0] 76 | ]) 77 | 78 | aug_input = T.AugInput(im) 79 | _ = augmentations(aug_input) 80 | image = aug_input.image 81 | 82 | batched = [{ 83 | 'image': torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))).cuda(), 84 | 'height': image_shape[0], 'width': image_shape[1], 'K': K, 'category_list': cats 85 | }] 86 | dets = model(batched)[0]['instances'] 87 | n_det = len(dets) 88 | 89 | meshes = [] 90 | meshes_text = [] 91 | 92 | if n_det > 0: 93 | for idx, (corners3D, center_cam, center_2D, dimensions, pose, score, cat_idx) in enumerate(zip( 94 | dets.pred_bbox3D, dets.pred_center_cam, dets.pred_center_2D, dets.pred_dimensions, 95 | dets.pred_pose, dets.scores, dets.pred_classes 96 | )): 97 | 98 | # skip 99 | if score < thres: 100 | continue 101 | 102 | cat = cats[cat_idx] 103 | 104 | bbox3D = center_cam.tolist() + dimensions.tolist() 105 | meshes_text.append('{} {:.2f}'.format(cat, score)) 106 | color = [c/255.0 for c in util.get_color(idx)] 107 | box_mesh = util.mesh_cuboid(bbox3D, pose.tolist(), color=color) 108 | meshes.append(box_mesh) 109 | 110 | print('File: {} with {} dets'.format(im_name, len(meshes))) 111 | 112 | if len(meshes) > 0: 113 | im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(im, K, meshes, text=meshes_text, scale=im.shape[0], blend_weight=0.5, blend_weight_overlay=0.85) 114 | im_concat = np.concatenate((im_drawn_rgb, im_topdown), axis=1) 115 | if args.display: 116 | vis.imshow(im_concat) 117 | 118 | util.imwrite(im_concat, os.path.join(output_dir, im_name+'_combine.jpg')) 119 | # util.imwrite(im_drawn_rgb, os.path.join(output_dir, im_name+'_boxes.jpg')) 120 | # util.imwrite(im_topdown, os.path.join(output_dir, im_name+'_novel.jpg')) 121 | else: 122 | util.imwrite(im, os.path.join(output_dir, im_name+'_boxes.jpg')) 123 | 124 | def setup(args): 125 | """ 126 | Create configs and perform basic setups. 
127 | """ 128 | cfg = get_cfg() 129 | get_cfg_defaults(cfg) 130 | 131 | config_file = args.config_file 132 | 133 | # store locally if needed 134 | if config_file.startswith(util.CubeRCNNHandler.PREFIX): 135 | config_file = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, config_file) 136 | 137 | cfg.merge_from_file(config_file) 138 | cfg.merge_from_list(args.opts) 139 | cfg.freeze() 140 | default_setup(cfg, args) 141 | return cfg 142 | 143 | def main(args): 144 | cfg = setup(args) 145 | model = build_model(cfg) 146 | 147 | logger.info("Model:\n{}".format(model)) 148 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 149 | cfg.MODEL.WEIGHTS, resume=True 150 | ) 151 | 152 | with torch.no_grad(): 153 | do_test(args, cfg, model) 154 | 155 | if __name__ == "__main__": 156 | 157 | parser = argparse.ArgumentParser( 158 | epilog=None, formatter_class=argparse.RawDescriptionHelpFormatter, 159 | ) 160 | parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") 161 | parser.add_argument('--input-folder', type=str, help='list of image folders to process', required=True) 162 | parser.add_argument('--labels-file', type=str, help='path to labels file', required=True) 163 | parser.add_argument("--focal-length", type=float, default=0, help="focal length for image inputs (in px)") 164 | parser.add_argument("--principal-point", type=float, default=[], nargs=2, help="principal point for image inputs (in px)") 165 | parser.add_argument("--threshold", type=float, default=0.25, help="threshold on score for visualizing") 166 | parser.add_argument("--display", default=False, action="store_true", help="Whether to show the images in matplotlib",) 167 | 168 | parser.add_argument("--eval-only", default=True, action="store_true", help="perform evaluation only") 169 | parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") 170 | parser.add_argument("--num-machines", type=int, default=1, help="total number of machines") 171 | parser.add_argument( 172 | "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)" 173 | ) 174 | port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14 175 | parser.add_argument( 176 | "--dist-url", 177 | default="tcp://127.0.0.1:{}".format(port), 178 | help="initialization URL for pytorch distributed backend. See " 179 | "https://pytorch.org/docs/stable/distributed.html for details.", 180 | ) 181 | parser.add_argument( 182 | "opts", 183 | help="Modify config options by adding 'KEY VALUE' pairs at the end of the command. 
" 184 | "See config references at " 185 | "https://detectron2.readthedocs.io/modules/config.html#config-references", 186 | default=None, 187 | nargs=argparse.REMAINDER, 188 | ) 189 | 190 | args = parser.parse_args() 191 | 192 | print("Command Line Args:", args) 193 | launch( 194 | main, 195 | args.num_gpus, 196 | num_machines=args.num_machines, 197 | machine_rank=args.machine_rank, 198 | dist_url=args.dist_url, 199 | args=(args,), 200 | ) -------------------------------------------------------------------------------- /download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # -*- coding: utf-8 -*- 3 | 4 | wget -P datasets https://huggingface.co/datasets/uva-cv-lab/ovmono3d_data/resolve/main/ovmono3d_data.zip 5 | unzip datasets/ovmono3d_data.zip -d datasets/Omni3D 6 | 7 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | pip install git+https://github.com/facebookresearch/pytorch3d.git@055ab3a 2 | pip install git+https://github.com/yaojin17/detectron2.git # slightly modified detectron2 for OVMono3D 3 | pip install cython opencv-python scipy pandas einops open_clip_torch open3d 4 | 5 | pip install git+https://github.com/apple/ml-depth-pro.git@b2cd0d5 6 | pip install git+https://github.com/facebookresearch/segment-anything.git@dca509f 7 | pip install git+https://github.com/IDEA-Research/GroundingDINO.git@856dde2 8 | 9 | mkdir -p checkpoints 10 | wget -P ./checkpoints/ https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth 11 | wget -P checkpoints https://ml-site.cdn-apple.com/models/depth-pro/depth_pro.pt 12 | wget -P checkpoints https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth 13 | huggingface-cli download uva-cv-lab/ovmono3d_lift ovmono3d_lift.pth --local-dir checkpoints 14 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UVA-Computer-Vision-Lab/ovmono3d/c50e08a12c2fe452fca4f09a4127669ec60a058e/tools/__init__.py -------------------------------------------------------------------------------- /tools/eval_ovmono3d_geo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import torch 5 | import numpy as np 6 | from collections import OrderedDict 7 | from detectron2.data import MetadataCatalog, DatasetCatalog 8 | from detectron2.utils.file_io import PathManager 9 | from detectron2.utils.logger import setup_logger 10 | import detectron2.utils.comm as comm 11 | 12 | sys.dont_write_bytecode = True 13 | sys.path.append(os.getcwd()) 14 | np.set_printoptions(suppress=True) 15 | from cubercnn.data import ( 16 | get_filter_settings_from_cfg, 17 | simple_register, 18 | get_omni3d_categories 19 | ) 20 | from cubercnn.evaluation import Omni3DEvaluationHelper 21 | from cubercnn import util 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | def setup_categories(category_path): 26 | """Setup category mapping""" 27 | metadata = util.load_json(category_path) 28 | thing_classes = metadata['thing_classes'] 29 | id_map = {int(key):val for key, val in metadata['thing_dataset_id_to_contiguous_id'].items()} 30 | MetadataCatalog.get('omni3d_model').thing_classes = thing_classes 31 | 
MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id = id_map 32 | 33 | def evaluate_predictions( 34 | dataset_names, 35 | prediction_paths, 36 | filter_settings, 37 | output_dir, 38 | category_path, 39 | eval_mode="novel", 40 | iter_label='final' 41 | ): 42 | """ 43 | Evaluate predictions from pre-computed prediction files. 44 | 45 | Args: 46 | dataset_names (list): List of dataset names to evaluate 47 | prediction_paths (dict): Dictionary mapping dataset names to prediction file paths 48 | filter_settings (dict): Filter settings for evaluation 49 | output_dir (str): Output directory for evaluation results 50 | category_path (str): Path to category metadata json file 51 | eval_mode (str): Evaluation mode, either "novel" or "base" 52 | iter_label (str): Label for the iteration being evaluated 53 | """ 54 | # Setup logging 55 | os.makedirs(output_dir, exist_ok=True) 56 | setup_logger(output=output_dir, name="cubercnn") 57 | 58 | # Setup categories 59 | setup_categories(category_path) 60 | 61 | # Initialize evaluation helper 62 | thing_classes = ['monitor', 'bag', 'dresser', 'board', 'printer', 'keyboard', 'painting', 'drawers', 'microwave', 'computer', 'kitchen pan', 'potted plant', 'tissues', 'rack', 'tray', 'toys', 'phone', 'podium', 'cart', 'soundsystem', 'fireplace', 'tram'] 63 | filter_settings['category_names'] = thing_classes 64 | eval_helper = Omni3DEvaluationHelper( 65 | dataset_names=dataset_names, 66 | filter_settings=filter_settings, 67 | output_folder=output_dir, 68 | iter_label=iter_label, 69 | only_2d=False, 70 | eval_categories=thing_classes 71 | ) 72 | 73 | # Load and evaluate predictions for each dataset 74 | for dataset_name in dataset_names: 75 | logger.info(f"Evaluating predictions for {dataset_name}") 76 | # to get the thing_classes and thing_dataset_id_to_contiguous_id for the MetadataCatalog.get(dataset_name) 77 | DatasetCatalog.get(dataset_name) 78 | # Load predictions 79 | pred_path = prediction_paths[dataset_name] 80 | if not os.path.exists(pred_path): 81 | raise FileNotFoundError(f"Prediction file not found: {pred_path}") 82 | 83 | with PathManager.open(pred_path, "rb") as f: 84 | predictions = torch.load(f) 85 | 86 | # Add predictions to evaluator 87 | eval_helper.add_predictions(dataset_name, predictions) 88 | 89 | # Run evaluation 90 | eval_helper.evaluate(dataset_name) 91 | 92 | # Save predictions if needed 93 | eval_helper.save_predictions(dataset_name) 94 | 95 | # Summarize results 96 | eval_helper.summarize_all() 97 | 98 | def main(): 99 | """Main function demonstrating how to use the evaluation script""" 100 | 101 | dataset_names = ["SUNRGBD_test_novel", "KITTI_test_novel", "ARKitScenes_test_novel"] 102 | prediction_paths = { 103 | "SUNRGBD_test_novel": "./output/ovmono3d_geo/SUNRGBD_test_novel.pth", 104 | "KITTI_test_novel": "./output/ovmono3d_geo/KITTI_test_novel.pth", 105 | "ARKitScenes_test_novel": "./output/ovmono3d_geo/ARKitScenes_test_novel.pth" 106 | } 107 | 108 | # Setup filter settings 109 | filter_settings = { 110 | 'visibility_thres': 0.33333333, 111 | 'truncation_thres': 0.33333333, 112 | 'min_height_thres': 0.0625, 113 | 'max_depth': 100000000.0, 114 | 'category_names': None, # Will be set based on category_path 115 | 'ignore_names': ['dontcare', 'ignore', 'void'], 116 | 'trunc_2D_boxes': True, 117 | 'modal_2D_boxes': False, 118 | 'max_height_thres': 1.5, 119 | } 120 | 121 | # Set paths 122 | output_dir = "./output/ovmono3d_geo" 123 | category_path = "./configs/category_meta.json" 124 | 125 | # Run evaluation 126 | 
evaluate_predictions( 127 | dataset_names=dataset_names, 128 | prediction_paths=prediction_paths, 129 | filter_settings=filter_settings, 130 | output_dir=output_dir, 131 | category_path=category_path, 132 | eval_mode="novel", 133 | iter_label='final' 134 | ) 135 | 136 | if __name__ == "__main__": 137 | main() -------------------------------------------------------------------------------- /tools/ovmono3d_geo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | import os 4 | import sys 5 | import torch 6 | import numpy as np 7 | import pdb 8 | import cv2 9 | import open3d as o3d 10 | import matplotlib.pyplot as plt 11 | import matplotlib.patches as patches 12 | from pytorch3d.transforms import ( 13 | rotation_6d_to_matrix, 14 | matrix_to_rotation_6d, 15 | ) 16 | from sklearn.decomposition import PCA 17 | from sklearn.cluster import DBSCAN 18 | from segment_anything import SamPredictor, sam_model_registry 19 | import glob 20 | from pytorch3d import _C 21 | import depth_pro 22 | import tqdm 23 | from sklearn.utils import shuffle 24 | 25 | sys.dont_write_bytecode = True 26 | sys.path.append(os.getcwd()) 27 | np.set_printoptions(suppress=True) 28 | 29 | from cubercnn.data import xywh_to_xyxy 30 | import cubercnn.util as util 31 | 32 | def project_3d_to_2d(X, Y, Z, K): 33 | 34 | fx = K[0, 0] 35 | fy = K[1, 1] 36 | cx = K[0, 2] 37 | cy = K[1, 2] 38 | 39 | x = (fx * X) / Z + cx 40 | y = (fy * Y) / Z + cy 41 | 42 | return x, y 43 | 44 | 45 | def get_dims(bbox3d): 46 | x = np.sqrt(np.sum((bbox3d[0] - bbox3d[1]) * (bbox3d[0] - bbox3d[1]))) 47 | y = np.sqrt(np.sum((bbox3d[0] - bbox3d[3]) * (bbox3d[0] - bbox3d[3]))) 48 | z = np.sqrt(np.sum((bbox3d[0] - bbox3d[4]) * (bbox3d[0] - bbox3d[4]))) 49 | return np.array([z, y, x]) 50 | 51 | def get_pose(bbox3d_a, bbox3d_b): 52 | # assume a and b share the same bbox center and have same dimension 53 | center = np.mean(bbox3d_a, axis=0) 54 | dim_a = get_dims(bbox3d_a) 55 | dim_b = get_dims(bbox3d_b) 56 | bbox3d_a -= center 57 | bbox3d_b -= center 58 | U, _, Vt = np.linalg.svd(bbox3d_a.T @ bbox3d_b, full_matrices=True) 59 | R = U @ Vt 60 | if np.linalg.det(R) < 0: 61 | U[:, -1] *= -1 62 | R = U @ Vt 63 | return R 64 | 65 | 66 | def auto_downsample(points, max_points): 67 | """ 68 | If the number of points exceeds max_points, randomly sample down to max_points. 69 | Otherwise, return the original point cloud. 70 | 71 | Parameters: 72 | points (numpy.ndarray): Input point cloud with shape (N, D), where N is the number of points, and D is the dimension. 73 | max_points (int): The maximum number of points to retain. 74 | 75 | Returns: 76 | sampled_points (numpy.ndarray): The downsampled point cloud. 
77 | """ 78 | num_points = len(points) 79 | if num_points > max_points: 80 | # Randomly sample points 81 | sampled_points = shuffle(points, random_state=42)[:max_points] 82 | print(f"Points downsampled from {num_points} to {max_points}.") 83 | else: 84 | sampled_points = points 85 | print(f"Points remain unchanged: {num_points}.") 86 | return sampled_points 87 | 88 | # (3) for each annotation, load image, run seg anything, unproject, clustering, 3D bbox, save to new annotations 89 | def build_lineset(bbox3d, color=[1,0,0], flip=True): 90 | if flip: 91 | flip_matrix = np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]]) 92 | bbox3d_flip = bbox3d.dot(flip_matrix) 93 | else: 94 | bbox3d_flip = bbox3d.copy() 95 | lines = [[0, 1], [1, 2], [2, 3], [0, 3], 96 | [4, 5], [5, 6], [6, 7], [4, 7], 97 | [0, 4], [1, 5], [2, 6], [3, 7]] 98 | # Use the same color for all lines 99 | colors = [color for _ in range(len(lines))] 100 | line_set = o3d.geometry.LineSet() 101 | line_set.points = o3d.utility.Vector3dVector(bbox3d_flip) 102 | line_set.lines = o3d.utility.Vector2iVector(lines) 103 | line_set.colors = o3d.utility.Vector3dVector(colors) 104 | return line_set 105 | 106 | def gen_8corners(x_min, y_min, z_min, cx, cy, cz): 107 | corners_flag = [[0,0,0], [1,0,0], [1,1,0], [0,1,0], 108 | [0,0,1], [1,0,1], [1,1,1], [0,1,1]] 109 | corners = [] 110 | for flag in corners_flag: 111 | c = np.array([x_min, y_min, z_min]) + np.array(flag) * np.array([cx, cy, cz]) 112 | corners.append(c) 113 | return np.array(corners) 114 | 115 | def heading2rotmat(heading_angle): 116 | rotmat = np.zeros((3,3)) 117 | rotmat[1, 1] = 1 118 | cosval = np.cos(heading_angle) 119 | sinval = np.sin(heading_angle) 120 | rotmat[0, 0] = cosval 121 | rotmat[0, 2] = -sinval 122 | rotmat[2, 0] = sinval 123 | rotmat[2, 2] = cosval 124 | return rotmat 125 | 126 | 127 | def build_pseudo_bbox3d_from_mask2d_outlier(mask2d, depth, K): 128 | frustum = [] 129 | depth = np.array(depth) # HxW 130 | 131 | ys, xs = np.where(mask2d > 0.5) 132 | # (1) generate mask 133 | for y, x in zip(ys, xs): 134 | # (2) unproject 2d points (visualize in 3D) 135 | z = depth[y, x] 136 | x_3d = z * (x - K[0, 2]) / K[0, 0] 137 | y_3d = z * (y - K[1, 2]) / K[1, 1] 138 | frustum.append([x_3d, -y_3d, -z]) # flip 139 | frustum = np.array(frustum) 140 | 141 | # (3) fit 3D bounding boxes (visualize in 3D) 142 | xyz_offset = np.mean(frustum, axis=0) 143 | xyz = frustum - xyz_offset 144 | pca = PCA(2) 145 | pca.fit(xyz[:, [0, 2]]) # xz plane 146 | yaw_vec = pca.components_[0, :] 147 | yaw = np.arctan2(yaw_vec[1], yaw_vec[0]) 148 | xyz_tmp = xyz.copy() 149 | pose = heading2rotmat(-yaw) 150 | xyz_tmp = (pose @ xyz_tmp[:,:3].T).T 151 | xyz_tmp += xyz_offset 152 | 153 | # remove outliers 154 | eps=0.01 155 | min_samples=100 156 | trial_time = 0 157 | # print(len(xyz_tmp)) 158 | max_points = 40000 159 | xyz_tmp = auto_downsample(xyz_tmp, max_points) 160 | while True: 161 | trial_time += 1 162 | if trial_time > 4: 163 | xyz_clean = xyz_tmp.copy() 164 | break 165 | db = DBSCAN(eps=eps, min_samples=min_samples).fit(xyz_tmp) 166 | xyz_clean = [] 167 | count_points = 0 168 | for cluster in np.unique(db.labels_): 169 | if cluster < 0: 170 | continue 171 | cluster_ind = np.where(db.labels_ == cluster)[0] 172 | if cluster_ind.shape[0] / xyz_tmp.shape[0] < 0.1 or cluster_ind.shape[0] <=100: 173 | continue 174 | xyz_clean.append(xyz_tmp[cluster_ind, :]) 175 | count_points += len(cluster_ind) 176 | if count_points > 0.5 * len(xyz_tmp): 177 | xyz_clean = np.concatenate(xyz_clean, axis=0) 178 | print("%d 
--> %d" % (len(xyz_tmp), len(xyz_clean))) 179 | break 180 | else: 181 | eps = 2 * eps 182 | print("try once more: eps = %f" % eps) 183 | # xyz_clean = xyz_tmp 184 | 185 | x_min = xyz_tmp[:,0].min() 186 | x_max = xyz_tmp[:,0].max() 187 | y_max = xyz_tmp[:,1].min() 188 | y_min = xyz_tmp[:,1].max() 189 | z_max = xyz_tmp[:,2].min() 190 | z_min = xyz_tmp[:,2].max() 191 | dx_orig = x_max-x_min 192 | dy_orig = y_max-y_min 193 | dz_orig = z_max-z_min 194 | 195 | x_min = xyz_clean[:,0].min() 196 | x_max = xyz_clean[:,0].max() 197 | y_max = xyz_clean[:,1].min() 198 | y_min = xyz_clean[:,1].max() 199 | z_max = xyz_clean[:,2].min() 200 | z_min = xyz_clean[:,2].max() 201 | dx = x_max-x_min 202 | dy = y_max-y_min 203 | dz = z_max-z_min 204 | # 8 corners 205 | bbox3d_pseudo = gen_8corners(x_min, y_min, z_min, dx, dy, dz) 206 | bbox3d_pseudo -= xyz_offset 207 | bbox = heading2rotmat(yaw) @ bbox3d_pseudo.T 208 | bbox = bbox.T + xyz_offset 209 | lineset = build_lineset(bbox, color=[0,0,1], flip=False) 210 | return bbox, lineset, (dx, dy, dz), yaw 211 | 212 | 213 | def run_seg_anything(model, im, bbox2D): 214 | model.set_image(im, image_format="BGR") 215 | bbox = np.array(bbox2D) # XYXY 216 | masks, _, _ = model.predict(box=bbox) 217 | return masks 218 | 219 | 220 | def run_one_2dbox_to_3d(depth_o3d, mask2d, rgb_o3d, K): 221 | 222 | rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth( 223 | color=rgb_o3d, 224 | depth=depth_o3d, 225 | depth_scale=1.0, 226 | depth_trunc=1000.0, 227 | convert_rgb_to_intensity=False 228 | ) 229 | # try: 230 | if True: 231 | print("start build pseudo bbox3d") 232 | bbox3d_pseudo, _, _, yaw = build_pseudo_bbox3d_from_mask2d_outlier( 233 | mask2d, rgbd_image.depth, K 234 | ) 235 | print("end build pseudo bbox3d") 236 | 237 | flip_matrix = np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]]) 238 | bbox3d_pseudo = bbox3d_pseudo.dot(flip_matrix) 239 | 240 | # center, dimension, then get the pose 241 | # such that conver from (center, dimension, pose) to 8 corners 242 | # aligning with the pseudo label 243 | cube_dims = torch.from_numpy(get_dims(bbox3d_pseudo)).unsqueeze(0) 244 | cube_3d = torch.from_numpy(np.mean(bbox3d_pseudo, axis=0)).unsqueeze(0) 245 | cube_pose = torch.eye(3).unsqueeze(0) 246 | bbox3d_infer = util.get_cuboid_verts_faces( 247 | torch.cat((cube_3d, cube_dims), dim=1), 248 | cube_pose, 249 | )[0] 250 | bbox3d_infer = bbox3d_infer.squeeze().numpy() 251 | 252 | cube_pose_new = get_pose(bbox3d_pseudo, bbox3d_infer) 253 | bbox3d_infer2 = util.get_cuboid_verts_faces( 254 | torch.cat((cube_3d, cube_dims), dim=1), 255 | cube_pose_new, 256 | )[0] 257 | bbox3d_infer2 = bbox3d_infer2.squeeze().numpy() 258 | return cube_3d.tolist(), cube_dims.tolist(), cube_pose_new.tolist(), bbox3d_infer2.tolist() 259 | 260 | 261 | dataset_list = { 262 | 'KITTI_test_novel': './datasets/Omni3D/gdino_kitti_novel_oracle_2d.json', 263 | 'ARKitScenes_test_novel': './datasets/Omni3D/gdino_arkitscenes_novel_oracle_2d.json', 264 | 'SUNRGBD_test_novel': './datasets/Omni3D/gdino_sunrgbd_novel_oracle_2d.json',} 265 | 266 | # Load model and preprocessing transform 267 | depthpro_model, depthpro_transform = depth_pro.create_model_and_transforms(device=torch.device("cuda"),precision=torch.float16) 268 | depthpro_model.eval() 269 | 270 | ckpt = "./checkpoints/sam_vit_h_4b8939.pth" 271 | sam = sam_model_registry["default"](checkpoint=ckpt).to(device="cuda") 272 | seg_predictor = SamPredictor(sam) 273 | 274 | threshold = 0.30 275 | 276 | for dataset_name, dataset_pth in dataset_list.items(): 277 | with 
open(dataset_pth, 'r') as f: 278 | dataset = json.load(f) 279 | root = "./datasets/" 280 | with open(os.path.join(root, "Omni3D", f"{dataset_name}.json"), "r") as file: 281 | gt_anns = json.load(file) 282 | imgid2path = {} 283 | for img in gt_anns["images"]: 284 | imgid2path[img['id']] = img['file_path'] 285 | new_dataset = [] 286 | for img in tqdm.tqdm(dataset): 287 | im_path = os.path.join(root, imgid2path[img['image_id']]) 288 | 289 | # Load and preprocess an image. 290 | image, _, f_px = depth_pro.load_rgb(im_path) 291 | image = depthpro_transform(image) 292 | 293 | # Run inference. 294 | prediction = depthpro_model.infer(image, f_px=f_px) 295 | depth = prediction["depth"] # Depth in [m]. 296 | 297 | depth_numpy = depth.cpu().numpy().astype(np.float32) 298 | 299 | depth_o3d = o3d.geometry.Image(depth_numpy) 300 | new_instances = [] 301 | rgb = cv2.imread(im_path) 302 | rgb_o3d = o3d.io.read_image(im_path) 303 | K = np.array(img['K']) 304 | for ins in img["instances"]: 305 | if ins['score'] < threshold: 306 | continue 307 | bbox2D = xywh_to_xyxy(ins["bbox"]) 308 | mask2D = run_seg_anything(seg_predictor, rgb, bbox2D) 309 | mask2d = mask2D[2, :, :] # largest mask 310 | cube_3d, cube_dims, cube_pose_new, bbox3d_infer2 = run_one_2dbox_to_3d(depth_o3d, mask2d, rgb_o3d, K) 311 | 312 | new_instance = {key: value for key, value in ins.items() if key in ['category_id', 'bbox', 'score', 'category_name']} 313 | new_instance["image_id"] = img['image_id'] 314 | new_instance["bbox3D"] = bbox3d_infer2 315 | new_instance["depth"] = cube_3d[0][-1] 316 | 317 | new_instance["center_cam"] = cube_3d[0] 318 | new_instance["dimensions"] = cube_dims[0] 319 | new_instance["pose"] = cube_pose_new 320 | x, y = project_3d_to_2d(cube_3d[0][0], cube_3d[0][1], cube_3d[0][2], K) 321 | new_instance["center_2D"] = [x, y] 322 | new_instances.append(new_instance) 323 | 324 | new_img = {key: value for key, value in img.items()} 325 | new_img["instances"] = new_instances 326 | new_dataset.append(new_img) 327 | # Create output directory if it doesn't exist 328 | output_dir = "./output/ovmono3d_geo" 329 | os.makedirs(output_dir, exist_ok=True) 330 | 331 | torch.save(new_dataset, f"{output_dir}/{dataset_name}.pth") 332 | --------------------------------------------------------------------------------
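The pieces above are wired together by relative paths from the repository root (./datasets, ./checkpoints, ./output), so the OVMono3D-GEO pipeline can be reproduced with a handful of commands. The sketch below is illustrative rather than prescriptive: the config/weights pairing passed to demo/demo.py (OVMono3D_dinov2_SFP.yaml with checkpoints/ovmono3d_lift.pth) is an assumption, and any config under configs/ with matching weights can be substituted.

# dependencies (incl. the modified detectron2) plus GroundingDINO / Depth Pro / SAM / OVMono3D checkpoints
bash setup.sh
# annotations and images (OVMono3D data, Omni3D jsons, ARKitScenes and Objectron images)
bash download_data.sh
bash datasets/Omni3D/download_omni3d_json.sh
bash datasets/ARKitScenes/download_arkitscenes_images.sh
bash datasets/objectron/download_objectron_images.sh
# OVMono3D-GEO: lift GroundingDINO 2D boxes to 3D with Depth Pro depth and SAM masks,
# writing ./output/ovmono3d_geo/<dataset>.pth for each novel-category test split
python tools/ovmono3d_geo.py
# evaluate the saved predictions with the Omni3D evaluation helper
python tools/eval_ovmono3d_geo.py
# qualitative demo on the bundled COCO examples (config and weights here are assumed)
python demo/demo.py --config-file configs/OVMono3D_dinov2_SFP.yaml \
    --input-folder datasets/coco_examples --labels-file datasets/coco_examples/labels.json \
    --threshold 0.25 MODEL.WEIGHTS checkpoints/ovmono3d_lift.pth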