├── .gitignore ├── ACKNOWLEDGMENTS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── IS_Net ├── Inference.py ├── __init__.py ├── basics.py ├── data_loader_cache.py ├── hce_metric_main.py ├── models │ ├── __init__.py │ └── isnet.py ├── pytorch18.yml ├── requirements.txt └── train_valid_inference_main.py ├── LICENSE ├── MODEL_LICENSE ├── README.md ├── configs └── config_stage3.yaml ├── data ├── __init__.py └── data_preprocessor.py ├── docs └── inference-pipe.png ├── examples ├── co3dv2-samples │ ├── 195_20989_41543 │ │ ├── 047.jpg │ │ ├── 047.txt │ │ ├── 047_depth.png │ │ ├── 047_ext.txt │ │ ├── 051.jpg │ │ ├── 051.txt │ │ ├── 051_depth.png │ │ ├── 051_ext.txt │ │ ├── 060.jpg │ │ ├── 060.txt │ │ ├── 060_depth.png │ │ ├── 060_ext.txt │ │ ├── 084.jpg │ │ ├── 084.txt │ │ ├── 084_depth.png │ │ ├── 084_ext.txt │ │ ├── 122.jpg │ │ ├── 122.txt │ │ ├── 122_depth.png │ │ ├── 122_ext.txt │ │ ├── 126.jpg │ │ ├── 126.txt │ │ ├── 126_depth.png │ │ ├── 126_ext.txt │ │ ├── 161.jpg │ │ ├── 161.txt │ │ ├── 161_depth.png │ │ ├── 161_ext.txt │ │ ├── 164.jpg │ │ ├── 164.txt │ │ ├── 164_depth.png │ │ └── 164_ext.txt │ ├── 198_21285_41285 │ │ ├── 000.jpg │ │ ├── 000.txt │ │ ├── 000_depth.png │ │ ├── 000_ext.txt │ │ ├── 038.jpg │ │ ├── 038.txt │ │ ├── 038_depth.png │ │ ├── 038_ext.txt │ │ ├── 051.jpg │ │ ├── 051.txt │ │ ├── 051_depth.png │ │ ├── 051_ext.txt │ │ ├── 074.jpg │ │ ├── 074.txt │ │ ├── 074_depth.png │ │ ├── 074_ext.txt │ │ ├── 121.jpg │ │ ├── 121.txt │ │ ├── 121_depth.png │ │ ├── 121_ext.txt │ │ ├── 122.jpg │ │ ├── 122.txt │ │ ├── 122_depth.png │ │ ├── 122_ext.txt │ │ ├── 155.jpg │ │ ├── 155.txt │ │ ├── 155_depth.png │ │ ├── 155_ext.txt │ │ ├── 176.jpg │ │ ├── 176.txt │ │ ├── 176_depth.png │ │ └── 176_ext.txt │ ├── 201_21613_43652 │ │ ├── 000.jpg │ │ ├── 000.txt │ │ ├── 000_depth.png │ │ ├── 000_ext.txt │ │ ├── 027.jpg │ │ ├── 027.txt │ │ ├── 027_depth.png │ │ ├── 027_ext.txt │ │ ├── 038.jpg │ │ ├── 038.txt │ │ ├── 038_depth.png │ │ ├── 038_ext.txt │ │ ├── 051.jpg │ │ ├── 051.txt │ │ ├── 051_depth.png │ │ ├── 051_ext.txt │ │ ├── 060.jpg │ │ ├── 060.txt │ │ ├── 060_depth.png │ │ ├── 060_ext.txt │ │ ├── 067.jpg │ │ ├── 067.txt │ │ ├── 067_depth.png │ │ ├── 067_ext.txt │ │ ├── 099.jpg │ │ ├── 099.txt │ │ ├── 099_depth.png │ │ ├── 099_ext.txt │ │ ├── 143.jpg │ │ ├── 143.txt │ │ ├── 143_depth.png │ │ └── 143_ext.txt │ ├── 31_1359_4114 │ │ ├── 010.jpg │ │ ├── 010.txt │ │ ├── 010_depth.png │ │ ├── 010_ext.txt │ │ ├── 017.jpg │ │ ├── 017.txt │ │ ├── 017_depth.png │ │ ├── 017_ext.txt │ │ ├── 041.jpg │ │ ├── 041.txt │ │ ├── 041_depth.png │ │ ├── 041_ext.txt │ │ ├── 072.jpg │ │ ├── 072.txt │ │ ├── 072_depth.png │ │ ├── 072_ext.txt │ │ ├── 099.jpg │ │ ├── 099.txt │ │ ├── 099_depth.png │ │ ├── 099_ext.txt │ │ ├── 107.jpg │ │ ├── 107.txt │ │ ├── 107_depth.png │ │ ├── 107_ext.txt │ │ ├── 118.jpg │ │ ├── 118.txt │ │ ├── 118_depth.png │ │ ├── 118_ext.txt │ │ ├── 130.jpg │ │ ├── 130.txt │ │ ├── 130_depth.png │ │ └── 130_ext.txt │ └── 422_58670_113666 │ │ ├── 000.jpg │ │ ├── 000.txt │ │ ├── 000_depth.png │ │ ├── 000_ext.txt │ │ ├── 028.jpg │ │ ├── 028.txt │ │ ├── 028_depth.png │ │ ├── 028_ext.txt │ │ ├── 044.jpg │ │ ├── 044.txt │ │ ├── 044_depth.png │ │ ├── 044_ext.txt │ │ ├── 062.jpg │ │ ├── 062.txt │ │ ├── 062_depth.png │ │ ├── 062_ext.txt │ │ ├── 068.jpg │ │ ├── 068.txt │ │ ├── 068_depth.png │ │ ├── 068_ext.txt │ │ ├── 074.jpg │ │ ├── 074.txt │ │ ├── 074_depth.png │ │ ├── 074_ext.txt │ │ ├── 098.jpg │ │ ├── 098.txt │ │ ├── 098_depth.png │ │ ├── 098_ext.txt │ │ ├── 101.jpg │ │ ├── 101.txt │ │ ├── 101_depth.png │ │ └── 
101_ext.txt ├── single-view │ ├── armor.png │ ├── armor.txt │ ├── ghost.png │ ├── ghost.txt │ ├── jacket.png │ ├── pile.png │ ├── pile.txt │ └── skull.png └── unposed-samples │ ├── arkitscenes │ ├── 41069043 │ │ ├── 061.png │ │ ├── 061.txt │ │ ├── 072.png │ │ ├── 072.txt │ │ ├── 081.png │ │ └── 081.txt │ └── 41125709 │ │ ├── 052.png │ │ ├── 052.txt │ │ ├── 053.png │ │ ├── 053.txt │ │ ├── 054.png │ │ └── 054.txt │ └── co3dv2 │ ├── 195_20989_41543 │ ├── 051.jpg │ ├── 051.txt │ ├── 084.jpg │ ├── 084.txt │ ├── 126.jpg │ └── 126.txt │ ├── 198_21285_41285 │ ├── 000.jpg │ ├── 000.txt │ ├── 038.jpg │ ├── 038.txt │ ├── 074.jpg │ └── 074.txt │ ├── 201_21613_43652 │ ├── 000.jpg │ ├── 000.txt │ ├── 038.jpg │ ├── 038.txt │ ├── 067.jpg │ └── 067.txt │ ├── 31_1359_4114 │ ├── 010.jpg │ ├── 010.txt │ ├── 041.jpg │ ├── 041.txt │ ├── 072.jpg │ └── 072.txt │ └── 422_58670_113666 │ ├── 000.jpg │ ├── 000.txt │ ├── 062.jpg │ ├── 062.txt │ ├── 101.jpg │ └── 101.txt ├── model ├── dinov2.py ├── dinov2_adaln │ ├── __init__.py │ └── adaln.py ├── dit.py ├── feature_extractors.py ├── hunyuan.py ├── inference │ ├── __init__.py │ └── ddpm.py ├── load.py └── utils │ ├── __init__.py │ ├── nn.py │ ├── normalize.py │ ├── pos_encoder.py │ └── rays.py ├── pipeline_depth_prediction.py ├── pipeline_novel_view_synthesis.py ├── pipeline_pose_estimation.py ├── pipeline_single_to_3d.py ├── pipeline_unposed_few_shot_to_3d.py ├── requirements.txt ├── scripts ├── depth_prediction.sh ├── novel_view_synthesis.sh ├── pose_estimation.sh ├── single_view_to_3d.sh ├── unposed_fewshot_to_3d_arkitscenes.sh └── unposed_fewshot_to_3d_co3dv2.sh ├── splatfacto_matrix3d ├── __init__.py ├── batch_full_images_datamanager.py ├── splatfacto.py ├── splatfacto_configs.py └── strategy.py └── utils ├── camera_utils.py ├── data_utils.py ├── train_utils.py ├── vis.py └── write_videos.py /.gitignore: -------------------------------------------------------------------------------- 1 | # debug files 2 | debug/ 3 | sd-model-finetuned 4 | logs 5 | outputs 6 | checkpoints 7 | results 8 | 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | .DS_Store 141 | .vscode 142 | 143 | __MACOSX 144 | -------------------------------------------------------------------------------- /ACKNOWLEDGMENTS: -------------------------------------------------------------------------------- 1 | Acknowledgements 2 | Portions of this Matrix3D Software may utilize the following copyrighted 3 | material, the use of which is hereby acknowledged. 4 | 5 | _____________________ 6 | 7 | The HuggingFace Team (https://github.com/huggingface/diffusers) 8 | This Software uses code from the diffusers library from the HuggingFace 9 | Team, which is distributed under Apache license. 10 | 11 | Copyright 2024 The HuggingFace Team. All rights reserved. 12 | 13 | Licensed under the Apache License, Version 2.0 (the "License"); 14 | you may not use this file except in compliance with the License. 15 | You may obtain a copy of the License at 16 | 17 | http://www.apache.org/licenses/LICENSE-2.0 18 | 19 | Unless required by applicable law or agreed to in writing, software 20 | distributed under the License is distributed on an "AS IS" BASIS, 21 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | See the License for the specific language governing permissions and 23 | limitations under the License. 24 | 25 | The Nerfstudio Team (https://github.com/nerfstudio-project/nerfstudio) 26 | This Software uses code from the nerfstudio library, which is distributed 27 | under Apache license. 28 | 29 | Copyright 2023 The Nerfstudio Team 30 | 31 | Licensed under the Apache License, Version 2.0 (the "License"); 32 | you may not use this file except in compliance with the License. 
33 | You may obtain a copy of the License at 34 | 35 | http://www.apache.org/licenses/LICENSE-2.0 36 | 37 | Unless required by applicable law or agreed to in writing, software 38 | distributed under the License is distributed on an "AS IS" BASIS, 39 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | See the License for the specific language governing permissions and 41 | limitations under the License. 42 | 43 | Xuebin Qin (https://github.com/xuebinqin/DIS) 44 | This Software uses code from the DIS library, which is distributed under Apache 45 | license. 46 | 47 | Copyright 2024 Xuebin Qin. All rights reserved. 48 | 49 | Licensed under the Apache License, Version 2.0 (the "License"); 50 | you may not use this file except in compliance with the License. 51 | You may obtain a copy of the License at 52 | 53 | http://www.apache.org/licenses/LICENSE-2.0 54 | 55 | Unless required by applicable law or agreed to in writing, software 56 | distributed under the License is distributed on an "AS IS" BASIS, 57 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 58 | See the License for the specific language governing permissions and 59 | limitations under the License. 60 | 61 | Jason Y. Zhang (https://github.com/jasonyzhang/RayDiffusion) 62 | This Software uses code from the RayDiffusion library, 63 | which is distributed under MIT license. 64 | 65 | Copyright (c) 2024 Jason Y. Zhang 66 | 67 | Permission is hereby granted, free of charge, to any person obtaining a copy 68 | of this software and associated documentation files (the "Software"), to deal 69 | in the Software without restriction, including without limitation the rights 70 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 71 | copies of the Software, and to permit persons to whom the Software is 72 | furnished to do so, subject to the following conditions: 73 | 74 | The above copyright notice and this permission notice shall be included in all 75 | copies or substantial portions of the Software. 76 | 77 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 78 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 79 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 80 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 81 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 82 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 83 | SOFTWARE. 84 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, 71 | available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html) -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | Thanks for your interest in contributing. 
This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository. 4 | 5 | While we welcome new pull requests and issues please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged. 6 | 7 | ## Before you get started 8 | 9 | By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE). 10 | 11 | We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md). 12 | -------------------------------------------------------------------------------- /IS_Net/Inference.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import time 7 | import numpy as np 8 | from skimage import io 9 | import time 10 | from glob import glob 11 | from tqdm import tqdm 12 | 13 | import torch, gc 14 | import torch.nn as nn 15 | from torch.autograd import Variable 16 | import torch.optim as optim 17 | import torch.nn.functional as F 18 | from torchvision.transforms.functional import normalize 19 | 20 | from models import * 21 | 22 | 23 | if __name__ == "__main__": 24 | dataset_path="../demo_datasets/your_dataset" #Your dataset path 25 | model_path="../saved_models/IS-Net/isnet-general-use.pth" # the model path 26 | result_path="../demo_datasets/your_dataset_result" #The folder path that you want to save the results 27 | input_size=[1024,1024] 28 | net=ISNetDIS() 29 | 30 | if torch.cuda.is_available(): 31 | net.load_state_dict(torch.load(model_path)) 32 | net=net.cuda() 33 | else: 34 | net.load_state_dict(torch.load(model_path,map_location="cpu")) 35 | net.eval() 36 | im_list = glob(dataset_path+"/*.jpg")+glob(dataset_path+"/*.JPG")+glob(dataset_path+"/*.jpeg")+glob(dataset_path+"/*.JPEG")+glob(dataset_path+"/*.png")+glob(dataset_path+"/*.PNG")+glob(dataset_path+"/*.bmp")+glob(dataset_path+"/*.BMP")+glob(dataset_path+"/*.tiff")+glob(dataset_path+"/*.TIFF") 37 | with torch.no_grad(): 38 | for i, im_path in tqdm(enumerate(im_list), total=len(im_list)): 39 | print("im_path: ", im_path) 40 | im = io.imread(im_path) 41 | if len(im.shape) < 3: 42 | im = im[:, :, np.newaxis] 43 | im_shp=im.shape[0:2] 44 | im_tensor = torch.tensor(im, dtype=torch.float32).permute(2,0,1) 45 | im_tensor = F.upsample(torch.unsqueeze(im_tensor,0), input_size, mode="bilinear").type(torch.uint8) 46 | image = torch.divide(im_tensor,255.0) 47 | image = normalize(image,[0.5,0.5,0.5],[1.0,1.0,1.0]) 48 | 49 | if torch.cuda.is_available(): 50 | image=image.cuda() 51 | result=net(image) 52 | result=torch.squeeze(F.upsample(result[0][0],im_shp,mode='bilinear'),0) 53 | ma = torch.max(result) 54 | mi = torch.min(result) 55 | result = (result-mi)/(ma-mi) 56 | im_name=im_path.split('/')[-1].split('.')[0] 57 | io.imsave(os.path.join(result_path,im_name+".png"),(result*255).permute(1,2,0).cpu().data.numpy().astype(np.uint8)) 58 | -------------------------------------------------------------------------------- /IS_Net/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # -------------------------------------------------------------------------------- /IS_Net/basics.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | # os.environ['CUDA_VISIBLE_DEVICES'] = '2' 7 | from skimage import io, transform 8 | import torch 9 | import torchvision 10 | from torch.autograd import Variable 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | from torch.utils.data import Dataset, DataLoader 14 | from torchvision import transforms, utils 15 | import torch.optim as optim 16 | 17 | import matplotlib.pyplot as plt 18 | import numpy as np 19 | from PIL import Image 20 | import glob 21 | 22 | def mae_torch(pred,gt): 23 | 24 | h,w = gt.shape[0:2] 25 | sumError = torch.sum(torch.absolute(torch.sub(pred.float(), gt.float()))) 26 | maeError = torch.divide(sumError,float(h)*float(w)*255.0+1e-4) 27 | 28 | return maeError 29 | 30 | def f1score_torch(pd,gt): 31 | 32 | # print(gt.shape) 33 | gtNum = torch.sum((gt>128).float()*1) ## number of ground truth pixels 34 | 35 | pp = pd[gt>128] 36 | nn = pd[gt<=128] 37 | 38 | pp_hist =torch.histc(pp,bins=255,min=0,max=255) 39 | nn_hist = torch.histc(nn,bins=255,min=0,max=255) 40 | 41 | 42 | pp_hist_flip = torch.flipud(pp_hist) 43 | nn_hist_flip = torch.flipud(nn_hist) 44 | 45 | pp_hist_flip_cum = torch.cumsum(pp_hist_flip, dim=0) 46 | nn_hist_flip_cum = torch.cumsum(nn_hist_flip, dim=0) 47 | 48 | precision = (pp_hist_flip_cum)/(pp_hist_flip_cum + nn_hist_flip_cum + 1e-4)#torch.divide(pp_hist_flip_cum,torch.sum(torch.sum(pp_hist_flip_cum, nn_hist_flip_cum), 1e-4)) 49 | recall = (pp_hist_flip_cum)/(gtNum + 1e-4) 50 | f1 = (1+0.3)*precision*recall/(0.3*precision+recall + 1e-4) 51 | 52 | return torch.reshape(precision,(1,precision.shape[0])),torch.reshape(recall,(1,recall.shape[0])),torch.reshape(f1,(1,f1.shape[0])) 53 | 54 | 55 | def f1_mae_torch(pred, gt, valid_dataset, idx, mybins, hypar): 56 | 57 | import time 58 | tic = time.time() 59 | 60 | if(len(gt.shape)>2): 61 | gt = gt[:,:,0] 62 | 63 | pre, rec, f1 = f1score_torch(pred,gt) 64 | mae = mae_torch(pred,gt) 65 | 66 | 67 | # hypar["valid_out_dir"] = hypar["valid_out_dir"]+"-eval" ### 68 | if(hypar["valid_out_dir"]!=""): 69 | if(not os.path.exists(hypar["valid_out_dir"])): 70 | os.mkdir(hypar["valid_out_dir"]) 71 | dataset_folder = os.path.join(hypar["valid_out_dir"],valid_dataset.dataset["data_name"][idx]) 72 | if(not os.path.exists(dataset_folder)): 73 | os.mkdir(dataset_folder) 74 | io.imsave(os.path.join(dataset_folder,valid_dataset.dataset["im_name"][idx]+".png"),pred.cpu().data.numpy().astype(np.uint8)) 75 | print(valid_dataset.dataset["im_name"][idx]+".png") 76 | print("time for evaluation : ", time.time()-tic) 77 | 78 | return pre.cpu().data.numpy(), rec.cpu().data.numpy(), f1.cpu().data.numpy(), mae.cpu().data.numpy() 79 | -------------------------------------------------------------------------------- /IS_Net/hce_metric_main.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | import numpy as np 6 | from skimage import io 7 | import matplotlib.pyplot as plt 8 | import cv2 as cv 9 | from skimage.morphology import skeletonize 10 | from skimage.morphology import erosion, dilation, disk 11 | from skimage.measure import label 12 | 13 | import os 14 | import sys 15 | from tqdm import tqdm 16 | from glob import glob 17 | import pickle as pkl 18 | 19 | def filter_bdy_cond(bdy_, mask, cond): 20 | 21 | cond = cv.dilate(cond.astype(np.uint8),disk(1)) 22 | labels = label(mask) # find the connected regions 23 | lbls = np.unique(labels) # the indices of the connected regions 24 | indep = np.ones(lbls.shape[0]) # the label of each connected regions 25 | indep[0] = 0 # 0 indicate the background region 26 | 27 | boundaries = [] 28 | h,w = cond.shape[0:2] 29 | ind_map = np.zeros((h,w)) 30 | indep_cnt = 0 31 | 32 | for i in range(0,len(bdy_)): 33 | tmp_bdies = [] 34 | tmp_bdy = [] 35 | for j in range(0,bdy_[i].shape[0]): 36 | r, c = bdy_[i][j,0,1],bdy_[i][j,0,0] 37 | 38 | if(np.sum(cond[r,c])==0 or ind_map[r,c]!=0): 39 | if(len(tmp_bdy)>0): 40 | tmp_bdies.append(tmp_bdy) 41 | tmp_bdy = [] 42 | continue 43 | tmp_bdy.append([c,r]) 44 | ind_map[r,c] = ind_map[r,c] + 1 45 | indep[labels[r,c]] = 0 # indicates part of the boundary of this region needs human correction 46 | if(len(tmp_bdy)>0): 47 | tmp_bdies.append(tmp_bdy) 48 | 49 | # check if the first and the last boundaries are connected 50 | # if yes, invert the first boundary and attach it after the last boundary 51 | if(len(tmp_bdies)>1): 52 | first_x, first_y = tmp_bdies[0][0] 53 | last_x, last_y = tmp_bdies[-1][-1] 54 | if((abs(first_x-last_x)==1 and first_y==last_y) or 55 | (first_x==last_x and abs(first_y-last_y)==1) or 56 | (abs(first_x-last_x)==1 and abs(first_y-last_y)==1) 57 | ): 58 | tmp_bdies[-1].extend(tmp_bdies[0][::-1]) 59 | del tmp_bdies[0] 60 | 61 | for k in range(0,len(tmp_bdies)): 62 | tmp_bdies[k] = np.array(tmp_bdies[k])[:,np.newaxis,:] 63 | if(len(tmp_bdies)>0): 64 | boundaries.extend(tmp_bdies) 65 | 66 | return boundaries, np.sum(indep) 67 | 68 | # this function approximate each boundary by DP algorithm 69 | # https://en.wikipedia.org/wiki/Ramer%E2%80%93Douglas%E2%80%93Peucker_algorithm 70 | def approximate_RDP(boundaries,epsilon=1.0): 71 | 72 | boundaries_ = [] 73 | boundaries_len_ = [] 74 | pixel_cnt_ = 0 75 | 76 | # polygon approximate of each boundary 77 | for i in range(0,len(boundaries)): 78 | boundaries_.append(cv.approxPolyDP(boundaries[i],epsilon,False)) 79 | 80 | # count the control points number of each boundary and the total control points number of all the boundaries 81 | for i in range(0,len(boundaries_)): 82 | boundaries_len_.append(len(boundaries_[i])) 83 | pixel_cnt_ = pixel_cnt_ + len(boundaries_[i]) 84 | 85 | return boundaries_, boundaries_len_, pixel_cnt_ 86 | 87 | 88 | def relax_HCE(gt, rs, gt_ske, relax=5, epsilon=2.0): 89 | # print("max(gt_ske): ", np.amax(gt_ske)) 90 | # gt_ske = gt_ske>128 91 | # print("max(gt_ske): ", np.amax(gt_ske)) 92 | 93 | # Binarize gt 94 | if(len(gt.shape)>2): 95 | gt = gt[:,:,0] 96 | 97 | epsilon_gt = 128#(np.amin(gt)+np.amax(gt))/2.0 98 | gt = (gt>epsilon_gt).astype(np.uint8) 99 | 100 | # Binarize rs 101 | if(len(rs.shape)>2): 102 | rs = rs[:,:,0] 103 | epsilon_rs = 128#(np.amin(rs)+np.amax(rs))/2.0 104 | rs = (rs>epsilon_rs).astype(np.uint8) 105 | 106 | Union = np.logical_or(gt,rs) 107 | TP = np.logical_and(gt,rs) 108 | FP = rs - TP 109 | FN = gt - TP 110 | 111 | # relax the Union of gt and rs 112 | Union_erode = Union.copy() 113 | Union_erode = 
cv.erode(Union_erode.astype(np.uint8),disk(1),iterations=relax) 114 | 115 | # --- get the relaxed False Positive regions for computing the human efforts in correcting them --- 116 | FP_ = np.logical_and(FP,Union_erode) # get the relaxed FP 117 | for i in range(0,relax): 118 | FP_ = cv.dilate(FP_.astype(np.uint8),disk(1)) 119 | FP_ = np.logical_and(FP_, 1-np.logical_or(TP,FN)) 120 | FP_ = np.logical_and(FP, FP_) 121 | 122 | # --- get the relaxed False Negative regions for computing the human efforts in correcting them --- 123 | FN_ = np.logical_and(FN,Union_erode) # preserve the structural components of FN 124 | ## recover the FN, where pixels are not close to the TP borders 125 | for i in range(0,relax): 126 | FN_ = cv.dilate(FN_.astype(np.uint8),disk(1)) 127 | FN_ = np.logical_and(FN_,1-np.logical_or(TP,FP)) 128 | FN_ = np.logical_and(FN,FN_) 129 | FN_ = np.logical_or(FN_, np.logical_xor(gt_ske,np.logical_and(TP,gt_ske))) # preserve the structural components of FN 130 | 131 | ## 2. =============Find exact polygon control points and independent regions============== 132 | ## find contours from FP_ 133 | ctrs_FP, hier_FP = cv.findContours(FP_.astype(np.uint8), cv.RETR_TREE, cv.CHAIN_APPROX_NONE) 134 | ## find control points and independent regions for human correction 135 | bdies_FP, indep_cnt_FP = filter_bdy_cond(ctrs_FP, FP_, np.logical_or(TP,FN_)) 136 | ## find contours from FN_ 137 | ctrs_FN, hier_FN = cv.findContours(FN_.astype(np.uint8), cv.RETR_TREE, cv.CHAIN_APPROX_NONE) 138 | ## find control points and independent regions for human correction 139 | bdies_FN, indep_cnt_FN = filter_bdy_cond(ctrs_FN, FN_, 1-np.logical_or(np.logical_or(TP,FP_),FN_)) 140 | 141 | poly_FP, poly_FP_len, poly_FP_point_cnt = approximate_RDP(bdies_FP,epsilon=epsilon) 142 | poly_FN, poly_FN_len, poly_FN_point_cnt = approximate_RDP(bdies_FN,epsilon=epsilon) 143 | 144 | return poly_FP_point_cnt, indep_cnt_FP, poly_FN_point_cnt, indep_cnt_FN 145 | 146 | def compute_hce(pred_root,gt_root,gt_ske_root): 147 | 148 | gt_name_list = glob(pred_root+'/*.png') 149 | gt_name_list = sorted([x.split('/')[-1] for x in gt_name_list]) 150 | 151 | hces = [] 152 | for gt_name in tqdm(gt_name_list, total=len(gt_name_list)): 153 | gt_path = os.path.join(gt_root, gt_name) 154 | pred_path = os.path.join(pred_root, gt_name) 155 | 156 | gt = cv.imread(gt_path, cv.IMREAD_GRAYSCALE) 157 | pred = cv.imread(pred_path, cv.IMREAD_GRAYSCALE) 158 | 159 | ske_path = os.path.join(gt_ske_root,gt_name) 160 | if os.path.exists(ske_path): 161 | ske = cv.imread(ske_path,cv.IMREAD_GRAYSCALE) 162 | ske = ske>128 163 | else: 164 | ske = skeletonize(gt>128) 165 | 166 | FP_points, FP_indep, FN_points, FN_indep = relax_HCE(gt, pred,ske) 167 | print(gt_path.split('/')[-1],FP_points, FP_indep, FN_points, FN_indep) 168 | hces.append([FP_points, FP_indep, FN_points, FN_indep, FP_points+FP_indep+FN_points+FN_indep]) 169 | 170 | hce_metric ={'names': gt_name_list, 171 | 'hces': hces} 172 | 173 | 174 | file_metric = open(pred_root+'/hce_metric.pkl','wb') 175 | pkl.dump(hce_metric,file_metric) 176 | # file_metrics.write(cmn_metrics) 177 | file_metric.close() 178 | 179 | return np.mean(np.array(hces)[:,-1]) 180 | 181 | def main(): 182 | 183 | gt_root = "../DIS5K/DIS-VD/gt" 184 | gt_ske_root = "" 185 | pred_root = "../Results/isnet(ours)/DIS-VD" 186 | 187 | print("The average HCE metric: ", compute_hce(pred_root,gt_root,gt_ske_root)) 188 | 189 | 190 | if __name__ == '__main__': 191 | main() 192 | 
-------------------------------------------------------------------------------- /IS_Net/models/__init__.py: -------------------------------------------------------------------------------- 1 | from IS_Net.models.isnet import ISNetGTEncoder, ISNetDIS 2 | -------------------------------------------------------------------------------- /IS_Net/pytorch18.yml: -------------------------------------------------------------------------------- 1 | name: pytorch18 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | - pytorch 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=main 9 | - _openmp_mutex=4.5=1_gnu 10 | - blas=1.0=mkl 11 | - brotli=1.0.9=he6710b0_2 12 | - bzip2=1.0.8=h7b6447c_0 13 | - ca-certificates=2022.2.1=h06a4308_0 14 | - certifi=2021.10.8=py37h06a4308_2 15 | - cloudpickle=2.0.0=pyhd3eb1b0_0 16 | - colorama=0.4.4=pyhd3eb1b0_0 17 | - cudatoolkit=10.2.89=hfd86e86_1 18 | - cycler=0.11.0=pyhd3eb1b0_0 19 | - cytoolz=0.11.0=py37h7b6447c_0 20 | - dask-core=2021.10.0=pyhd3eb1b0_0 21 | - ffmpeg=4.3=hf484d3e_0 22 | - fonttools=4.25.0=pyhd3eb1b0_0 23 | - freetype=2.11.0=h70c0345_0 24 | - fsspec=2022.2.0=pyhd3eb1b0_0 25 | - gmp=6.2.1=h2531618_2 26 | - gnutls=3.6.15=he1e5248_0 27 | - imageio=2.9.0=pyhd3eb1b0_0 28 | - intel-openmp=2021.4.0=h06a4308_3561 29 | - jpeg=9b=h024ee3a_2 30 | - kiwisolver=1.3.2=py37h295c915_0 31 | - lame=3.100=h7b6447c_0 32 | - lcms2=2.12=h3be6417_0 33 | - ld_impl_linux-64=2.35.1=h7274673_9 34 | - libffi=3.3=he6710b0_2 35 | - libgcc-ng=9.3.0=h5101ec6_17 36 | - libgfortran-ng=7.5.0=ha8ba4b0_17 37 | - libgfortran4=7.5.0=ha8ba4b0_17 38 | - libgomp=9.3.0=h5101ec6_17 39 | - libiconv=1.15=h63c8f33_5 40 | - libidn2=2.3.2=h7f8727e_0 41 | - libpng=1.6.37=hbc83047_0 42 | - libstdcxx-ng=9.3.0=hd4cf53a_17 43 | - libtasn1=4.16.0=h27cfd23_0 44 | - libtiff=4.2.0=h85742a9_0 45 | - libunistring=0.9.10=h27cfd23_0 46 | - libuv=1.40.0=h7b6447c_0 47 | - libwebp-base=1.2.2=h7f8727e_0 48 | - locket=0.2.1=py37h06a4308_2 49 | - lz4-c=1.9.3=h295c915_1 50 | - matplotlib-base=3.5.1=py37ha18d171_1 51 | - mkl=2021.4.0=h06a4308_640 52 | - mkl-service=2.4.0=py37h7f8727e_0 53 | - mkl_fft=1.3.1=py37hd3c417c_0 54 | - mkl_random=1.2.2=py37h51133e4_0 55 | - munkres=1.1.4=py_0 56 | - ncurses=6.3=h7f8727e_2 57 | - nettle=3.7.3=hbbd107a_1 58 | - networkx=2.6.3=pyhd3eb1b0_0 59 | - ninja=1.10.2=py37hd09550d_3 60 | - numpy=1.21.2=py37h20f2e39_0 61 | - numpy-base=1.21.2=py37h79a1101_0 62 | - olefile=0.46=py37_0 63 | - openh264=2.1.1=h4ff587b_0 64 | - openssl=1.1.1n=h7f8727e_0 65 | - packaging=21.3=pyhd3eb1b0_0 66 | - partd=1.2.0=pyhd3eb1b0_1 67 | - pillow=8.0.0=py37h9a89aac_0 68 | - pip=21.2.2=py37h06a4308_0 69 | - pyparsing=3.0.4=pyhd3eb1b0_0 70 | - python=3.7.11=h12debd9_0 71 | - python-dateutil=2.8.2=pyhd3eb1b0_0 72 | - pytorch=1.8.0=py3.7_cuda10.2_cudnn7.6.5_0 73 | - pywavelets=1.1.1=py37h7b6447c_2 74 | - pyyaml=6.0=py37h7f8727e_1 75 | - readline=8.1.2=h7f8727e_1 76 | - scikit-image=0.15.0=py37hb3f55d8_2 77 | - scipy=1.7.3=py37hc147768_0 78 | - setuptools=58.0.4=py37h06a4308_0 79 | - six=1.16.0=pyhd3eb1b0_1 80 | - sqlite=3.38.0=hc218d9a_0 81 | - tk=8.6.11=h1ccaba5_0 82 | - toolz=0.11.2=pyhd3eb1b0_0 83 | - torchaudio=0.8.0=py37 84 | - torchvision=0.9.0=py37_cu102 85 | - tqdm=4.63.0=pyhd8ed1ab_0 86 | - typing_extensions=3.10.0.2=pyh06a4308_0 87 | - wheel=0.37.1=pyhd3eb1b0_0 88 | - xz=5.2.5=h7b6447c_0 89 | - yaml=0.2.5=h7b6447c_0 90 | - zlib=1.2.11=h7f8727e_4 91 | - zstd=1.4.9=haebb681_0 92 | prefix: /home/solar/anaconda3/envs/pytorch18 93 | 
-------------------------------------------------------------------------------- /IS_Net/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | _libgcc_mutex=0.1=main 5 | _openmp_mutex=4.5=1_gnu 6 | blas=1.0=mkl 7 | brotli=1.0.9=he6710b0_2 8 | bzip2=1.0.8=h7b6447c_0 9 | ca-certificates=2022.2.1=h06a4308_0 10 | certifi=2021.10.8=py37h06a4308_2 11 | cloudpickle=2.0.0=pyhd3eb1b0_0 12 | colorama=0.4.4=pyhd3eb1b0_0 13 | cudatoolkit=10.2.89=hfd86e86_1 14 | cycler=0.11.0=pyhd3eb1b0_0 15 | cytoolz=0.11.0=py37h7b6447c_0 16 | dask-core=2021.10.0=pyhd3eb1b0_0 17 | ffmpeg=4.3=hf484d3e_0 18 | fonttools=4.25.0=pyhd3eb1b0_0 19 | freetype=2.11.0=h70c0345_0 20 | fsspec=2022.2.0=pyhd3eb1b0_0 21 | gmp=6.2.1=h2531618_2 22 | gnutls=3.6.15=he1e5248_0 23 | imageio=2.9.0=pyhd3eb1b0_0 24 | intel-openmp=2021.4.0=h06a4308_3561 25 | jpeg=9b=h024ee3a_2 26 | kiwisolver=1.3.2=py37h295c915_0 27 | lame=3.100=h7b6447c_0 28 | lcms2=2.12=h3be6417_0 29 | ld_impl_linux-64=2.35.1=h7274673_9 30 | libffi=3.3=he6710b0_2 31 | libgcc-ng=9.3.0=h5101ec6_17 32 | libgfortran-ng=7.5.0=ha8ba4b0_17 33 | libgfortran4=7.5.0=ha8ba4b0_17 34 | libgomp=9.3.0=h5101ec6_17 35 | libiconv=1.15=h63c8f33_5 36 | libidn2=2.3.2=h7f8727e_0 37 | libpng=1.6.37=hbc83047_0 38 | libstdcxx-ng=9.3.0=hd4cf53a_17 39 | libtasn1=4.16.0=h27cfd23_0 40 | libtiff=4.2.0=h85742a9_0 41 | libunistring=0.9.10=h27cfd23_0 42 | libuv=1.40.0=h7b6447c_0 43 | libwebp-base=1.2.2=h7f8727e_0 44 | locket=0.2.1=py37h06a4308_2 45 | lz4-c=1.9.3=h295c915_1 46 | matplotlib-base=3.5.1=py37ha18d171_1 47 | mkl=2021.4.0=h06a4308_640 48 | mkl-service=2.4.0=py37h7f8727e_0 49 | mkl_fft=1.3.1=py37hd3c417c_0 50 | mkl_random=1.2.2=py37h51133e4_0 51 | munkres=1.1.4=py_0 52 | ncurses=6.3=h7f8727e_2 53 | nettle=3.7.3=hbbd107a_1 54 | networkx=2.6.3=pyhd3eb1b0_0 55 | ninja=1.10.2=py37hd09550d_3 56 | numpy=1.21.2=py37h20f2e39_0 57 | numpy-base=1.21.2=py37h79a1101_0 58 | olefile=0.46=py37_0 59 | openh264=2.1.1=h4ff587b_0 60 | openssl=1.1.1n=h7f8727e_0 61 | packaging=21.3=pyhd3eb1b0_0 62 | partd=1.2.0=pyhd3eb1b0_1 63 | pillow=8.0.0=py37h9a89aac_0 64 | pip=21.2.2=py37h06a4308_0 65 | pyparsing=3.0.4=pyhd3eb1b0_0 66 | python=3.7.11=h12debd9_0 67 | python-dateutil=2.8.2=pyhd3eb1b0_0 68 | pytorch=1.8.0=py3.7_cuda10.2_cudnn7.6.5_0 69 | pywavelets=1.1.1=py37h7b6447c_2 70 | pyyaml=6.0=py37h7f8727e_1 71 | readline=8.1.2=h7f8727e_1 72 | scikit-image=0.15.0=py37hb3f55d8_2 73 | scipy=1.7.3=py37hc147768_0 74 | setuptools=58.0.4=py37h06a4308_0 75 | six=1.16.0=pyhd3eb1b0_1 76 | sqlite=3.38.0=hc218d9a_0 77 | tk=8.6.11=h1ccaba5_0 78 | toolz=0.11.2=pyhd3eb1b0_0 79 | torchaudio=0.8.0=py37 80 | torchvision=0.9.0=py37_cu102 81 | tqdm=4.63.0=pyhd8ed1ab_0 82 | typing_extensions=3.10.0.2=pyh06a4308_0 83 | wheel=0.37.1=pyhd3eb1b0_0 84 | xz=5.2.5=h7b6447c_0 85 | yaml=0.2.5=h7b6447c_0 86 | zlib=1.2.11=h7f8727e_4 87 | zstd=1.4.9=haebb681_0 88 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2025 Apple Inc. All Rights Reserved. 2 | 3 | IMPORTANT: This Apple software is supplied to you by Apple 4 | Inc. ("Apple") in consideration of your agreement to the following 5 | terms, and your use, installation, modification or redistribution of 6 | this Apple software constitutes acceptance of these terms. 
If you do 7 | not agree with these terms, please do not use, install, modify or 8 | redistribute this Apple software. 9 | 10 | In consideration of your agreement to abide by the following terms, and 11 | subject to these terms, Apple grants you a personal, non-exclusive 12 | license, under Apple's copyrights in this original Apple software (the 13 | "Apple Software"), to use, reproduce, modify and redistribute the Apple 14 | Software, with or without modifications, in source and/or binary forms; 15 | provided that if you redistribute the Apple Software in its entirety and 16 | without modifications, you must retain this notice and the following 17 | text and disclaimers in all such redistributions of the Apple Software. 18 | Neither the name, trademarks, service marks or logos of Apple Inc. may 19 | be used to endorse or promote products derived from the Apple Software 20 | without specific prior written permission from Apple. Except as 21 | expressly stated in this notice, no other rights or licenses, express or 22 | implied, are granted by Apple herein, including but not limited to any 23 | patent rights that may be infringed by your derivative works or by other 24 | works in which the Apple Software may be incorporated. 25 | 26 | The Apple Software is provided by Apple on an "AS IS" basis. APPLE 27 | MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION 28 | THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS 29 | FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND 30 | OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS. 31 | 32 | IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL 33 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 35 | INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, 36 | MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED 37 | AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), 38 | STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE 39 | POSSIBILITY OF SUCH DAMAGE. 40 | -------------------------------------------------------------------------------- /MODEL_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 Apple Inc. All Rights Reserved. 2 | 3 | Disclaimer: IMPORTANT: This Apple Machine Learning Research Model is specifically 4 | developed and released by Apple Inc. ("Apple") for the sole purpose of scientific 5 | research of artificial intelligence and machine-learning technology. “Apple Machine 6 | Learning Research Model” means the model, including but not limited to algorithms, 7 | formulas, trained model weights, parameters, configurations, checkpoints, and any 8 | related materials (including documentation). 9 | 10 | This Apple Machine Learning Research Model is provided to You by Apple in 11 | consideration of your agreement to the following terms, and your use, modification, 12 | creation of Model Derivatives, and or redistribution of the Apple Machine Learning 13 | Research Model constitutes acceptance of this Agreement. If You do not agree with 14 | these terms, please do not use, modify, create Model Derivatives of, or distribute this 15 | Apple Machine Learning Research Model or Model Derivatives. 16 | 17 | 1. 
License Scope: In consideration of your agreement to abide by the following 18 | terms, and subject to these terms, Apple hereby grants you a personal, non- 19 | exclusive, worldwide, non-transferable, royalty-free, revocable, and limited 20 | license, to use, copy, modify, distribute, and create Model Derivatives (defined 21 | below) of the Apple Machine Learning Research Model exclusively for Research 22 | Purposes. You agree that any Model Derivatives You may create or that may be 23 | created for You will be limited to Research Purposes as well. “Research 24 | Purposes” means non-commercial scientific research and academic 25 | development activities, such as experimentation, analysis, testing conducted by 26 | You with the sole intent to advance scientific knowledge and research. 27 | “Research Purposes” does not include any commercial exploitation, product 28 | development or use in any commercial product or service. 29 | 30 | 2. Distribution of Apple Machine Learning Research Model and Model Derivatives: 31 | If you choose to redistribute Apple Machine Learning Research Model or its 32 | Model Derivatives, you must provide a copy of this Agreement to such third 33 | party, and ensure that the following attribution notice be provided: “Apple 34 | Machine Learning Research Model is licensed under the Apple Machine 35 | Learning Research Model License Agreement.” Additionally, all Model 36 | Derivatives must clearly be identified as such, including disclosure of 37 | modifications and changes made to the Apple Machine Learning Research 38 | Model. The name, trademarks, service marks or logos of Apple may not be used 39 | to endorse or promote Model Derivatives or the relationship between You and 40 | Apple. “Model Derivatives” means any models or any other artifacts created by 41 | modifications, improvements, adaptations, alterations to the architecture, 42 | algorithm or training processes of the Apple Machine Learning Research Model, 43 | or by any retraining, fine-tuning of the Apple Machine Learning Research 44 | Model. 45 | 46 | 3. No Other License: Except as expressly stated in this notice, no other rights or 47 | licenses, express or implied, are granted by Apple herein, including but not 48 | limited to any patent, trademark, and similar intellectual property rights 49 | worldwide that may be infringed by the Apple Machine Learning Research 50 | Model, the Model Derivatives or by other works in which the Apple Machine 51 | Learning Research Model may be incorporated. 52 | 53 | 4. Compliance with Laws: Your use of Apple Machine Learning Research Model 54 | must be in compliance with all applicable laws and regulations. 55 | 56 | 5. Term and Termination: The term of this Agreement will begin upon your 57 | acceptance of this Agreement or use of the Apple Machine Learning Research 58 | Model and will continue until terminated in accordance with the following terms. 59 | Apple may terminate this Agreement at any time if You are in breach of any term 60 | or condition of this Agreement. Upon termination of this Agreement, You must 61 | cease to use all Apple Machine Learning Research Models and Model 62 | Derivatives and permanently delete any copy thereof. Sections 3, 6 and 7 will 63 | survive termination. 64 | 65 | 6. Disclaimer and Limitation of Liability: This Apple Machine Learning Research 66 | Model and any outputs generated by the Apple Machine Learning Research 67 | Model are provided on an “AS IS” basis. 
APPLE MAKES NO WARRANTIES, 68 | EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED 69 | WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS 70 | FOR A PARTICULAR PURPOSE, REGARDING THE APPLE MACHINE 71 | LEARNING RESEARCH MODEL OR OUTPUTS GENERATED BY THE APPLE 72 | MACHINE LEARNING RESEARCH MODEL. You are solely responsible for 73 | determining the appropriateness of using or redistributing the Apple Machine 74 | Learning Research Model and any outputs of the Apple Machine Learning 75 | Research Model and assume any risks associated with Your use of the Apple 76 | Machine Learning Research Model and any output and results. IN NO EVENT 77 | SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR 78 | CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE, 79 | REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION OF THE APPLE 80 | MACHINE LEARNING RESEARCH MODEL AND ANY OUTPUTS OF THE APPLE 81 | MACHINE LEARNING RESEARCH MODEL, HOWEVER CAUSED AND 82 | WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING 83 | NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN 84 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 85 | 86 | 7. Governing Law: This Agreement will be governed by and construed under the 87 | laws of the State of California without regard to its choice of law principles. The 88 | Convention on Contracts for the International Sale of Goods shall not apply to 89 | the Agreement except that the arbitration clause and any arbitration hereunder 90 | shall be governed by the Federal Arbitration Act, Chapters 1 and 2. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Matrix3D: Large Photogrammetry Model All-in-One 2 | Yuanxun Lu1\*, Jingyang Zhang2\*, Tian Fang2, Jean-Daniel Nahmias2, Yanghai Tsin2, Long Quan3, Xun Cao1, Yao Yao1†, Shiwei Li2 3 | 1Nanjing University, 2Apple, 3HKUST 4 | \*Equal contribution Corresponding author 5 | 6 | ### [Project Page](https://nju-3dv.github.io/projects/matrix3d/) | [Paper](https://arxiv.org/abs/2502.07685) | [Weights](#environment-setup) 7 | 8 | This is the official implementation of Matrix3D, a unified model that performs several photogrammetry subtasks, including pose estimation, depth prediction, and novel view synthesis using the same model. 9 | 10 | This repository includes the model inference pipeline and the modified 3DGS reconstruction pipeline for 3D reconstruction. 11 | 12 |
13 | ![Matrix3D inference pipeline](docs/inference-pipe.png)
15 |

16 | Matrix3D supports various photogrammetry tasks via masked inference. 17 |
18 | 19 | ## Environment Setup 20 | 21 | - This project is successfully tested on Ubuntu 20.04 with PyTorch 2.4 (Python 3.10). We recommend creating a new environment and install necessary dependencies: 22 | 23 | ``` 24 | conda create -y -n matrix3d python=3.10 25 | conda activate matrix3d 26 | # Here we take Pytorch 2.4 with cuda 11.8 as an example 27 | # If you install a different PyTorch version, please select a matched xformers/pytorch3d version 28 | pip install torch==2.4.0 torchvision==0.19.0 xformers==0.0.27.post2 --index-url https://download.pytorch.org/whl/cu118 29 | pip install --extra-index-url https://miropsota.github.io/torch_packages_builder pytorch3d==0.7.7+pt2.4.0cu118 30 | pip install -r requirements.txt 31 | # fixed the requirement conflicts from nerfstudio 32 | pip install timm==1.0.11 33 | ``` 34 | Some dependencies may require CUDA with the same version used by `torch` in your system, and the installation may not work out of the box. Please refer to their official repo for troubleshooting. 35 | 36 | * Download the Pre-trained model: 37 | * Download the checkpoints: [matrix3d_512.pt](https://ml-site.cdn-apple.com/models/matrix3d/matrix3d_512.pt) 38 | * Create a `checkpoints` folder and put the pre-trained model into it. 39 | * (Optional) Download `IS-Net` checkpoint if you would like to use single-view to 3d reconstruction: 40 | * Download the pre-trained model `isnet-general-use.pth` from the [DIS official repo](https://github.com/xuebinqin/DIS) and also put it into the `checkpoints` folder. 41 | 42 | ## Run Demo 43 | 44 | - Matrix3D supports several photogrammetry tasks and their dynamic compositions via masked inference. Here we provide several example scripts on the CO3Dv2 dataset. All results will be saved to the `results` folder by default. 45 | 46 | - **Novel View Synthesis** 47 | 48 | ``` 49 | sh scripts/novel_view_synthesis.sh examples/co3dv2-samples/31_1359_4114 50 | ``` 51 | 52 | This script demonstrates the usage of novel view synthesis from single-view image input. 53 | 54 | For all diffusion sampling tasks, we use indicators `mod_flags` and `view_ids` to control the input states in `L48-L56`. You could try to set a different modality flag or view numbers to achieve different tasks, such as predict novel views from 2 posed RGB images. 55 | 56 | - **Pose Estimation** 57 | 58 | ``` 59 | sh scripts/pose_estimation.sh examples/co3dv2-samples/31_1359_4114 60 | ``` 61 | 62 | This script demonstrates the usage of pose prediction from images. The saved `*.png` and `*.html` file demonstrates a visual comparison between predictions and groundtruth values. 63 | 64 | Replace the data root to an unposed data folder like `examples/unposed-samples/co3dv2/201_21613_43652` would generate the results without comparisons to groundtruth poses. 65 | 66 | It is **strongly recommended** to provide the camera intrinsics saved in the .txt files since the model is trained with known camera intrinsics. If not, the processor would set a default fov=60 and performance may degrade. You could also change the default Fov value by passing `--default_fov`. 67 | 68 | - **Depth Prediction** 69 | 70 | ``` 71 | sh scripts/depth_prediction.sh examples/co3dv2-samples/31_1359_4114 72 | ``` 73 | 74 | This script demonstrates the usage of depth prediction from several posed images. The back-projected groundtruth and prediction point clouds can be found in the folder. 
75 | 76 | - By dynamically combining the above tasks, one could later apply a modified 3DGS pipeline to achieve 3D reconstruction from various inputs, even with unknown camera parameters. In the following, we provide two specific examples: 77 | 78 | - **Single-view to 3D** 79 | 80 | ``` 81 | sh scripts/single_view_to_3d.sh single-view-to-3d examples/single-view/skull.png 82 | ``` 83 | 84 | The 3DGS rendering results are saved in `results/single-view-to-3d/skull/3DGS-render-traj.mp4`. 85 | 86 | In this task, camera Fov is set to 60 by default, while you could also manually set it by creating a `$name.txt` file along with the image. The dataprocessor would automatically load it. For example, you could replace the `skull.png` with `ghost.png`. 87 | 88 | Please check the `examples/single-view` folder for more examples. 89 | 90 | - **Unposed Few-shot to 3D** 91 | 92 | ``` 93 | sh scripts/unposed_fewshot_to_3d_co3dv2.sh unposed-fewshot-to-3d examples/unposed-samples/co3dv2/31_1359_4114 94 | ``` 95 | 96 | This script demonstrates a reconstruction process from unposed images in CO3Dv2 dataset. Note that the camera trajectories of novel views are sampled on fitted splines from predicted poses and designed to work under object-centric scenes. The specific interpolation video is saved as `3DGS-render-traj1.mp4` by default. You could also change to apply reconstruction on arkitscenes data as follows: 97 | 98 | ``` 99 | sh scripts/unposed_fewshot_to_3d_arkitscenes.sh unposed-fewshot-to-3d examples/unposed-samples/arkitscenes/41069043 100 | ``` 101 | 102 | The only difference lies in the splined camera generation while the 3DGS part is exactly same. You may need to tune the parameters of trajectory generation and 3DGS reconstruction for different datasets to achieve higher performance. 103 | 104 | - Based on the examples above, you can flexibly define specifically tailored tasks by combining different inputs. 105 | 106 | - Notes: 107 | 108 | - When trying on the diffusion process, please carefully assign the values of indicators `mods_flags` and `view_ids`. Besides, the model is trained with a maximum view number of 8, so do not set `view_ids` larger than 8 views. 109 | - The example data in `examples/co3dv2-samples` and `examples/unposed-samples` are part of CO3Dv2 and ARKitScenes datasets. The camera extrinsic is saved in FOV values or Blender camera coordinates. In processing, we would convert them into PyTorch3D cameras, and these part codes could be found in `L654-659` from `data/data_preprocessor.py`. Therefore, it is easy for users to change to different camera representations, e.g., you could apply the official Pytorch3D conversion function `pytorch3d.utils.cameras_from_opencv_projection` to convert OpenCV cameras into Pytorch3D cameras. 110 | 111 | 112 | ## License 113 | 114 | This sample code is released under the [LICENSE](LICENSE) terms. 
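A side note on camera formats, expanding on the last bullet in the Notes above: if your own captures come with OpenCV-convention poses and intrinsics, the PyTorch3D conversion function mentioned there can be applied directly. The snippet below is only an illustrative sketch with placeholder values; it is not part of this repository, and tensor shapes follow the PyTorch3D documentation.

```python
import torch
from pytorch3d.utils import cameras_from_opencv_projection

# Placeholder single-camera example (N = 1); replace with your real values.
R = torch.eye(3)[None]                          # (N, 3, 3) OpenCV world-to-camera rotation
tvec = torch.zeros(1, 3)                        # (N, 3) OpenCV translation
K = torch.tensor([[[1000.0,    0.0, 256.0],
                   [   0.0, 1000.0, 256.0],
                   [   0.0,    0.0,   1.0]]])   # (N, 3, 3) pinhole intrinsics
image_size = torch.tensor([[512.0, 512.0]])     # (N, 2) as (height, width)

cameras = cameras_from_opencv_projection(R, tvec, K, image_size)
```

The returned `PerspectiveCameras` object matches the PyTorch3D camera representation that, per the note above, the data preprocessor ultimately works with.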
115 | 116 | ## Citation 117 | ``` 118 | @article{lu2025matrix3d, 119 | title={Matrix3D: Large Photogrammetry Model All-in-One}, 120 | author={Lu, Yuanxun and Zhang, Jingyang and Fang, Tian and Nahmias, Jean-Daniel and Tsin, Yanghai and Quan, Long and Cao, Xun and Yao, Yao and Li, Shiwei}, 121 | journal={Computer Vision and Pattern Recognition (CVPR)}, 122 | year={2025} 123 | } 124 | ``` 125 | -------------------------------------------------------------------------------- /configs/config_stage3.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | val_inference_steps: 50 3 | val_scheduler: DDPM 4 | val_height: 512 5 | val_width: 512 6 | 7 | eval: 8 | val_inference_steps: 50 9 | scheduler: DDPM 10 | save_image: True 11 | 12 | modalities: 13 | rgb: 14 | dimensions: 2 15 | height: 64 16 | width: 64 17 | patch_size: 2 18 | dense: True 19 | gen_channel: 20 | latent: 4 21 | cond_channel: 22 | dino: 768 23 | latent: 4 24 | ray: 25 | dimensions: 2 26 | height: 32 27 | width: 32 28 | patch_size: 1 29 | dense: True 30 | gen_channel: 31 | dir: 3 32 | moment: 3 33 | cond_channel: 34 | dir: 3 35 | moment: 3 36 | depth: 37 | dimensions: 2 38 | height: &depth_size 128 39 | width: 128 40 | patch_size: 4 41 | dense: False 42 | seq_len: &depth_token_num 1024 43 | gen_channel: 44 | disparity: 1 45 | gen_aux_channel: 46 | valid: 1 47 | cond_channel: 48 | disparity: 1 49 | valid: 1 50 | local_caption: 51 | dimensions: 1 52 | cond_channel: 53 | latent: 1024 54 | global_caption: 55 | dimensions: 1 56 | cond_channel: 57 | latent: 1024 58 | 59 | data: 60 | shuffle: True 61 | modalities: ['rgb', 'ray', 'depth', 'local_caption', 'global_caption'] 62 | modalities_probs: [[2, 2, 1], [2, 2, 1], [2, 2, 1], [0, 1, 0], [0, 1, 1]] # each modality contains three probs: gens/conds/not_used. no need to sum to 1, would be normalized automatically 63 | dataset_supported_modalities: ['rgb', 'ray', 'depth', 'local_caption', 'global_caption'] 64 | dataset_type: 'object-centric' # 'object-centric', 'scenes' 65 | shift_scales: # mean/scale 66 | rgb: [0.000, 1.000] 67 | ray: 68 | dirs: [0.000, 1.800] 69 | moms: [-0.100, 2.700] 70 | origins: [-0.145, 1.700] # origin & direction are only used when use_plucker==False 71 | directions: [0.145, 1.715] 72 | depth: [1.100, 2.000] 73 | num_view: [2, 8] 74 | num_batch_per_scene: null 75 | cond_size: 896 76 | gen_size: 512 77 | raymap_size: 32 # this should be consistent to the model rays config! 78 | use_plucker: True # if False, use ray origins and directions instead 79 | use_background: True 80 | use_depth_valid_only: True 81 | background_color: "white" 82 | # relative_pose: "raydiffusion_refcam" 83 | pose_trans_jitter: 0.0 84 | relative_rot: True 85 | relative_trans: 1.0 86 | pre_resize: 512 # always pre-resize images to 512, and predict the KRT of 512-sized images 87 | depth_size: *depth_size # TODO: this is a temp setting, should align with the patch size in model config! 
88 | depth_samples_per_images: *depth_token_num 89 | center_crop_min_scale: 1.0 # in [0.0, 1.0], 0.6 90 | center_crop_max_jitter: 0.0 # in pixel 15 91 | per_sample_aug_enable: False # For vae training only, set this to false for all other paras 92 | per_sample_aug: 93 | depth: 94 | rotate: [-30.0, 30.0] 95 | scale: [0.5, 1.0] 96 | value_scales: [0.5, 2.0] 97 | raydiffusion_official: False 98 | dataset_overwrite: 99 | mvimgnet: 100 | dataset_supported_modalities: ['rgb', 'ray', 'local_caption', 'global_caption'] 101 | use_background: False 102 | co3dv2: 103 | use_background: False 104 | realestate10k: 105 | dataset_supported_modalities: ['rgb', 'ray', 'local_caption', 'global_caption'] 106 | dataset_type: 'scenes' 107 | use_background: False 108 | hypersim: 109 | dataset_type: 'scenes' 110 | use_background: False 111 | arkitscenes: 112 | dataset_supported_modalities: ['rgb', 'depth', 'global_caption', 'local_caption'] 113 | dataset_type: 'scenes' 114 | use_background: False 115 | dtu: 116 | dataset_supported_modalities: ['rgb', 'ray', 'depth'] 117 | use_background: False 118 | mipnerf360: 119 | dataset_supported_modalities: ['rgb', 'ray'] 120 | use_background: False 121 | llff: 122 | dataset_supported_modalities: ['rgb', 'ray'] 123 | dataset_type: 'scenes' 124 | use_background: False 125 | validation_overwrite: 126 | num_batch_per_scene: 1 127 | center_crop_min_scale: 1.0 128 | center_crop_max_jitter: 0.0 129 | pose_trans_jitter: 0.0 130 | modalities: ['rgb', 'ray', 'depth', 'local_caption', 'global_caption'] 131 | modalities_probs: [[2, 2, 0], [2, 2, 0], [2, 2, 0], [0, 1, 0], [0, 1, 0]] # each modality contains three probs: gens/conds/not_used. no need to sum to 1, would be normalized automatically 132 | num_view: [2, 8] 133 | evaluation_overwrite: 134 | shuffle: False 135 | center_crop_min_scale: 1.0 136 | center_crop_max_jitter: 0.0 137 | pose_trans_jitter: 0.0 138 | modalities_probs: [[2, 2, 0], [2, 2, 0], [2, 2, 0], [0, 1, 0], [0, 1, 0]] 139 | inference_overwrite: 140 | shuffle: False 141 | num_batch_per_scene: 1 142 | num_view: 8 143 | 144 | model: 145 | model_type: dit 146 | hidden_size: 1024 147 | depth: 40 148 | encoder: True 149 | encoder_depth: 20 150 | decoder_url: Tencent-Hunyuan/HunyuanDiT-Diffusers 151 | scheduler_url: Tencent-Hunyuan/HunyuanDiT-Diffusers 152 | peripheral_url: Tencent-Hunyuan/HunyuanDiT-Diffusers 153 | qk_norm: rmsnorm # "layernorm" or None 154 | mod_norm: True # modality-specific normalization 155 | pe_config: # choose from 'sinusoid', 'sinusoid_all' and 'rope' 156 | pos: 157 | type: rope 158 | base_size: 64 159 | view: 160 | type: sinusoid_all 161 | base: 70007 162 | max: 8 163 | zero_init: False 164 | mod: 165 | type: sinusoid_all 166 | base: 30003 167 | max: 5 # Local (view-dependent): RGB + Ray + Disparity + Text Global (view-independent): global text description 168 | zero_init: False -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | from .data_preprocessor import Preprocessor -------------------------------------------------------------------------------- /docs/inference-pipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/docs/inference-pipe.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/047.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/047.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/047.txt: -------------------------------------------------------------------------------- 1 | 1202.11938 2 | 1202.11938 3 | 340.00000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/047_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/047_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/047_ext.txt: -------------------------------------------------------------------------------- 1 | 0.03058 2 | -0.93401 3 | 0.35594 4 | 0.72073 5 | -0.13265 6 | 0.34916 7 | 0.92763 8 | 1.12437 9 | -0.99069 10 | -0.07559 11 | -0.11322 12 | -11.29426 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/051.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/051.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/051.txt: -------------------------------------------------------------------------------- 1 | 1206.45020 2 | 1206.45020 3 | 340.00000 4 | 605.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/051_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/051_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/051_ext.txt: -------------------------------------------------------------------------------- 1 | -0.16954 2 | -0.97649 3 | 0.13312 4 | 0.47076 5 | -0.12253 6 | 0.15491 7 | 0.98030 8 | 1.05989 9 | -0.97788 10 | 0.14989 11 | -0.14591 12 | -12.12915 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/060.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/060.jpg 
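A note on the per-view camera files in `examples/co3dv2-samples/` (the pattern above repeats for every sample below): each `NNN.txt` holds four floats and each `NNN_ext.txt` holds sixteen floats ending in `0 0 0 1`, i.e. a flattened 4x4 homogeneous matrix. The sketch below shows one plausible way to read them; the ordering of the intrinsics as `fx, fy, cx, cy` and the row-major layout of the extrinsic matrix are assumptions inferred from the values, and the repository's own loader (`data/data_preprocessor.py`) remains the source of truth for the exact camera convention (e.g. world-to-camera vs. camera-to-world).

```python
# Sketch only: parse one example view's camera files.
# Assumptions (not taken from the repo's loader): intrinsics are fx, fy, cx, cy
# in pixels; the extrinsic file is a row-major 4x4 homogeneous matrix.
import numpy as np

def load_intrinsics(path):
    fx, fy, cx, cy = np.loadtxt(path)
    return np.array([[fx, 0.0, cx],
                     [0.0, fy, cy],
                     [0.0, 0.0, 1.0]])

def load_extrinsics(path):
    return np.loadtxt(path).reshape(4, 4)   # last row is 0 0 0 1

K = load_intrinsics("examples/co3dv2-samples/195_20989_41543/047.txt")
M = load_extrinsics("examples/co3dv2-samples/195_20989_41543/047_ext.txt")
R, t = M[:3, :3], M[:3, 3]                  # rotation block and translation
```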
-------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/060.txt: -------------------------------------------------------------------------------- 1 | 1196.35779 2 | 1196.35779 3 | 340.00000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/060_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/060_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/060_ext.txt: -------------------------------------------------------------------------------- 1 | -0.55893 2 | -0.75033 3 | -0.35298 4 | 0.22116 5 | -0.23823 6 | -0.26243 7 | 0.93508 8 | 0.66358 9 | -0.79425 10 | 0.60674 11 | -0.03207 12 | -13.40600 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/084.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/084.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/084.txt: -------------------------------------------------------------------------------- 1 | 1206.86255 2 | 1206.86255 3 | 339.00000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/084_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/084_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/084_ext.txt: -------------------------------------------------------------------------------- 1 | -0.58065 2 | 0.63914 3 | -0.50432 4 | 0.05580 5 | -0.81253 6 | -0.49396 7 | 0.30951 8 | 0.52403 9 | -0.05130 10 | 0.58949 11 | 0.80614 12 | -11.59393 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/122.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/122.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/122.txt: -------------------------------------------------------------------------------- 1 | 1214.67126 2 | 1214.67126 3 | 340.00000 4 | 605.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/122_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/122_depth.png 
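The `*_depth.png` files above store per-view depth for the CO3D samples, and configs/config_stage3.yaml models this modality as a `disparity` channel plus a `valid` mask. The mapping from stored pixel values to metric depth (scale, units, and how invalid pixels are flagged) is not spelled out in these listings, so the loader below is only illustrative: the `scale` constant and the zero-means-invalid rule are placeholder assumptions, and `data/data_preprocessor.py` should be treated as authoritative.

```python
# Illustrative only: turn a depth PNG into the disparity + validity pair used
# by the "depth" modality in configs/config_stage3.yaml.
# The raw-value-to-metric conversion (scale=1000.0) and the zero-as-invalid
# rule are placeholder assumptions, not the repository's actual convention.
import numpy as np
from PIL import Image

def load_depth_as_disparity(path, scale=1000.0):
    raw = np.asarray(Image.open(path)).astype(np.float32)
    depth = raw / scale                     # assumed metric conversion
    valid = depth > 0.0                     # assumed invalid-pixel rule
    disparity = np.zeros_like(depth)
    disparity[valid] = 1.0 / depth[valid]
    return disparity, valid
```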
-------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/122_ext.txt: -------------------------------------------------------------------------------- 1 | 0.51593 2 | 0.49504 3 | 0.69911 4 | 0.39027 5 | -0.78450 6 | 0.60082 7 | 0.15351 8 | 0.70223 9 | -0.34405 10 | -0.62765 11 | 0.69834 12 | -11.89899 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/126.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/126.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/126.txt: -------------------------------------------------------------------------------- 1 | 1238.41101 2 | 1238.41101 3 | 340.00000 4 | 605.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/126_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/126_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/126_ext.txt: -------------------------------------------------------------------------------- 1 | 0.55931 2 | 0.14791 3 | 0.81565 4 | 0.43170 5 | -0.68172 6 | 0.64188 7 | 0.35107 8 | 1.23148 9 | -0.47162 10 | -0.75241 11 | 0.45985 12 | -12.58318 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/161.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/161.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/161.txt: -------------------------------------------------------------------------------- 1 | 1202.45288 2 | 1202.45288 3 | 339.50000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/161_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/161_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/161_ext.txt: -------------------------------------------------------------------------------- 1 | -0.63609 2 | -0.63176 3 | -0.44302 4 | 0.61886 5 | -0.28926 6 | -0.33704 7 | 0.89595 8 | 0.64184 9 | -0.71534 10 | 0.69806 11 | 0.03165 12 | -13.78141 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/164.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/164.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/164.txt: -------------------------------------------------------------------------------- 1 | 1205.65637 2 | 1205.65637 3 | 339.50000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/164_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/164_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/164_ext.txt: -------------------------------------------------------------------------------- 1 | -0.68144 2 | -0.54741 3 | -0.48578 4 | 0.28494 5 | -0.32165 6 | -0.37220 7 | 0.87064 8 | 0.67835 9 | -0.65740 10 | 0.74954 11 | 0.07756 12 | -13.76035 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/000.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/000.txt: -------------------------------------------------------------------------------- 1 | 2601.26636 2 | 2601.26636 3 | 945.50000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/000_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/000_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/000_ext.txt: -------------------------------------------------------------------------------- 1 | -0.00096 2 | 1.00000 3 | 0.00071 4 | -0.33653 5 | -1.00000 6 | -0.00096 7 | -0.00079 8 | 0.69979 9 | -0.00079 10 | -0.00071 11 | 1.00000 12 | -15.21722 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/038.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/038.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/038.txt: -------------------------------------------------------------------------------- 1 | 2602.71191 2 | 2602.71191 3 | 944.50000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/038_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/038_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/038_ext.txt: -------------------------------------------------------------------------------- 1 | 0.52150 2 | -0.42915 3 | 0.73748 4 | -0.02598 5 | -0.47699 6 | 0.57002 7 | 0.66900 8 | 1.60313 9 | -0.70747 10 | -0.70065 11 | 0.09257 12 | -16.45008 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/051.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/051.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/051.txt: -------------------------------------------------------------------------------- 1 | 2619.51001 2 | 2619.51001 3 | 945.00000 4 | 531.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/051_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/051_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/051_ext.txt: -------------------------------------------------------------------------------- 1 | 0.08393 2 | -0.98197 3 | 0.16938 4 | -1.09742 5 | -0.27862 6 | 0.14008 7 | 0.95013 8 | 1.32587 9 | -0.95673 10 | -0.12694 11 | -0.26184 12 | -16.52538 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/074.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/074.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/074.txt: -------------------------------------------------------------------------------- 1 | 2551.71631 2 | 2551.71631 3 | 945.50000 4 | 531.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/074_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/074_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/074_ext.txt: -------------------------------------------------------------------------------- 1 | -0.53447 2 | -0.30849 3 | -0.78688 4 | 0.88837 5 | -0.53093 6 | -0.60184 7 | 0.59657 8 | 1.00150 9 | -0.65761 10 | 0.73663 11 | 0.15787 12 | -14.44483 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/121.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/121.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/121.txt: -------------------------------------------------------------------------------- 1 | 2577.85205 2 | 2577.85205 3 | 945.50000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/121_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/121_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/121_ext.txt: -------------------------------------------------------------------------------- 1 | 0.45418 2 | 0.63051 3 | 0.62943 4 | 0.15200 5 | -0.85992 6 | 0.49495 7 | 0.12471 8 | 1.70349 9 | -0.23291 10 | -0.59790 11 | 0.76699 12 | -15.90632 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/122.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/122.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/122.txt: -------------------------------------------------------------------------------- 1 | 2582.54541 2 | 2582.54541 3 | 945.00000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/122_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/122_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/122_ext.txt: -------------------------------------------------------------------------------- 1 | 0.52242 2 | 0.40567 3 | 0.75001 4 | 0.08422 5 | -0.78922 6 | 0.56304 7 | 0.24519 8 | 1.42875 9 | -0.32282 10 | -0.72001 11 | 0.61431 12 | -16.41819 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/155.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/155.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/155.txt: -------------------------------------------------------------------------------- 1 | 2578.92993 2 | 2578.92993 3 | 945.00000 4 | 531.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/155_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/155_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/155_ext.txt: -------------------------------------------------------------------------------- 1 | -0.04384 2 | -0.99894 3 | 0.01393 4 | -0.39690 5 | -0.28022 6 | 0.02567 7 | 0.95959 8 | 1.39255 9 | -0.95894 10 | 0.03817 11 | -0.28105 12 | -16.14845 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/176.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/176.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/176.txt: -------------------------------------------------------------------------------- 1 | 2588.79712 2 | 2588.79712 3 | 945.00000 4 | 530.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/176_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/176_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/176_ext.txt: -------------------------------------------------------------------------------- 1 | -0.58317 2 | -0.17790 3 | -0.79263 4 | 0.00340 5 | -0.56393 6 | -0.61365 7 | 0.55264 8 | 0.67918 9 | -0.58472 10 | 0.76927 11 | 0.25754 12 | -14.50829 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/000.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/000.txt: -------------------------------------------------------------------------------- 1 | 811.81531 2 | 811.81531 3 | 237.50000 4 | 418.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/000_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/000_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/000_ext.txt: -------------------------------------------------------------------------------- 1 | 0.04156 2 | 0.99838 3 | 0.03880 4 | -0.47825 5 | -0.99905 6 | 0.04204 7 | -0.01169 8 | 1.66573 9 | -0.01330 10 | -0.03828 11 | 0.99918 12 | -17.26369 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/027.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/027.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/027.txt: -------------------------------------------------------------------------------- 1 | 813.34485 2 | 813.34485 3 | 237.50000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/027_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/027_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/027_ext.txt: -------------------------------------------------------------------------------- 1 | -0.76536 2 | 0.21894 3 | -0.60522 4 | -0.97550 5 | -0.52489 6 | -0.75651 7 | 0.39010 8 | 0.31043 9 | -0.37245 10 | 0.61624 11 | 0.69392 12 | -16.39164 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/038.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/038.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/038.txt: -------------------------------------------------------------------------------- 1 | 812.97998 2 | 812.97998 3 | 237.00000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/038_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/038_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/038_ext.txt: -------------------------------------------------------------------------------- 1 | -0.67144 2 | -0.47797 3 | -0.56632 4 | -1.71187 5 | -0.12918 6 | -0.67701 7 | 0.72455 8 | 0.03485 9 | -0.72972 10 | 0.55964 11 | 0.39282 12 | -16.93951 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/051.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/051.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/051.txt: -------------------------------------------------------------------------------- 1 | 818.33954 2 | 818.33954 3 | 237.50000 4 | 419.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/051_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/051_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/051_ext.txt: -------------------------------------------------------------------------------- 1 | -0.06066 2 | -0.99810 3 | -0.01033 4 | -1.12151 5 | 0.17837 6 | -0.02102 7 | 0.98374 8 | 1.85056 9 | -0.98209 10 | 0.05783 11 | 0.17931 12 | -18.70874 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/060.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/060.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/060.txt: -------------------------------------------------------------------------------- 1 | 821.59253 2 | 821.59253 3 | 237.50000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/060_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/060_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/060_ext.txt: -------------------------------------------------------------------------------- 1 | 0.43974 2 | -0.78924 3 | 0.42863 4 | -0.50657 5 | 0.09524 6 | 0.51554 7 | 0.85156 8 | 1.77140 9 | -0.89306 10 | -0.33364 11 | 0.30187 12 | -18.07548 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/067.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/067.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/067.txt: -------------------------------------------------------------------------------- 1 | 814.11646 2 | 814.11646 3 | 237.50000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/067_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/067_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/067_ext.txt: -------------------------------------------------------------------------------- 1 | 0.65844 2 | -0.47740 3 | 0.58184 4 | -0.69980 5 | -0.07103 6 | 0.73021 7 | 0.67952 8 | 1.48863 9 | -0.74927 10 | -0.48875 11 | 0.44689 12 | -16.04611 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/099.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/099.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/099.txt: -------------------------------------------------------------------------------- 1 | 817.50549 2 | 817.50549 3 | 237.50000 4 | 419.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/099_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/099_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/099_ext.txt: -------------------------------------------------------------------------------- 1 | -0.33144 2 | 0.91811 3 | -0.21731 4 | -0.77130 5 | -0.94027 6 | -0.34041 7 | -0.00411 8 | -0.49282 9 | -0.07775 10 | 0.20297 11 | 0.97609 12 | -14.93493 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/143.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/143.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/143.txt: -------------------------------------------------------------------------------- 1 | 813.87244 2 | 813.87244 3 | 236.50000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/143_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/143_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/143_ext.txt: -------------------------------------------------------------------------------- 1 | -0.41307 2 | -0.83543 3 | -0.36253 4 | -0.94164 5 | 0.04783 6 | -0.41743 7 | 0.90745 8 | -0.04272 9 | -0.90944 10 | 0.35750 11 | 0.21239 12 | -18.61402 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/010.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/010.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/010.txt: -------------------------------------------------------------------------------- 1 | 1894.62366 2 | 1894.62366 3 | 899.50000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/010_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/010_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/010_ext.txt: -------------------------------------------------------------------------------- 1 | -0.48597 2 | 0.78452 3 | -0.38519 4 | -0.33164 5 | -0.87386 6 | -0.44330 7 | 0.19961 8 | 1.50121 9 | -0.01416 10 | 0.43361 11 | 0.90099 12 | -14.69950 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/017.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/017.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/017.txt: -------------------------------------------------------------------------------- 1 | 1877.22815 2 | 1877.22815 3 | 899.00000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/017_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/017_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/017_ext.txt: -------------------------------------------------------------------------------- 1 | -0.67710 2 | 0.43300 3 | -0.59501 4 | -0.31781 5 | -0.69160 6 | -0.65071 7 | 0.31349 8 | 1.46675 9 | -0.25144 10 | 0.62377 11 | 0.74006 12 | -13.76519 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/041.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/041.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/041.txt: -------------------------------------------------------------------------------- 1 | 1841.41174 2 | 1841.41174 3 | 900.00000 4 | 506.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/041_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/041_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/041_ext.txt: -------------------------------------------------------------------------------- 1 | -0.09783 2 | -0.96769 3 | -0.23238 4 | -0.89872 5 | 0.15031 6 | -0.24519 7 | 0.95775 8 | 1.10148 9 | -0.98379 10 | 0.05877 11 | 0.16945 12 | -12.55235 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/072.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/072.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/072.txt: -------------------------------------------------------------------------------- 1 | 1810.31189 2 | 1810.31189 3 | 899.50000 4 | 506.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/072_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/072_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/072_ext.txt: -------------------------------------------------------------------------------- 1 | 0.71123 2 | 0.32405 3 | 0.62381 4 | -1.13456 5 | -0.64928 6 | 0.64295 7 | 0.40628 8 | 1.63326 9 | -0.26942 10 | -0.69398 11 | 0.66768 12 | -14.39440 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/099.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/099.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/099.txt: -------------------------------------------------------------------------------- 1 | 1877.19104 2 | 1877.19104 3 | 899.50000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/099_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/099_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/099_ext.txt: -------------------------------------------------------------------------------- 1 | -0.27407 2 | 0.94088 3 | -0.19909 4 | -0.06255 5 | -0.95632 6 | -0.24475 7 | 0.15986 8 | 1.67176 9 | 0.10169 10 | 0.23420 11 | 0.96686 12 | -15.96998 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/107.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/107.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/107.txt: -------------------------------------------------------------------------------- 1 | 1880.61340 2 | 1880.61340 3 | 899.50000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/107_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/107_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/107_ext.txt: -------------------------------------------------------------------------------- 1 | -0.60230 2 | 0.62224 3 | -0.50005 4 | -0.95324 5 | -0.78889 6 | -0.55970 7 | 0.25374 8 | 1.60947 9 | -0.12199 10 | 0.54731 11 | 0.82799 12 | -14.19761 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/118.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/118.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/118.txt: -------------------------------------------------------------------------------- 1 | 1869.02002 2 | 1869.02002 3 | 899.50000 4 | 506.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/118_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/118_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/118_ext.txt: -------------------------------------------------------------------------------- 1 | -0.70884 2 | -0.21607 3 | -0.67147 4 | -0.24353 5 | -0.34049 6 | -0.72887 7 | 0.59398 8 | 1.05765 9 | -0.61775 10 | 0.64966 11 | 0.44308 12 | -12.96866 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/130.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/130.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/130.txt: -------------------------------------------------------------------------------- 1 | 1849.60815 2 | 1849.60815 3 | 899.00000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/130_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/130_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/130_ext.txt: -------------------------------------------------------------------------------- 1 | -0.24049 2 | -0.90529 3 | -0.35018 4 | -0.82363 5 | 0.09862 6 | -0.38168 7 | 0.91902 8 | 1.25895 9 | -0.96563 10 | 0.18648 11 | 0.18107 12 | -12.66461 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/000.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/000.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/000.txt: -------------------------------------------------------------------------------- 1 | 556.14813 2 | 556.14813 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/000_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/000_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/000_ext.txt: -------------------------------------------------------------------------------- 1 | -0.88550 2 | 0.35273 3 | -0.30244 4 | 0.10619 5 | -0.42134 6 | -0.88396 7 | 0.20269 8 | 0.85745 9 | -0.19585 10 | 0.30691 11 | 0.93137 12 | -6.91243 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/028.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/028.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/028.txt: -------------------------------------------------------------------------------- 1 | 533.91620 2 | 533.91620 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/028_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/028_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/028_ext.txt: -------------------------------------------------------------------------------- 1 | -0.19513 2 | 0.97940 3 | -0.05199 4 | -0.13463 5 | -0.98037 6 | -0.19630 7 | -0.01825 8 | 0.63380 9 | -0.02808 10 | 0.04741 11 | 0.99848 12 | -6.94058 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/044.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/044.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/044.txt: -------------------------------------------------------------------------------- 1 | 549.66217 2 | 549.66217 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/044_depth.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/044_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/044_ext.txt: -------------------------------------------------------------------------------- 1 | 0.34252 2 | 0.92934 3 | 0.13789 4 | -0.14764 5 | -0.93949 6 | 0.33970 7 | 0.04425 8 | 0.57568 9 | -0.00572 10 | -0.14470 11 | 0.98946 12 | -7.51767 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/062.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/062.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/062.txt: -------------------------------------------------------------------------------- 1 | 544.20050 2 | 544.20050 3 | 175.00000 4 | 319.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/062_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/062_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/062_ext.txt: -------------------------------------------------------------------------------- 1 | 0.86101 2 | 0.36287 3 | 0.35635 4 | -0.27652 5 | -0.44818 6 | 0.87255 7 | 0.19439 8 | 1.03958 9 | -0.24039 10 | -0.32708 11 | 0.91391 12 | -7.43995 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/068.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/068.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/068.txt: -------------------------------------------------------------------------------- 1 | 542.48267 2 | 542.48267 3 | 175.00000 4 | 319.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/068_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/068_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/068_ext.txt: -------------------------------------------------------------------------------- 1 | 0.89591 2 | 0.11736 3 | 0.42844 4 | -0.26967 5 | -0.25885 6 | 0.92173 7 | 0.28880 8 | 0.92617 9 | -0.36101 10 | -0.36964 11 | 0.85617 12 | -7.90157 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | 
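A quick, convention-agnostic consistency check that is handy when adapting the parsing sketch above to new data: the upper-left 3x3 block of every `*_ext.txt` matrix should be numerically close to a rotation matrix (orthonormal, determinant 1), assuming the row-major 4x4 layout suggested by the fixed `0 0 0 1` bottom row.

```python
# Sanity check on an extrinsics file (assumes row-major 4x4 layout).
import numpy as np

M = np.loadtxt("examples/co3dv2-samples/422_58670_113666/068_ext.txt").reshape(4, 4)
R = M[:3, :3]
assert np.allclose(R @ R.T, np.eye(3), atol=1e-3)   # orthonormal rows/columns
assert abs(np.linalg.det(R) - 1.0) < 1e-3            # proper rotation, det ~ 1
```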
-------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/074.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/074.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/074.txt: -------------------------------------------------------------------------------- 1 | 558.13892 2 | 558.13892 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/074_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/074_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/074_ext.txt: -------------------------------------------------------------------------------- 1 | 0.88700 2 | -0.14835 3 | 0.43729 4 | -0.21878 5 | -0.04606 6 | 0.91384 7 | 0.40345 8 | 0.57693 9 | -0.45947 10 | -0.37800 11 | 0.80374 12 | -7.53494 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/098.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/098.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/098.txt: -------------------------------------------------------------------------------- 1 | 560.99487 2 | 560.99487 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/098_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/098_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/098_ext.txt: -------------------------------------------------------------------------------- 1 | -0.07326 2 | -0.99731 3 | 0.00211 4 | -0.38476 5 | 0.70560 6 | -0.05034 7 | 0.70682 8 | 0.43857 9 | -0.70482 10 | 0.05327 11 | 0.70739 12 | -7.58088 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/101.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/101.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/101.txt: -------------------------------------------------------------------------------- 1 | 553.15106 2 | 553.15106 3 | 175.00000 4 | 319.00000 5 | -------------------------------------------------------------------------------- 
/examples/co3dv2-samples/422_58670_113666/101_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/101_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/101_ext.txt: -------------------------------------------------------------------------------- 1 | -0.16198 2 | -0.98575 3 | -0.04545 4 | -0.64704 5 | 0.69978 6 | -0.14722 7 | 0.69903 8 | 0.52612 9 | -0.69576 10 | 0.08143 11 | 0.71365 12 | -7.49813 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/single-view/armor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/single-view/armor.png -------------------------------------------------------------------------------- /examples/single-view/armor.txt: -------------------------------------------------------------------------------- 1 | 55 2 | -------------------------------------------------------------------------------- /examples/single-view/ghost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/single-view/ghost.png -------------------------------------------------------------------------------- /examples/single-view/ghost.txt: -------------------------------------------------------------------------------- 1 | 60 2 | -------------------------------------------------------------------------------- /examples/single-view/jacket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/single-view/jacket.png -------------------------------------------------------------------------------- /examples/single-view/pile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/single-view/pile.png -------------------------------------------------------------------------------- /examples/single-view/pile.txt: -------------------------------------------------------------------------------- 1 | 65 2 | -------------------------------------------------------------------------------- /examples/single-view/skull.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/single-view/skull.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/061.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41069043/061.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/061.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 
| 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/072.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41069043/072.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/072.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 | 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/081.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41069043/081.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/081.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 | 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/052.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41125709/052.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/052.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 | 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/053.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41125709/053.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/053.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 | 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/054.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41125709/054.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/054.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 | 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/051.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/195_20989_41543/051.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/051.txt: -------------------------------------------------------------------------------- 1 | 1206.45020 2 | 1206.45020 3 | 340.00000 4 | 605.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/084.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/195_20989_41543/084.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/084.txt: -------------------------------------------------------------------------------- 1 | 1206.86255 2 | 1206.86255 3 | 339.00000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/126.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/195_20989_41543/126.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/126.txt: -------------------------------------------------------------------------------- 1 | 1238.41101 2 | 1238.41101 3 | 340.00000 4 | 605.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/198_21285_41285/000.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/000.txt: -------------------------------------------------------------------------------- 1 | 2601.26636 2 | 2601.26636 3 | 945.50000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/038.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/198_21285_41285/038.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/038.txt: -------------------------------------------------------------------------------- 1 | 2602.71191 2 | 2602.71191 3 | 944.50000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/074.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/198_21285_41285/074.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/074.txt: 
-------------------------------------------------------------------------------- 1 | 2551.71631 2 | 2551.71631 3 | 945.50000 4 | 531.50000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/201_21613_43652/000.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/000.txt: -------------------------------------------------------------------------------- 1 | 811.81531 2 | 811.81531 3 | 237.50000 4 | 418.50000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/038.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/201_21613_43652/038.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/038.txt: -------------------------------------------------------------------------------- 1 | 812.97998 2 | 812.97998 3 | 237.00000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/067.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/201_21613_43652/067.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/067.txt: -------------------------------------------------------------------------------- 1 | 814.11646 2 | 814.11646 3 | 237.50000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/010.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/31_1359_4114/010.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/010.txt: -------------------------------------------------------------------------------- 1 | 1894.62366 2 | 1894.62366 3 | 899.50000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/041.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/31_1359_4114/041.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/041.txt: -------------------------------------------------------------------------------- 1 | 1841.41174 2 | 1841.41174 3 | 900.00000 4 | 506.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/072.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/31_1359_4114/072.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/072.txt: -------------------------------------------------------------------------------- 1 | 1810.31189 2 | 1810.31189 3 | 899.50000 4 | 506.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/422_58670_113666/000.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/000.txt: -------------------------------------------------------------------------------- 1 | 556.14813 2 | 556.14813 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/062.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/422_58670_113666/062.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/062.txt: -------------------------------------------------------------------------------- 1 | 544.20050 2 | 544.20050 3 | 175.00000 4 | 319.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/101.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/422_58670_113666/101.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/101.txt: -------------------------------------------------------------------------------- 1 | 553.15106 2 | 553.15106 3 | 175.00000 4 | 319.00000 5 | -------------------------------------------------------------------------------- /model/dinov2.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | from typing import Optional, Tuple, Union, Dict, List 6 | import torch 7 | from torch import nn 8 | from torch.nn import functional as F 9 | from transformers.modeling_outputs import BaseModelOutput 10 | from transformers.models.dinov2.configuration_dinov2 import Dinov2Config 11 | from transformers.models.dinov2.modeling_dinov2 import ( 12 | BaseModelOutput, 13 | BaseModelOutputWithPooling, 14 | Dinov2SelfAttention, 15 | Dinov2Layer, 16 | Dinov2PreTrainedModel, 17 | Dinov2Embeddings, 18 | Dinov2PatchEmbeddings, 19 | ) 20 | 21 | from .dinov2_adaln.adaln import AdaLayerNorm 22 | 23 | class Dinov2SelfAttentionSDP(Dinov2SelfAttention): 24 | def __init__(self, config: Dinov2Config) -> None: 25 | super().__init__(config) 26 | self.dropout_prob = config.attention_probs_dropout_prob 27 | assert self.dropout_prob == 0.0 28 | 29 | def forward( 30 | self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False 31 | ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: 32 | assert head_mask is None 33 | assert not output_attentions 34 | 35 | mixed_query_layer = self.query(hidden_states) 36 | 37 | key_layer = self.transpose_for_scores(self.key(hidden_states)) 38 | value_layer = self.transpose_for_scores(self.value(hidden_states)) 39 | query_layer = self.transpose_for_scores(mixed_query_layer) 40 | 41 | context_layer = F.scaled_dot_product_attention(query_layer, key_layer, value_layer, dropout_p=self.dropout_prob) 42 | 43 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() 44 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) 45 | context_layer = context_layer.view(new_context_layer_shape) 46 | 47 | outputs = (context_layer,) 48 | 49 | return outputs 50 | 51 | class AdaDinov2Layer(Dinov2Layer): 52 | def __init__(self, config: Dinov2Config) -> None: 53 | super().__init__(config) 54 | self.norm1 = AdaLayerNorm(config.hidden_size, eps=config.layer_norm_eps, mod_act=config.hidden_act) 55 | self.norm2 = AdaLayerNorm(config.hidden_size, eps=config.layer_norm_eps, mod_act=config.hidden_act) 56 | self.attention.attention = Dinov2SelfAttentionSDP(config) 57 | 58 | def forward( 59 | self, 60 | hidden_states: torch.Tensor, 61 | head_mask: Optional[torch.Tensor] = None, 62 | modulation: Optional[torch.Tensor] = None, 63 | output_attentions: bool = False, 64 | ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: 65 | self_attention_outputs = self.attention( 66 | self.norm1(hidden_states, modulation), # in Dinov2, layernorm is applied before self-attention 67 | head_mask, 68 | output_attentions=output_attentions, 69 | ) 70 | attention_output = self_attention_outputs[0] 71 | 72 | attention_output = self.layer_scale1(attention_output) 73 | outputs = self_attention_outputs[1:] # add self attentions if we output attention weights 74 | 75 | # first residual connection 76 | hidden_states = attention_output + hidden_states 77 | 78 | # in Dinov2, layernorm is also applied after self-attention 79 | layer_output = self.norm2(hidden_states, modulation) 80 | layer_output = self.mlp(layer_output) 81 | layer_output = self.layer_scale2(layer_output) 82 | 83 | # second residual connection 84 | layer_output = layer_output + hidden_states 85 | 86 | outputs = (layer_output,) + outputs 87 | 88 | return outputs 89 | 90 | class AdaDinov2Encoder(nn.Module): 91 | def __init__(self, config: Dinov2Config) -> None: 92 | super().__init__() 93 | self.config = config 94 | self.layer = nn.ModuleList([AdaDinov2Layer(config) for _ in 
range(config.num_hidden_layers)]) 95 | self.gradient_checkpointing = False 96 | 97 | def forward( 98 | self, 99 | hidden_states: torch.Tensor, 100 | head_mask: Optional[torch.Tensor] = None, 101 | modulation: Optional[torch.Tensor] = None, 102 | output_attentions: bool = False, 103 | output_hidden_states: bool = False, 104 | return_dict: bool = True, 105 | ) -> Union[tuple, BaseModelOutput]: 106 | all_hidden_states = () if output_hidden_states else None 107 | all_self_attentions = () if output_attentions else None 108 | 109 | for i, layer_module in enumerate(self.layer): 110 | if output_hidden_states: 111 | all_hidden_states = all_hidden_states + (hidden_states,) 112 | 113 | layer_head_mask = head_mask[i] if head_mask is not None else None 114 | layer_inputs = (hidden_states, layer_head_mask, modulation, output_attentions) 115 | 116 | if self.gradient_checkpointing and self.training: 117 | layer_outputs = self._gradient_checkpointing_func( 118 | layer_module.__call__, 119 | *layer_inputs 120 | ) 121 | else: 122 | layer_outputs = layer_module(*layer_inputs) 123 | 124 | hidden_states = layer_outputs[0] 125 | 126 | if output_attentions: 127 | all_self_attentions = all_self_attentions + (layer_outputs[1],) 128 | 129 | if output_hidden_states: 130 | all_hidden_states = all_hidden_states + (hidden_states,) 131 | 132 | if not return_dict: 133 | return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) 134 | return BaseModelOutput( 135 | last_hidden_state=hidden_states, 136 | hidden_states=all_hidden_states, 137 | attentions=all_self_attentions, 138 | ) 139 | 140 | class AdaDinov2PreTrainedModel(Dinov2PreTrainedModel): 141 | def _init_weights(self, module: nn.Linear | nn.Conv2d | nn.LayerNorm | AdaLayerNorm) -> None: 142 | super()._init_weights(module) 143 | if isinstance(module, AdaLayerNorm): 144 | module.mod_init() 145 | 146 | class AdaDinov2Model(AdaDinov2PreTrainedModel): 147 | def __init__(self, config: Dinov2Config): 148 | super().__init__(config) 149 | self.config = config 150 | 151 | self.embeddings = Dinov2Embeddings(config) 152 | self.encoder = AdaDinov2Encoder(config) 153 | 154 | self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) # TODO whether change this to ada 155 | 156 | # Initialize weights and apply final processing 157 | self.post_init() 158 | 159 | def get_input_embeddings(self) -> Dinov2PatchEmbeddings: 160 | return self.embeddings.patch_embeddings 161 | 162 | def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: 163 | """ 164 | Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base 165 | class PreTrainedModel 166 | """ 167 | for layer, heads in heads_to_prune.items(): 168 | self.encoder.layer[layer].attention.prune_heads(heads) 169 | 170 | def set_patch_size(self, new_size): 171 | # NOTE call immediately after from_pretrained 172 | # NOTE error is large (~0.5), not used for now, input size is 224 or 448, patch size remain 14 173 | num_patch = self.config.image_size // self.config.patch_size 174 | self.config.patch_size = new_size 175 | self.config.image_size = new_size * num_patch 176 | new_projection = nn.Conv2d( 177 | self.config.num_channels, 178 | self.config.hidden_size, 179 | kernel_size=new_size, 180 | stride=new_size).eval() 181 | with torch.no_grad(): 182 | new_projection.bias[:] = self.embeddings.patch_embeddings.projection.bias 183 | new_projection.weight[:] = F.interpolate( 184 | self.embeddings.patch_embeddings.projection.weight, 185 | new_size, mode='bilinear', align_corners=False, 186 | ) 187 | self.embeddings.patch_embeddings.projection = new_projection 188 | 189 | def forward( 190 | self, 191 | pixel_values: Optional[torch.Tensor] = None, 192 | bool_masked_pos: Optional[torch.Tensor] = None, 193 | head_mask: Optional[torch.Tensor] = None, 194 | modulation: Optional[torch.Tensor] = None, 195 | output_attentions: Optional[bool] = None, 196 | output_hidden_states: Optional[bool] = None, 197 | return_dict: Optional[bool] = None, 198 | ) -> Union[Tuple, BaseModelOutputWithPooling]: 199 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions 200 | output_hidden_states = ( 201 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 202 | ) 203 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 204 | 205 | if pixel_values is None: 206 | raise ValueError("You have to specify pixel_values") 207 | 208 | # Prepare head mask if needed 209 | # 1.0 in head_mask indicate we keep the head 210 | # attention_probs has shape bsz x n_heads x N x N 211 | # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] 212 | # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] 213 | head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) 214 | 215 | embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) 216 | 217 | encoder_outputs = self.encoder( 218 | embedding_output, 219 | head_mask=head_mask, 220 | modulation=modulation, 221 | output_attentions=output_attentions, 222 | output_hidden_states=output_hidden_states, 223 | return_dict=return_dict, 224 | ) 225 | sequence_output = encoder_outputs[0] 226 | sequence_output = self.layernorm(sequence_output) 227 | pooled_output = sequence_output[:, 0, :] 228 | 229 | if not return_dict: 230 | head_outputs = (sequence_output, pooled_output) 231 | return head_outputs + encoder_outputs[1:] 232 | 233 | return BaseModelOutputWithPooling( 234 | last_hidden_state=sequence_output, 235 | pooler_output=pooled_output, 236 | hidden_states=encoder_outputs.hidden_states, 237 | attentions=encoder_outputs.attentions, 238 | ) 239 | -------------------------------------------------------------------------------- /model/dinov2_adaln/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # -------------------------------------------------------------------------------- /model/dinov2_adaln/adaln.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import torch 6 | from torch import Tensor 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.nn.modules.normalization import _shape_t 10 | from transformers.activations import get_activation 11 | 12 | # https://github.com/yenchenlin/dinov2-adaLN/commit/b195e7b7ebeefc0b249173b23734b1fb64227a9f 13 | class AdaLayerNorm(nn.LayerNorm): 14 | def __init__(self, normalized_shape: _shape_t, mod_shape: _shape_t = None, eps: float = 0.00001, mod_act='gelu', elementwise_affine: bool = True, device=None, dtype=None) -> None: 15 | super().__init__(normalized_shape, eps, elementwise_affine, device, dtype) 16 | if mod_shape is None: 17 | mod_shape = normalized_shape 18 | self.mod_linear = nn.Sequential( 19 | get_activation(mod_act), 20 | nn.Linear(mod_shape, 2 * normalized_shape, bias=True) 21 | ) if mod_shape > 0 else None 22 | 23 | def modulate(self, x, scale, shift): 24 | return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) 25 | 26 | def mod_init(self): 27 | nn.init.zeros_(self.mod_linear[-1].weight) # TODO why cannot use .zeros_() here 28 | nn.init.zeros_(self.mod_linear[-1].bias) 29 | 30 | def forward(self, input: Tensor, modulation: Tensor = None) -> Tensor: 31 | normed = super().forward(input) 32 | if modulation is None or self.mod_linear is None: 33 | return normed 34 | scale, shift = self.mod_linear(modulation).chunk(2, dim=1) 35 | return self.modulate(normed, scale, shift) 36 | -------------------------------------------------------------------------------- /model/feature_extractors.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | def resize(image, size=None, scale_factor=None): 10 | return nn.functional.interpolate( 11 | image, 12 | size=size, 13 | scale_factor=scale_factor, 14 | mode="bilinear", 15 | align_corners=False, 16 | ) 17 | 18 | 19 | class SpatialDino(nn.Module): 20 | def __init__( 21 | self, 22 | freeze_weights=True, 23 | model_type="dinov2_vits14", 24 | num_patches_x=16, 25 | num_patches_y=16, 26 | ): 27 | super().__init__() 28 | self.model = torch.hub.load("facebookresearch/dinov2", model_type) 29 | self.feature_dim = self.model.embed_dim 30 | self.num_patches_x = num_patches_x 31 | self.num_patches_y = num_patches_y 32 | if freeze_weights: 33 | for param in self.model.parameters(): 34 | param.requires_grad = False 35 | 36 | def forward(self, x, autoresize=False): 37 | """ 38 | Spatial dimensions of output will be H // 14, W // 14. If autoresize is True, 39 | then the output will be resized to the correct dimensions. 40 | 41 | Args: 42 | x (torch.Tensor): Images (B, C, H, W). Should be ImageNet normalized. 43 | autoresize (bool): Whether to resize the input to match the num_patch 44 | dimensions. 
45 | 46 | Returns: 47 | feature_map (torch.tensor): (B, C, h, w) 48 | """ 49 | *B, c, h, w = x.shape 50 | 51 | x = x.reshape(-1, c, h, w) 52 | 53 | # Output will be (B, H * W, C) 54 | features = self.model.forward_features(x)["x_norm_patchtokens"] 55 | features = features.permute(0, 2, 1) 56 | features = features.reshape( # (B, C, H, W) 57 | -1, self.feature_dim, h // 14, w // 14 58 | ) 59 | if autoresize: 60 | features = resize(features, size=(self.num_patches_y, self.num_patches_x)) 61 | 62 | features = features.reshape( 63 | *B, self.feature_dim, self.num_patches_y, self.num_patches_x 64 | ) 65 | return features 66 | -------------------------------------------------------------------------------- /model/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # -------------------------------------------------------------------------------- /model/inference/ddpm.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import ipdb # noqa: F401 6 | import torch 7 | from tqdm.auto import tqdm 8 | from typing import Any, Callable, Dict, List, Optional, Union 9 | import inspect 10 | 11 | rescale_fn = { 12 | "zero": lambda x: 0, 13 | "identity": lambda x: x, 14 | "square": lambda x: x**2, 15 | "square_root": lambda x: torch.sqrt(x), 16 | } 17 | 18 | def retrieve_timesteps( 19 | scheduler, 20 | num_inference_steps: Optional[int] = None, 21 | device: Optional[Union[str, torch.device]] = None, 22 | timesteps: Optional[List[int]] = None, 23 | **kwargs, 24 | ): 25 | """ 26 | Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles 27 | custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. 28 | 29 | Args: 30 | scheduler (`SchedulerMixin`): 31 | The scheduler to get timesteps from. 32 | num_inference_steps (`int`): 33 | The number of diffusion steps used when generating samples with a pre-trained model. If used, 34 | `timesteps` must be `None`. 35 | device (`str` or `torch.device`, *optional*): 36 | The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 37 | timesteps (`List[int]`, *optional*): 38 | Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default 39 | timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` 40 | must be `None`. 41 | 42 | Returns: 43 | `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the 44 | second element is the number of inference steps. 45 | """ 46 | if timesteps is not None: 47 | accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) 48 | if not accepts_timesteps: 49 | raise ValueError( 50 | f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" 51 | f" timestep schedules. Please check whether you are using the correct scheduler." 
52 | ) 53 | scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) 54 | timesteps = scheduler.timesteps 55 | num_inference_steps = len(timesteps) 56 | else: 57 | scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) 58 | timesteps = scheduler.timesteps 59 | return timesteps, num_inference_steps 60 | 61 | 62 | def shift_scale_denormalize(x, shift, scale): 63 | '''denormalize the normalized data using the shfit/mean and scale/std.''' 64 | return x / scale + shift 65 | 66 | 67 | def inference_ddpm_call_varmod( 68 | model, 69 | scheduler, 70 | device, 71 | data=None, 72 | num_inference_steps=1000, 73 | guidance_scale=1.0, 74 | cfg=None, 75 | ): 76 | """ 77 | Implements DDPM-style inference. 78 | 79 | To get multiple samples, batch the images multiple times. 80 | 81 | Args: 82 | model: Ray Diffuser. 83 | images (torch.Tensor): (B, N, C, H, W). 84 | crop_parameters (torch.Tensor): (B, N, 4) or None. 85 | pbar (bool): If True, shows a progress bar. 86 | """ 87 | # batch_size, num_images, num_channel, num_patches_x, num_patches_y = data['rgb']['data'].shape 88 | batch_size, num_images = data['view_id'].shape[:2] 89 | scheduler.set_timesteps(num_inference_steps, device=device) 90 | timesteps = scheduler.timesteps 91 | 92 | cond_mods = [mod for mod in model.modalities if mod in data['conds']] 93 | gen_mods = [mod for mod in model.modalities if mod in data['gens']] 94 | use_rgb, use_ray, use_depth = 'rgb' in gen_mods, 'ray' in gen_mods, 'depth' in gen_mods 95 | 96 | x_t = data 97 | with torch.no_grad(): 98 | for t in tqdm(timesteps): 99 | # predict the noise residual 100 | mmod_preds = model( 101 | t=t.repeat(batch_size)-1, 102 | data={**x_t, 'uncond': False}, 103 | ) 104 | noise_pred_rgb = mmod_preds['rgb'] if use_rgb else None 105 | noise_pred_ray = mmod_preds['ray'] if use_ray else None 106 | noise_pred_depth = mmod_preds['depth'] if use_depth else None 107 | 108 | if guidance_scale > 1.0: 109 | mmod_preds_uncond = model( 110 | t=t.repeat(batch_size)-1, 111 | data={**x_t, 'uncond': True}, 112 | ) 113 | noise_pred_rgb_uncond = mmod_preds_uncond['rgb'] if use_rgb else None 114 | noise_pred_ray_uncond = mmod_preds_uncond['ray'] if use_ray else None 115 | noise_pred_depth_uncond = mmod_preds_uncond['depth'] if use_depth else None 116 | noise_pred_rgb = noise_pred_rgb_uncond + guidance_scale * (noise_pred_rgb - noise_pred_rgb_uncond) if use_rgb else None 117 | noise_pred_ray = noise_pred_ray_uncond + guidance_scale * (noise_pred_ray - noise_pred_ray_uncond) if use_ray else None 118 | noise_pred_depth = noise_pred_depth_uncond + guidance_scale * (noise_pred_depth - noise_pred_depth_uncond) if use_depth else None 119 | 120 | # compute the previous noisy sample x_t -> x_t-1 121 | if use_rgb: 122 | x_t_rgb = scheduler.step(noise_pred_rgb.flatten(0, 1).float(), t-1, x_t['gens']['rgb'].flatten(0, 1).float(), return_dict=False)[0].to(noise_pred_rgb.dtype) 123 | x_t_rgb = x_t_rgb.reshape((batch_size, num_images) + x_t_rgb.shape[1:]) 124 | x_t['gens']['rgb'] = x_t_rgb 125 | x_t['gens']['rgb_mask'] = mmod_preds['rgb_mask'] 126 | if use_ray: 127 | x_t_ray = scheduler.step(noise_pred_ray.flatten(0, 1).float(), t-1, x_t['gens']['ray'].flatten(0, 1).float(), return_dict=False)[0].to(noise_pred_ray.dtype) 128 | x_t_ray = x_t_ray.reshape((batch_size, num_images) + x_t_ray.shape[1:]) 129 | x_t['gens']['ray'] = x_t_ray 130 | x_t['gens']['ray_mask'] = mmod_preds['ray_mask'] 131 | if use_depth: 132 | x_t_depth = scheduler.step(noise_pred_depth.flatten(0, 1).float(), t-1, 
x_t['gens']['depth'][:, :, 0:1].flatten(0, 1).float(), return_dict=False)[0].to(noise_pred_depth.dtype) 133 | x_t_depth = x_t_depth.reshape((batch_size, num_images) + x_t_depth.shape[1:]) 134 | x_t['gens']['depth'] = torch.cat([x_t_depth, x_t['gens']['depth'][:, :, 1:2]], dim=2) 135 | x_t['gens']['depth_mask'] = mmod_preds['depth_mask'] 136 | 137 | # shift-scale denormalize 138 | if use_rgb: 139 | x_t['gens']['rgb'] = shift_scale_denormalize(x_t['gens']['rgb'], cfg.data.shift_scales.rgb[0], cfg.data.shift_scales.rgb[1]) 140 | if use_ray: 141 | if cfg.data.use_plucker: 142 | x_t['gens']['ray'][:, :, :3] = shift_scale_denormalize(x_t['gens']['ray'][:, :, :3], cfg.data.shift_scales.ray.dirs[0], cfg.data.shift_scales.ray.dirs[1]) 143 | x_t['gens']['ray'][:, :, 3:] = shift_scale_denormalize(x_t['gens']['ray'][:, :, 3:], cfg.data.shift_scales.ray.moms[0], cfg.data.shift_scales.ray.moms[1]) 144 | else: 145 | x_t['gens']['ray'][:, :, :3] = shift_scale_denormalize(x_t['gens']['ray'][:, :, :3], cfg.data.shift_scales.ray.origins[0], cfg.data.shift_scales.ray.origins[1]) 146 | x_t['gens']['ray'][:, :, 3:] = shift_scale_denormalize(x_t['gens']['ray'][:, :, 3:], cfg.data.shift_scales.ray.directions[0], cfg.data.shift_scales.ray.directions[1]) 147 | if use_depth: 148 | x_t['gens']['depth'][:, :, :1] = shift_scale_denormalize(x_t['gens']['depth'][:, :, :1], cfg.data.shift_scales.depth[0], cfg.data.shift_scales.depth[1]) 149 | 150 | 151 | return x_t -------------------------------------------------------------------------------- /model/load.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import safetensors 7 | import torch 8 | import torch.utils.checkpoint 9 | from diffusers import DDPMScheduler, DDIMScheduler 10 | from diffusers import HunyuanDiTPipeline 11 | 12 | from model.feature_extractors import SpatialDino 13 | from model.dit import DiT 14 | 15 | def load_model(cfg, checkpoint_path, device='cuda:0', weight_dtype=torch.float16): 16 | # Load scheduler, tokenizer and models. 
17 | if cfg.eval.scheduler == "DDPM": 18 | noise_scheduler = DDPMScheduler.from_pretrained(cfg.model.scheduler_url, subfolder="scheduler") 19 | elif cfg.eval.scheduler == "DDIM": 20 | noise_scheduler = DDIMScheduler.from_pretrained(cfg.model.scheduler_url, subfolder="scheduler") 21 | 22 | # Freeze vae and text_encoder 23 | feature_extractor = SpatialDino( 24 | freeze_weights=True, 25 | model_type="dinov2_vitb14", 26 | num_patches_x=cfg.modalities.rgb.width, 27 | num_patches_y=cfg.modalities.rgb.width, 28 | ) 29 | hunyuan_pipe = HunyuanDiTPipeline.from_pretrained(cfg.model.decoder_url, torch_dtype=torch.float16) 30 | tokenizer, text_encoder = hunyuan_pipe.tokenizer, hunyuan_pipe.text_encoder 31 | tokenizer_2, text_encoder_2 = hunyuan_pipe.tokenizer_2, hunyuan_pipe.text_encoder_2 32 | vae = hunyuan_pipe.vae 33 | del hunyuan_pipe 34 | 35 | # Build model and load from checkpoint 36 | cfg.used_modalities = {key: cfg.modalities[key] for key in ['rgb', 'ray', 'depth', 'local_caption', 'global_caption']} 37 | model = DiT(modalities=cfg.used_modalities, **cfg.model) 38 | if os.path.splitext(checkpoint_path)[-1] == '.safetensors': 39 | state_dict = safetensors.torch.load_file(checkpoint_path) 40 | else: 41 | state_dict = torch.load(checkpoint_path)['module'] 42 | missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) 43 | model.eval() 44 | print('Loaded model from:', checkpoint_path) 45 | print('missing_keys', missing_keys) 46 | print('unexpected_keys', unexpected_keys) 47 | 48 | # Move non-trainables and cast to weight_dtype 49 | vae.to(device, dtype=weight_dtype) 50 | feature_extractor.to(device, dtype=weight_dtype) 51 | text_encoder.to(device, dtype=weight_dtype) 52 | text_encoder_2.to(device, dtype=weight_dtype) 53 | model.to(device, dtype=weight_dtype) 54 | 55 | # Package all components into one dict 56 | models = { 57 | 'model': model, 58 | 'noise_scheduler': noise_scheduler, 59 | 'tokenizer': tokenizer, 60 | 'text_encoder': text_encoder, 61 | 'tokenizer_2': tokenizer_2, 62 | 'text_encoder_2': text_encoder_2, 63 | 'vae': vae, 64 | 'feature_extractor': feature_extractor 65 | } 66 | 67 | return models -------------------------------------------------------------------------------- /model/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # -------------------------------------------------------------------------------- /model/utils/nn.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | from typing import Optional 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | 10 | import xformers.ops as xops 11 | from diffusers.models.attention_processor import Attention 12 | 13 | from .pos_encoder import FeaturePositionalEncoding 14 | 15 | 16 | def modulate(x, shift, scale): 17 | if x.is_nested: 18 | return x * (1 + scale) + shift 19 | else: 20 | return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) 21 | 22 | 23 | def convert_tensor_to_nested_tensor(tensor_list, in_nt_tensor): 24 | '''convert tensor to nested tensor''' 25 | batch_size = in_nt_tensor.size(0) 26 | out_nt_tensor = [] 27 | for tensor in tensor_list: 28 | nt_tensor = torch.nested.as_nested_tensor([tensor[i].unsqueeze(0).repeat(in_nt_tensor[i].shape[0], 1) for i in range(batch_size)]) 29 | out_nt_tensor.append(nt_tensor) 30 | 31 | return out_nt_tensor 32 | 33 | 34 | def restore_nested_tensor_to_tensor(nt_tensor, orig_shape, mask, value=0.): 35 | restored_tensor = torch.ones(orig_shape, dtype=nt_tensor.dtype, device=nt_tensor.device) * value 36 | for i, m in enumerate(mask): 37 | restored_tensor[i][m] = nt_tensor[i] 38 | 39 | return restored_tensor 40 | 41 | 42 | def full_to_packed(data, mask): 43 | seqlist = [seq[m] for seq, m in zip(data, mask)] 44 | seqlen = [seq.shape[0] for seq in seqlist] 45 | packed = torch.cat(seqlist, dim=0).unsqueeze(0) 46 | return packed, seqlen 47 | 48 | 49 | def packed_to_nested(data, seqlen): 50 | data = torch.nested.as_nested_tensor(list(data[0].split(seqlen, dim=0))) 51 | return data 52 | 53 | 54 | def nested_to_packed(data): 55 | seqlist = [seq for seq in data] 56 | seqlen = [seq.shape[0] for seq in seqlist] 57 | packed = torch.cat(seqlist, dim=0).unsqueeze(0) 58 | return packed, seqlen 59 | 60 | 61 | def packed_to_padded(data, seqlen, total=None, fill=0.): 62 | if total is None: 63 | total = max(seqlen) 64 | return torch.stack([ 65 | torch.cat([ 66 | seq, 67 | torch.full((total-seq.shape[0], *seq.shape[1:]), fill, dtype=seq.dtype, device=seq.device) 68 | ], dim=0) 69 | for seq in data[0].split(seqlen, dim=0) 70 | ], dim=0), seqlen 71 | 72 | 73 | def padded_to_packed(data, seqlen): 74 | return torch.cat([ 75 | seq[:l] for seq, l in zip(data, seqlen) 76 | ], dim=0).unsqueeze(0), seqlen 77 | 78 | 79 | def packed_to_full(data, mask, fill=0.): 80 | out = torch.full((*mask.shape, *data.shape[2:]), fill, dtype=data.dtype, device=data.device) 81 | out[mask] = data[0] 82 | return out, mask 83 | 84 | 85 | def full_to_padded(data, mask, total=None, fill=0.): 86 | return packed_to_padded(*full_to_packed(data, mask), total=total, fill=fill) 87 | 88 | 89 | def padded_to_full(data, seqlen, mask, fill=0.): 90 | return packed_to_full(padded_to_packed(data, seqlen)[0], mask, fill=fill) 91 | 92 | 93 | # modified from https://github.com/meta-llama/llama/blob/main/llama/model.py 94 | class RMSNorm(torch.nn.Module): 95 | def __init__(self, dim: int, eps: float = 1e-6): 96 | """ 97 | Initialize the RMSNorm normalization layer. 98 | 99 | Args: 100 | dim (int): The dimension of the input tensor. 101 | eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. 102 | 103 | Attributes: 104 | eps (float): A small value added to the denominator for numerical stability. 105 | weight (nn.Parameter): Learnable scaling parameter. 
106 | 107 | """ 108 | super().__init__() 109 | self.eps = eps 110 | self.dim = dim 111 | self.weight = nn.Parameter(torch.zeros(dim)) 112 | 113 | def _norm(self, x): 114 | """ 115 | Apply the RMSNorm normalization to the input tensor. 116 | 117 | Args: 118 | x (torch.Tensor): The input tensor. 119 | 120 | Returns: 121 | torch.Tensor: The normalized tensor. 122 | 123 | """ 124 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 125 | 126 | def forward(self, x): 127 | """ 128 | Forward pass through the RMSNorm layer. 129 | 130 | Args: 131 | x (torch.Tensor): The input tensor. 132 | 133 | Returns: 134 | torch.Tensor: The output tensor after applying RMSNorm. 135 | 136 | """ 137 | nested = False 138 | if x.is_nested: # NOTE assume seq dim is 1 139 | nested = True 140 | x, seqlen = nested_to_packed(x) 141 | x = self._norm(x) * (1 + self.weight) 142 | if nested: 143 | x = packed_to_nested(x, seqlen) 144 | return x 145 | 146 | 147 | class HolisticAttnProcessor: 148 | def __call__( 149 | self, 150 | attn: Attention, 151 | hidden_states: torch.Tensor, 152 | encoder_hidden_states: Optional[torch.Tensor] = None, 153 | attention_mask: Optional[torch.Tensor] = None, 154 | query_pos_s=None, key_pos_s=None, query_pos_r=None, key_pos_r=None, seqlen_q=None, seqlen_kv=None, 155 | ) -> torch.Tensor: 156 | query = hidden_states 157 | key = value = encoder_hidden_states 158 | 159 | if query_pos_s is not None: 160 | query = query + query_pos_s 161 | if key_pos_s is not None: 162 | key = key + key_pos_s 163 | 164 | query, _ = padded_to_packed(query, seqlen_q) 165 | key, _ = padded_to_packed(key, seqlen_kv) 166 | value, _ = padded_to_packed(value, seqlen_kv) 167 | 168 | q = attn.to_q(query) 169 | k = attn.to_k(key) 170 | v = attn.to_v(value) 171 | 172 | assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] 173 | inner_dim = k.shape[-1] 174 | head_dim = inner_dim // attn.heads 175 | 176 | q = q.reshape(1, -1, attn.heads, head_dim) # batch_size=1 because it's packed 177 | k = k.reshape(1, -1, attn.heads, head_dim) 178 | v = v.reshape(1, -1, attn.heads, head_dim) 179 | 180 | q = attn.norm_q(q).to(q.dtype) 181 | k = attn.norm_k(k).to(k.dtype) 182 | 183 | if query_pos_r is not None: 184 | query_pos_r, _ = padded_to_packed(query_pos_r, seqlen_q) 185 | q = FeaturePositionalEncoding.apply_rotary_emb(q, query_pos_r) 186 | if key_pos_r is not None: 187 | key_pos_r, _ = padded_to_packed(key_pos_r, seqlen_kv) 188 | k = FeaturePositionalEncoding.apply_rotary_emb(k, key_pos_r) 189 | 190 | x = xops.memory_efficient_attention( 191 | q, k, v, 192 | attn_bias=xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(seqlen_q, seqlen_kv), 193 | p=attn.attn_drop.p if attn.training and hasattr(attn, 'attn_drop') else 0., 194 | ) 195 | x = x.reshape(1, -1, inner_dim) 196 | x = attn.to_out[0](x) # linear 197 | x = attn.to_out[1](x) # dropout 198 | 199 | x, _ = packed_to_padded(x, seqlen_q, max(seqlen_q)) 200 | 201 | return x 202 | 203 | 204 | 205 | class FinalLayer(nn.Module): 206 | """ 207 | The final layer of DiT. 
208 | """ 209 | 210 | def __init__(self, hidden_size, patch_size, out_channels): 211 | super().__init__() 212 | self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 213 | self.linear = nn.Linear( 214 | hidden_size, patch_size * patch_size * out_channels, bias=True 215 | ) 216 | self.adaLN_modulation = nn.Sequential( 217 | nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True) 218 | ) 219 | 220 | def initialize_weights(self): 221 | nn.init.constant_(self.linear.weight, 0) 222 | nn.init.constant_(self.linear.bias, 0) 223 | nn.init.constant_(self.adaLN_modulation[-1].weight, 0) 224 | nn.init.constant_(self.adaLN_modulation[-1].bias, 0) 225 | 226 | def forward(self, x, c): 227 | shift, scale = self.adaLN_modulation(c).chunk(2, dim=1) 228 | x = modulate(self.norm_final(x), shift, scale) 229 | x = self.linear(x) 230 | return x 231 | 232 | 233 | class MultiLayerPatchEmbed(nn.Module): 234 | def __init__(self, img_size, patch_size, in_chans, embed_dim): 235 | super().__init__() 236 | assert patch_size in [2, 4, 6, 8, 16] 237 | self.img_size = img_size 238 | self.patch_size = patch_size 239 | self.in_chans = in_chans 240 | self.embed_dim = embed_dim 241 | 242 | n_down = round(math.log2(patch_size)) 243 | self.proj = [nn.Conv2d(in_chans, embed_dim, 3, 1, 1)] 244 | for i in range(n_down): 245 | self.proj.append(nn.SiLU(inplace=True)) 246 | self.proj.append(nn.Conv2d(embed_dim, embed_dim, 2, 2, 0)) 247 | self.proj.append(nn.SiLU(inplace=True)) 248 | self.proj.append(nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)) 249 | self.proj = nn.Sequential(*self.proj) 250 | 251 | def initialize_weights(self): 252 | for m in self.proj.modules(): 253 | if isinstance(m, nn.Conv2d): 254 | w = m.weight.data 255 | nn.init.xavier_uniform_(w.view([w.shape[0], -1])) 256 | nn.init.constant_(m.bias, 0) 257 | 258 | def forward(self, x): 259 | ''' 260 | x: [N, C, H, W] 261 | out: [N, C, H, W] 262 | ''' 263 | return self.proj(x) 264 | 265 | class MultiLayerFinalLayer(nn.Module): 266 | def __init__(self, hidden_size, patch_size, out_channels): 267 | super().__init__() 268 | assert patch_size in [2, 4, 6, 8, 16] 269 | self.hidden_size = hidden_size 270 | self.patch_size = patch_size 271 | self.out_channels = out_channels 272 | 273 | self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 274 | 275 | n_up = round(math.log2(patch_size)) 276 | self.proj = [] 277 | for i in range(n_up - 1): 278 | self.proj.append(nn.ConvTranspose2d(hidden_size, hidden_size, 2, 2, 0, 0)) 279 | self.proj.append(nn.SiLU(inplace=True)) 280 | self.proj.append(nn.Conv2d(hidden_size, hidden_size, 3, 1, 1)) 281 | self.proj.append(nn.SiLU(inplace=True)) 282 | self.proj.append(nn.ConvTranspose2d(hidden_size, out_channels, 2, 2, 0, 0)) 283 | self.proj = nn.Sequential(*self.proj) 284 | 285 | self.adaLN_modulation = nn.Sequential( 286 | nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True) 287 | ) 288 | 289 | def initialize_weights(self): 290 | nn.init.constant_(self.adaLN_modulation[-1].weight, 0) 291 | nn.init.constant_(self.adaLN_modulation[-1].bias, 0) 292 | for m in self.proj.modules(): 293 | if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)): 294 | w = m.weight.data 295 | nn.init.xavier_uniform_(w.view([w.shape[0], -1])) 296 | nn.init.constant_(m.bias, 0) 297 | nn.init.constant_(self.proj[-1].weight, 0) 298 | 299 | def forward(self, x, c): 300 | ''' 301 | x: [B, N, H, W, D] 302 | c: [B, D] 303 | out: [B, N, D, H, W] 304 | ''' 305 | shift, scale = self.adaLN_modulation(c).chunk(2, dim=1) 306 | 
seq_dims = x.shape[1:4] 307 | x = x.flatten(1, 3) # [B, L, D] 308 | x = modulate(self.norm_final(x), shift, scale) 309 | x = x.unflatten(1, seq_dims) # [B, N, H, W, D] 310 | x = x.permute(0, 1, 4, 2, 3) # [B, N, D, H, W] 311 | batch_dims = x.shape[0:2] 312 | x = x.flatten(0, 1) # [BN, D, H, W] 313 | x = self.proj(x) 314 | return x.unflatten(0, batch_dims) 315 | -------------------------------------------------------------------------------- /model/utils/normalize.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | """ 7 | Adapted from code originally written by David Novotny. 8 | """ 9 | 10 | import ipdb # noqa: F401 11 | import torch 12 | from pytorch3d.transforms import Rotate, Translate 13 | 14 | 15 | def intersect_skew_line_groups(p, r, mask): 16 | # p, r both of shape (B, N, n_intersected_lines, 3) 17 | # mask of shape (B, N, n_intersected_lines) 18 | p_intersect, r = intersect_skew_lines_high_dim(p, r, mask=mask) 19 | if p_intersect is None: 20 | return None, None, None, None 21 | _, p_line_intersect = point_line_distance( 22 | p, r, p_intersect[..., None, :].expand_as(p) 23 | ) 24 | intersect_dist_squared = ((p_line_intersect - p_intersect[..., None, :]) ** 2).sum( 25 | dim=-1 26 | ) 27 | return p_intersect, p_line_intersect, intersect_dist_squared, r 28 | 29 | 30 | def intersect_skew_lines_high_dim(p, r, mask=None): 31 | # Implements https://en.wikipedia.org/wiki/Skew_lines In more than two dimensions 32 | dim = p.shape[-1] 33 | # make sure the heading vectors are l2-normed 34 | if mask is None: 35 | mask = torch.ones_like(p[..., 0]) 36 | r = torch.nn.functional.normalize(r, dim=-1) 37 | 38 | eye = torch.eye(dim, device=p.device, dtype=p.dtype)[None, None] 39 | I_min_cov = (eye - (r[..., None] * r[..., None, :])) * mask[..., None, None] 40 | sum_proj = I_min_cov.matmul(p[..., None]).sum(dim=-3) 41 | 42 | # I_eps = torch.zeros_like(I_min_cov.sum(dim=-3)) + 1e-10 43 | # p_intersect = torch.pinverse(I_min_cov.sum(dim=-3) + I_eps).matmul(sum_proj)[..., 0] 44 | p_intersect = torch.linalg.lstsq(I_min_cov.sum(dim=-3), sum_proj).solution[..., 0] 45 | 46 | # I_min_cov.sum(dim=-3): torch.Size([1, 1, 3, 3]) 47 | # sum_proj: torch.Size([1, 1, 3, 1]) 48 | 49 | # p_intersect = np.linalg.lstsq(I_min_cov.sum(dim=-3).numpy(), sum_proj.numpy(), rcond=None)[0] 50 | 51 | if torch.any(torch.isnan(p_intersect)): 52 | print(p_intersect) 53 | return None, None 54 | ipdb.set_trace() 55 | assert False 56 | return p_intersect, r 57 | 58 | 59 | def point_line_distance(p1, r1, p2): 60 | df = p2 - p1 61 | proj_vector = df - ((df * r1).sum(dim=-1, keepdim=True) * r1) 62 | line_pt_nearest = p2 - proj_vector 63 | d = (proj_vector).norm(dim=-1) 64 | return d, line_pt_nearest 65 | 66 | 67 | def compute_optical_axis_intersection(cameras): 68 | centers = cameras.get_camera_center() 69 | principal_points = cameras.principal_point 70 | 71 | one_vec = torch.ones((len(cameras), 1), device=centers.device) 72 | optical_axis = torch.cat((principal_points, one_vec), -1) 73 | 74 | # optical_axis = torch.cat( 75 | # (principal_points, cameras.focal_length[:, 0].unsqueeze(1)), -1 76 | # ) 77 | 78 | pp = cameras.unproject_points(optical_axis, from_ndc=True, world_coordinates=True) 79 | pp2 = torch.diagonal(pp, dim1=0, dim2=1).T 80 | 81 | directions = pp2 - centers 82 | centers = centers.unsqueeze(0).unsqueeze(0) 83 | directions = directions.unsqueeze(0).unsqueeze(0) 84 | 
85 | p_intersect, p_line_intersect, _, r = intersect_skew_line_groups( 86 | p=centers, r=directions, mask=None 87 | ) 88 | 89 | if p_intersect is None: 90 | dist = None 91 | else: 92 | p_intersect = p_intersect.squeeze().unsqueeze(0) 93 | dist = (p_intersect - centers).norm(dim=-1) 94 | 95 | return p_intersect, dist, p_line_intersect, pp2, r 96 | 97 | class IntersectionException(Exception): 98 | pass 99 | 100 | def normalize_cameras(cameras, scale=1.0, add_cameras=False): 101 | """ 102 | Normalizes cameras such that the optical axes point to the origin, the rotation is 103 | identity, and the norm of the translation of the first camera is 1. 104 | 105 | Args: 106 | cameras (pytorch3d.renderer.cameras.CamerasBase). 107 | scale (float): Norm of the translation of the first camera. 108 | 109 | Returns: 110 | new_cameras (pytorch3d.renderer.cameras.CamerasBase): Normalized cameras. 111 | undo_transform (function): Function that undoes the normalization. 112 | """ 113 | 114 | # Let distance from first camera to origin be unit 115 | new_cameras = cameras.clone() 116 | new_transform = ( 117 | new_cameras.get_world_to_view_transform() 118 | ) # potential R is not valid matrix 119 | p_intersect, dist, p_line_intersect, pp, r = compute_optical_axis_intersection( 120 | cameras 121 | ) 122 | 123 | if p_intersect is None: 124 | raise IntersectionException 125 | 126 | d = dist.squeeze(dim=1).squeeze(dim=0)[0] 127 | # Degenerate case 128 | if d == 0: 129 | # print(cameras.T) 130 | # print(new_transform.get_matrix()[:, 3, :3]) 131 | raise IntersectionException 132 | 133 | # Can't figure out how to make scale part of the transform too without messing up R. 134 | # Ideally, we would just wrap it all in a single Pytorch3D transform so that it 135 | # would work with any structure (eg PointClouds, Meshes). 136 | tR = Rotate(new_cameras.R[0].unsqueeze(0)).inverse() 137 | tT = Translate(p_intersect) 138 | t = tR.compose(tT) 139 | 140 | new_transform2 = t.compose(new_transform) # = t.get_matrix() @ new_transform.get_matrix() 141 | new_cameras.R = new_transform2.get_matrix()[:, :3, :3] 142 | new_cameras.T = new_transform2.get_matrix()[:, 3, :3] / d * scale 143 | 144 | scene_scale = scale / d 145 | 146 | def undo_transform(cameras): 147 | cameras_copy = cameras.clone() 148 | cameras_copy.T *= d / scale 149 | new_t = ( 150 | t.inverse().compose(cameras_copy.get_world_to_view_transform()).get_matrix() 151 | ) 152 | cameras_copy.R = new_t[:, :3, :3] 153 | cameras_copy.T = new_t[:, 3, :3] 154 | return cameras_copy 155 | 156 | if add_cameras: 157 | return new_cameras, undo_transform, scene_scale, t 158 | else: 159 | return new_cameras, undo_transform, scene_scale 160 | 161 | 162 | def normalize_cameras_inference(cameras, scale=1.0): 163 | """ 164 | Normalizes cameras such that the optical axes point to the origin, the rotation is 165 | identity, and the norm of the translation of the first camera is 1. 166 | 167 | Args: 168 | cameras (pytorch3d.renderer.cameras.CamerasBase). 169 | scale (float): Norm of the translation of the first camera. 170 | 171 | Returns: 172 | new_cameras (pytorch3d.renderer.cameras.CamerasBase): Normalized cameras. 173 | undo_transform (function): Function that undoes the normalization. 
174 | """ 175 | 176 | # Let distance from first camera to origin be unit 177 | new_cameras = cameras.clone() 178 | new_transform = ( 179 | new_cameras.get_world_to_view_transform() 180 | ) # potential R is not valid matrix 181 | 182 | # Can't figure out how to make scale part of the transform too without messing up R. 183 | # Ideally, we would just wrap it all in a single Pytorch3D transform so that it 184 | # would work with any structure (eg PointClouds, Meshes). 185 | tR = Rotate(new_cameras.R[0].unsqueeze(0)).inverse() 186 | d = new_cameras.T[0].norm() 187 | T = torch.Tensor([0., 0., 1.]) - new_cameras.T[0] 188 | tT = Translate(T.unsqueeze(0)) 189 | t = tR.compose(tT) 190 | 191 | new_transform = t.compose(new_transform) 192 | new_cameras.R = new_transform.get_matrix()[:, :3, :3] 193 | new_cameras.T = new_transform.get_matrix()[:, 3, :3] / d * scale 194 | 195 | def undo_transform(cameras): 196 | cameras_copy = cameras.clone() 197 | cameras_copy.T *= d / scale 198 | new_t = ( 199 | t.inverse().compose(cameras_copy.get_world_to_view_transform()).get_matrix() 200 | ) 201 | cameras_copy.R = new_t[:, :3, :3] 202 | cameras_copy.T = new_t[:, 3, :3] 203 | return cameras_copy 204 | 205 | return new_cameras, undo_transform 206 | 207 | 208 | def first_camera_transform(cameras, rotation_only=True): 209 | new_cameras = cameras.clone() 210 | new_transform = new_cameras.get_world_to_view_transform() 211 | tR = Rotate(new_cameras.R[0].unsqueeze(0)) 212 | if rotation_only: 213 | t = tR.inverse() 214 | else: 215 | tT = Translate(new_cameras.T[0].unsqueeze(0)) 216 | t = tR.compose(tT).inverse() 217 | 218 | new_transform = t.compose(new_transform) 219 | new_cameras.R = new_transform.get_matrix()[:, :3, :3] 220 | new_cameras.T = new_transform.get_matrix()[:, 3, :3] 221 | 222 | return new_cameras 223 | -------------------------------------------------------------------------------- /pipeline_depth_prediction.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import cv2 7 | import argparse 8 | import numpy as np 9 | import torch 10 | import trimesh 11 | from omegaconf import OmegaConf 12 | from model.load import load_model 13 | from utils.train_utils import model_inference 14 | from utils.data_utils import DataHandler, get_rgbd_point_cloud_numpy 15 | from data import Preprocessor 16 | 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser(description="Simple example of a test script.") 20 | parser.add_argument("--gpu", type=int, default=0, help="which GPU to use.") 21 | parser.add_argument("--exp_name", type=str, default='depth-prediction') 22 | parser.add_argument("--data_path", type=str, default=None, help="examples/co3dv2-samples/31_1359_4114") 23 | parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"]) 24 | parser.add_argument("--config", type=str, default="configs/config_stage3.yaml", help="Path to training config yaml file.") 25 | parser.add_argument("--checkpoint_path", type=str, default=None) 26 | parser.add_argument("--guidance_scale", type=float, default=1.0, help="inference cfg. 
1.0 means classifier-free guidance is disabled") 27 | args = parser.parse_args() 28 | 29 | # Make experiment directory 30 | exp_folder = f'./results/{args.exp_name}' 31 | os.makedirs(exp_folder, exist_ok=True) 32 | 33 | # Set cuda and mixed precision 34 | device = torch.device(f"cuda:{args.gpu}") 35 | if args.mixed_precision == "no": 36 | weight_dtype = torch.float32 37 | elif args.mixed_precision == "fp16": 38 | weight_dtype = torch.float16 39 | elif args.mixed_precision == "bf16": 40 | weight_dtype = torch.bfloat16 41 | 42 | # Load config and model 43 | cfg = OmegaConf.load(args.config) 44 | models = load_model(cfg, args.checkpoint_path, device=device, weight_dtype=weight_dtype) 45 | 46 | # Load data pre-processor 47 | preprocessor = Preprocessor(cfg) 48 | 49 | # Get data 50 | data = preprocessor(args.data_path, input_type='multi-view') 51 | data_handler = DataHandler(data) 52 | 53 | # Hyper-parameters setting & Mod flag editing 54 | # In this example, 3 views (id = 0, 1, 2) are used for inference 55 | # Mod flags such as 'cggggggg,cccccccc,xxxxxxxx' encode the states of 'rgb,pose,depth', 56 | # where 'c' denotes condition, 'g' denotes generation, and 'x' denotes not used 57 | # The i-th letter of each modality string gives the state of view i 58 | # e.g., 'ccggx' marks views 0-1 as condition, views 2-3 as generation, and view 4 as not used 59 | # Letters beyond the number of used views are ignored automatically 60 | used_view_ids = torch.arange(3) 61 | mod_flags = 'cccccccc,cccccccc,gggggggg' 62 | 63 | # Set random seed 64 | SEED = np.random.randint(0, 2147483647) 65 | 66 | # Inference 67 | np_image, pred_rgb, rgb_mask, pred_ray, ray_mask, pred_depth, depth_mask, mmod_preds, batch = \ 68 | model_inference(models, data_handler, used_view_ids, mod_flags, preprocessor, cfg, args, device, weight_dtype, guidance_scale=args.guidance_scale, seed=SEED) 69 | 70 | # Write paired visualizations 71 | num_view = len(used_view_ids) 72 | gt_part = np_image[0][:512*num_view, :512*4] 73 | pred_part = np_image[0][-512*num_view:, 512*3:512*4] 74 | # from left to right: gt_rgb - gt_pose (dir + mom) - gt_depth - pred_depth 75 | concat_images = np.concatenate([gt_part, pred_part], axis=1) 76 | file_name = f"{data_handler('scene_id')}-{SEED}-compare.png" 77 | cv2.imwrite(os.path.join(exp_folder, file_name), concat_images[..., ::-1]) 78 | 79 | # Back-project depth images to point clouds 80 | camera = data_handler('gt_pyt3d_camera')[0][used_view_ids] 81 | mask = depth_mask[0].cpu() 82 | gt_images = data_handler('gen_image')[0][used_view_ids][mask] * 0.5 + 0.5 83 | # write predictions 84 | pred_depths = 1.0 / mmod_preds['gens']['depth'][0].cpu()[:, 0:1] 85 | pred_points, pred_colors = get_rgbd_point_cloud_numpy(camera, gt_images, pred_depths) 86 | output_path = os.path.join(exp_folder, f"{data_handler('scene_id')}-{SEED}-depth-pred.ply") 87 | combined_ply = trimesh.PointCloud(pred_points, pred_colors * 255) 88 | _ = combined_ply.export(output_path) 89 | # write ground truths 90 | gt_depths = 1.0 / data_handler('gen_depth')[0][used_view_ids][mask].cpu() 91 | gt_depth_masks = torch.logical_and(gt_depths > 0, ~torch.isinf(gt_depths)).to(gt_depths) 92 | gt_points, gt_colors = get_rgbd_point_cloud_numpy(camera, gt_images, gt_depths, depth_masks=gt_depth_masks, mask_thr=0.5) 93 | output_path = os.path.join(exp_folder, f"{data_handler('scene_id')}-depth-gt.ply") 94 | combined_ply = trimesh.PointCloud(gt_points, gt_colors * 255) 95 | _ = combined_ply.export(output_path) --------------------------------------------------------------------------------
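The mod-flag convention described in the comments of the pipelines is implemented by DataHandler.mod_flags_update in utils/data_utils.py (shown further below). The following is a minimal standalone sketch of that mapping, not code from the repository; the helper name parse_mod_flags is illustrative only.

MOD_FLAG_TABLE = {'c': 0, 'g': 1, 'x': -1}  # condition / generation / not used

def parse_mod_flags(mod_flags: str, num_view: int) -> dict:
    # Split the flag string into the 'rgb', 'ray' (pose) and 'depth' modalities,
    # keep only the first num_view letters of each, and map them through MOD_FLAG_TABLE.
    states = {}
    for mod_name, flags in zip(['rgb', 'ray', 'depth'], mod_flags.split(',')):
        per_view = [MOD_FLAG_TABLE[f] for f in flags[:num_view]]
        if mod_name == 'ray':
            per_view[0] = MOD_FLAG_TABLE['c']  # the first view's pose is always a condition
        states[mod_name] = per_view
    return states

# Depth-prediction setting above: 3 views, RGB and pose conditioned, depth generated.
print(parse_mod_flags('cccccccc,cccccccc,gggggggg', num_view=3))
# -> {'rgb': [0, 0, 0], 'ray': [0, 0, 0], 'depth': [1, 1, 1]}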
/pipeline_novel_view_synthesis.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import cv2 7 | import argparse 8 | import numpy as np 9 | import torch 10 | from omegaconf import OmegaConf 11 | from model.load import load_model 12 | from utils.train_utils import model_inference 13 | from utils.data_utils import DataHandler 14 | from data import Preprocessor 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser(description="Simple example of a test script.") 19 | parser.add_argument("--gpu", type=int, default=0, help="which GPU to use.") 20 | parser.add_argument("--exp_name", type=str, default='novel-view-synthesis') 21 | parser.add_argument("--data_path", type=str, default=None, help="examples/co3dv2-samples/31_1359_4114") 22 | parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"]) 23 | parser.add_argument("--config", type=str, default="configs/config_stage3.yaml", help="Path to training config yaml file.") 24 | parser.add_argument("--checkpoint_path", type=str, default=None) 25 | parser.add_argument("--guidance_scale", type=float, default=1.5, help="inference cfg. 1.0 means classifier-free guidance is disabled") 26 | args = parser.parse_args() 27 | 28 | # Make experiment directory 29 | exp_folder = f'./results/{args.exp_name}' 30 | os.makedirs(exp_folder, exist_ok=True) 31 | 32 | # Set cuda and mixed precision 33 | device = torch.device(f"cuda:{args.gpu}") 34 | if args.mixed_precision == "no": 35 | weight_dtype = torch.float32 36 | elif args.mixed_precision == "fp16": 37 | weight_dtype = torch.float16 38 | elif args.mixed_precision == "bf16": 39 | weight_dtype = torch.bfloat16 40 | 41 | # Load config and model 42 | cfg = OmegaConf.load(args.config) 43 | models = load_model(cfg, args.checkpoint_path, device=device, weight_dtype=weight_dtype) 44 | 45 | # Load data pre-processor 46 | preprocessor = Preprocessor(cfg) 47 | 48 | # Get data 49 | data = preprocessor(args.data_path, input_type='multi-view') 50 | data_handler = DataHandler(data) 51 | 52 | # Hyper-parameters setting & Mod flag editing 53 | # In this example, 4 views (id = 0, 1, 2, 3) are used for inference 54 | # Mod flags such as 'cggggggg,cccccccc,xxxxxxxx' encode the states of 'rgb,pose,depth', 55 | # where 'c' denotes condition, 'g' denotes generation, and 'x' denotes not used 56 | # The i-th letter of each modality string gives the state of view i 57 | # e.g., 'ccggx' marks views 0-1 as condition, views 2-3 as generation, and view 4 as not used 58 | # Letters beyond the number of used views are ignored automatically 59 | used_view_ids = torch.arange(4) 60 | mod_flags = 'cggggggg,cccccccc,cccccccc' 61 | 62 | # Set random seed 63 | SEED = np.random.randint(0, 2147483647) 64 | 65 | # Inference 66 | np_image, pred_rgb, rgb_mask, pred_ray, ray_mask, pred_depth, depth_mask, mmod_preds, batch = \ 67 | model_inference(models, data_handler, used_view_ids, mod_flags, preprocessor, cfg, args, device, weight_dtype, guidance_scale=args.guidance_scale, seed=SEED) 68 | 69 | # Write paired visualizations 70 | num_view = len(used_view_ids) 71 | gt_part = np_image[0][:512*num_view, :512*3] 72 | pred_part = np_image[0][-512*num_view:, :512] 73 | # from left to right: gt_rgb - gt_pose (dir + mom) - pred_rgb 74 | concat_images = np.concatenate([gt_part, pred_part], axis=1) 75 | file_name = f"{data_handler('scene_id')}-{SEED}-compare.png" 76 |
cv2.imwrite(os.path.join(exp_folder, file_name), concat_images[..., ::-1]) -------------------------------------------------------------------------------- /pipeline_pose_estimation.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import io 7 | import cv2 8 | import base64 9 | import plotly 10 | import argparse 11 | import numpy as np 12 | import torch 13 | import matplotlib 14 | import matplotlib.pyplot as plt 15 | from omegaconf import OmegaConf 16 | from pytorch3d.renderer import PerspectiveCameras 17 | from pytorch3d.vis.plotly_vis import plot_scene 18 | from model.load import load_model 19 | from model.utils.rays import Rays, rays_to_cameras_homography 20 | from utils.train_utils import model_inference 21 | from utils.data_utils import DataHandler, tensor_recursive_to 22 | from utils.vis import view_color_coded_images_from_tensor 23 | from data import Preprocessor 24 | 25 | 26 | HTML_TEMPLATE = """ 27 | 28 | {plotly_html}""" 29 | 30 | 31 | def plotly_scene_visualization_dual(pred_camera, gt_camera, scale=0.03): 32 | num_frames = len(pred_camera) 33 | camera = {} 34 | R_pred, T_pred = pred_camera.R, pred_camera.T 35 | for i in range(num_frames): 36 | camera[i] = PerspectiveCameras(R=R_pred[i, None], T=T_pred[i, None]) 37 | if gt_camera is not None: 38 | R_gt, T_gt = gt_camera.R, gt_camera.T 39 | for i in range(num_frames): 40 | camera[i + num_frames] = PerspectiveCameras(R=R_gt[i, None], T=T_gt[i, None]) 41 | 42 | fig = plot_scene( 43 | {"scene": camera}, 44 | camera_scale=scale, 45 | ) 46 | fig.update_scenes(aspectmode="data") 47 | 48 | cmap = plt.get_cmap("hsv") 49 | for i in range(num_frames): 50 | fig.data[i].line.color = matplotlib.colors.to_hex(cmap(i / (num_frames))) 51 | if gt_camera is not None: 52 | for i in range(num_frames): 53 | fig.data[i + num_frames].line.color = matplotlib.colors.to_hex((0.0, 0.0, 0.0, 1.0)) 54 | return fig 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser(description="Simple example of a test script.") 59 | parser.add_argument("--gpu", type=int, default=0, help="which GPU to use.") 60 | parser.add_argument("--exp_name", type=str, default='pose-estimation') 61 | parser.add_argument("--data_path", type=str, default=None, help="examples/co3dv2-samples/31_1359_4114") 62 | parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"]) 63 | parser.add_argument("--config", type=str, default="configs/config_stage3.yaml", help="Path to training config yaml file.") 64 | parser.add_argument("--checkpoint_path", type=str, default=None) 65 | parser.add_argument("--guidance_scale", type=float, default=1.5, help="inference cfg. 
1.0 means classifier-free guidance is disabled") 66 | parser.add_argument("--default_fov", type=float, default=60.0) 67 | args = parser.parse_args() 68 | 69 | # Make experiment directory 70 | exp_folder = f'./results/{args.exp_name}' 71 | os.makedirs(exp_folder, exist_ok=True) 72 | 73 | # Set cuda and mixed precision 74 | device = torch.device(f"cuda:{args.gpu}") 75 | if args.mixed_precision == "no": 76 | weight_dtype = torch.float32 77 | elif args.mixed_precision == "fp16": 78 | weight_dtype = torch.float16 79 | elif args.mixed_precision == "bf16": 80 | weight_dtype = torch.bfloat16 81 | 82 | # Load config and model 83 | cfg = OmegaConf.load(args.config) 84 | models = load_model(cfg, args.checkpoint_path, device=device, weight_dtype=weight_dtype) 85 | 86 | # Load data pre-processor 87 | preprocessor = Preprocessor(cfg, fov=args.default_fov) 88 | 89 | # Get data 90 | data = preprocessor(args.data_path, input_type='multi-view') 91 | data_handler = DataHandler(data) 92 | 93 | # Hyper-parameters setting & Mod flag editing 94 | # In this example, 8 views (id = 0, 1, 2, 3, 4, 5, 6, 7) are used for inference 95 | # Mod flags such as 'cggggggg,cccccccc,xxxxxxxx' encode the states of 'rgb,pose,depth', 96 | # where 'c' denotes condition, 'g' denotes generation, and 'x' denotes not used 97 | # The i-th letter of each modality string gives the state of view i 98 | # e.g., 'ccggx' marks views 0-1 as condition, views 2-3 as generation, and view 4 as not used 99 | # Letters beyond the number of used views are ignored automatically 100 | used_view_ids = torch.arange(8) 101 | mod_flags = 'cccccccc,cggggggg,xxxxxxxx' 102 | 103 | # Set random seed 104 | SEED = np.random.randint(0, 2147483647) 105 | 106 | # Inference 107 | np_image, pred_rgb, rgb_mask, pred_ray, ray_mask, pred_depth, depth_mask, mmod_preds, batch = \ 108 | model_inference(models, data_handler, used_view_ids, mod_flags, preprocessor, cfg, args, device, weight_dtype, guidance_scale=args.guidance_scale, seed=SEED) 109 | 110 | # Write paired visualizations 111 | num_view = len(used_view_ids) 112 | gt_part = np_image[0][:512*num_view, :512*3] 113 | pred_part = np_image[0][-512*num_view:, 512:512*3] 114 | # from left to right: gt_rgb - gt_pose (dir + mom) - pred_pose (dir + mom) 115 | concat_images = np.concatenate([gt_part, pred_part], axis=1) 116 | file_name = f"{data_handler('scene_id')}-{SEED}-compare.png" 117 | cv2.imwrite(os.path.join(exp_folder, file_name), concat_images[..., ::-1]) 118 | 119 | # Save camera visualization html following RayDiffusion 120 | gt_camera = data_handler('gt_pyt3d_camera')[0][used_view_ids] if data_handler('gt_pyt3d_camera') else None 121 | gt_rays = data_handler('cond_rays')[0][used_view_ids].float().cpu() 122 | pred_ray = mmod_preds['gens']['ray'][0].float().cpu() 123 | pred_ray[0] = gt_rays[0] 124 | # create camera from rays 125 | pred_camera = rays_to_cameras_homography( 126 | Rays.from_spatial(pred_ray), 127 | crop_parameters=None, 128 | num_patches_x=cfg.data.raymap_size, 129 | num_patches_y=cfg.data.raymap_size, 130 | ) 131 | fig = plotly_scene_visualization_dual(pred_camera, gt_camera, scale=0.1) 132 | output_path = os.path.join(exp_folder, f"{data_handler('scene_id')}-{SEED}-cameras-vis.html") 133 | html_plot = plotly.io.to_html(fig, full_html=False, include_plotlyjs="cdn") 134 | s = io.BytesIO() 135 | images = torch.nn.functional.interpolate(data_handler('cond_image')[0][used_view_ids], size=(128, 128), mode='bilinear', align_corners=False).permute(0, 2, 3, 1) 136 | view_color_coded_images_from_tensor(images) 137 | plt.savefig(s, format="png", 
bbox_inches="tight") 138 | plt.close() 139 | image_encoded = base64.b64encode(s.getvalue()).decode("utf-8").replace("\n", "") 140 | with open(output_path, "w") as f: 141 | s = HTML_TEMPLATE.format( 142 | image_encoded=image_encoded, 143 | plotly_html=html_plot, 144 | ) 145 | f.write(s) 146 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ipdb 2 | imageio 3 | imageio-ffmpeg 4 | matplotlib 5 | numpy==1.26 6 | scipy 7 | omegaconf 8 | opencv-python 9 | plotly 10 | transformers 11 | tensorboard 12 | open3d 13 | sentencepiece 14 | deepspeed 15 | torchtyping 16 | diffusers==0.31.0 17 | accelerate 18 | scikit-image 19 | torchmetrics 20 | git+https://github.com/NVlabs/tiny-cuda-nn#subdirectory=bindings/torch 21 | nerfstudio @ git+https://github.com/nerfstudio-project/nerfstudio@fc4fc5cb15ad994ea82d8c651c9d42172d890de1 22 | -------------------------------------------------------------------------------- /scripts/depth_prediction.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 5 | # 6 | data_path=$1 7 | 8 | CUDA_VISIBLE_DEVICES=0 python pipeline_depth_prediction.py \ 9 | --config configs/config_stage3.yaml \ 10 | --data_path $data_path \ 11 | --mixed_precision fp16 \ 12 | --guidance_scale 1.0 \ 13 | --checkpoint_path checkpoints/matrix3d_512.pt 14 | -------------------------------------------------------------------------------- /scripts/novel_view_synthesis.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 5 | # 6 | data_path=$1 7 | 8 | CUDA_VISIBLE_DEVICES=0 python pipeline_novel_view_synthesis.py \ 9 | --config configs/config_stage3.yaml \ 10 | --data_path $data_path \ 11 | --mixed_precision fp16 \ 12 | --guidance_scale 1.5 \ 13 | --checkpoint_path checkpoints/matrix3d_512.pt 14 | -------------------------------------------------------------------------------- /scripts/pose_estimation.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 5 | # 6 | data_path=$1 7 | 8 | CUDA_VISIBLE_DEVICES=0 python pipeline_pose_estimation.py \ 9 | --config configs/config_stage3.yaml \ 10 | --data_path $data_path \ 11 | --mixed_precision fp16 \ 12 | --guidance_scale 1.5 \ 13 | --checkpoint_path checkpoints/matrix3d_512.pt 14 | -------------------------------------------------------------------------------- /scripts/single_view_to_3d.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
5 | # 6 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 7 | REPO_DIR=$(dirname "$SCRIPT_DIR") 8 | export NERFSTUDIO_METHOD_CONFIGS="splatfacto_matrix3d=splatfacto_matrix3d.splatfacto_configs:splatfacto_method" 9 | export PYTHONPATH=$PYTHONPATH:$REPO_DIR 10 | 11 | EXP_NAME=$1 12 | INPUT_PATH=$2 13 | NAME_EXT=$(basename "$INPUT_PATH") 14 | NAME="${NAME_EXT%.*}" 15 | 16 | ### Step 1: Generation: Create novel view observations 17 | CUDA_VISIBLE_DEVICES=0 python pipeline_single_to_3d.py \ 18 | --config configs/config_stage3.yaml \ 19 | --exp_name $EXP_NAME \ 20 | --data_path $INPUT_PATH \ 21 | --default_fov 60 \ 22 | --num_samples 80 \ 23 | --checkpoint_path checkpoints/matrix3d_512.pt \ 24 | --mixed_precision fp16 \ 25 | --random_seed 1 26 | 27 | 28 | ### Step 2: Reconstruction: 3DGS optimization 29 | cd results/$EXP_NAME/$NAME 30 | 31 | # 1. optimization 32 | ITERS=1200 33 | NUM_IMG=10 34 | ns-train splatfacto_matrix3d \ 35 | --data transforms_train.json \ 36 | --mixed-precision False \ 37 | --output-dir outputs \ 38 | --timestamp exps \ 39 | --viewer.quit-on-train-completion True \ 40 | --max-num-iterations $ITERS \ 41 | --steps-per-save 1000 \ 42 | --pipeline.model.num-downscales -1 \ 43 | --pipeline.model.resolution-schedule 1000 \ 44 | --pipeline.datamanager.max-num-iterations $ITERS \ 45 | --pipeline.datamanager.num_image_each_iteration $NUM_IMG \ 46 | --pipeline.model.background-color white \ 47 | --pipeline.model.warmup-length 200 \ 48 | --pipeline.model.densify-grad-thresh 0.0008 \ 49 | --pipeline.model.cull-alpha-thresh 0.05 \ 50 | --pipeline.model.cull-scale-thresh 0.5 \ 51 | --pipeline.model.cull-screen-size 0.5 \ 52 | --pipeline.model.reset-alpha-every 20 \ 53 | --pipeline.model.refine-every 50 \ 54 | --pipeline.model.use_scale_regularization True \ 55 | --pipeline.model.max-gauss-ratio 3 \ 56 | --pipeline.model.stop-screen-size-at 4000 \ 57 | --pipeline.model.stop-split-at 1000 \ 58 | --pipeline.model.sh-degree 2 \ 59 | --pipeline.model.sh-degree-interval 500 \ 60 | --pipeline.model.full-accumulation-lambda 0.0 \ 61 | --pipeline.model.accumulation-lambda 5.0 \ 62 | --pipeline.model.mask_lambda 5.0 \ 63 | --pipeline.model.ssim-lambda 0.2 \ 64 | --pipeline.model.lpips-lambda 10.0 \ 65 | --pipeline.model.l1-lambda-on-captured-views 20.0 \ 66 | --pipeline.model.l1-lambda-on-generation-views 1.0 \ 67 | --pipeline.model.apply-annealing False \ 68 | --pipeline.model.rasterize-mode antialiased \ 69 | --pipeline.model.use-absgrad False \ 70 | --pipeline.model.lpips-downsample 1 \ 71 | --pipeline.model.lpips-min-img-size 128 \ 72 | --pipeline.model.lpips-patch-size 512 \ 73 | --pipeline.model.lpips-no-resize True \ 74 | --pipeline.model.depth-l1-lambda 10.0 \ 75 | --pipeline.model.depth-ranking-lambda 10.0 \ 76 | --pipeline.model.output-depth-during-training True \ 77 | --pipeline.model.use-bilateral-grid False \ 78 | nerfstudio-data --center-method none --orientation-method none --auto-scale-poses False --train-split-fraction 1.0 --load-3D-points True --depth-unit-scale-factor 1.0 79 | # 2. use ns-render to render frames 80 | ns-render dataset --load-config outputs/splatfacto_matrix3d/exps/config.yml --image-format png --split=train --output-path renders 81 | # 3. write frames into videos 82 | python $REPO_DIR/utils/write_videos.py --render_root renders --type object -------------------------------------------------------------------------------- /scripts/unposed_fewshot_to_3d_arkitscenes.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 5 | # 6 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 7 | REPO_DIR=$(dirname "$SCRIPT_DIR") 8 | export NERFSTUDIO_METHOD_CONFIGS="splatfacto_matrix3d=splatfacto_matrix3d.splatfacto_configs:splatfacto_method" 9 | export PYTHONPATH=$PYTHONPATH:$REPO_DIR 10 | 11 | EXP_NAME=$1 12 | INPUT_PATH=$2 13 | NAME_EXT=$(basename "$INPUT_PATH") 14 | NAME="${NAME_EXT%.*}" 15 | 16 | ### Step 1: Generation: Create novel view observations 17 | CUDA_VISIBLE_DEVICES=0 python pipeline_unposed_few_shot_to_3d.py \ 18 | --config configs/config_stage3.yaml \ 19 | --exp_name $EXP_NAME \ 20 | --data_path $INPUT_PATH \ 21 | --spline_scales 3 \ 22 | --num_samples 80 \ 23 | --num_depth_runs_for_init_depth 15 \ 24 | --checkpoint_path checkpoints/matrix3d_512.pt \ 25 | --mixed_precision fp16 \ 26 | --random_seed 1 \ 27 | --use_loop_traj 1 \ 28 | --dataset arkitscenes 29 | 30 | 31 | ### Step 2: Reconstruction: 3DGS optimization 32 | cd results/$EXP_NAME/$NAME 33 | 34 | # 1. optimization 35 | ITERS=3000 36 | NUM_IMG=5 37 | ns-train splatfacto_matrix3d \ 38 | --data transforms_train.json \ 39 | --mixed-precision False \ 40 | --output-dir outputs \ 41 | --timestamp exps \ 42 | --viewer.quit-on-train-completion True \ 43 | --max-num-iterations $ITERS \ 44 | --steps-per-save 1000 \ 45 | --pipeline.model.num-downscales 0 \ 46 | --pipeline.model.resolution-schedule 500 \ 47 | --pipeline.datamanager.max-num-iterations $ITERS \ 48 | --pipeline.datamanager.num_image_each_iteration $NUM_IMG \ 49 | --pipeline.model.warmup-length 500 \ 50 | --pipeline.model.densify-grad-thresh 0.0008 \ 51 | --pipeline.model.cull-alpha-thresh 0.2 \ 52 | --pipeline.model.cull-scale-thresh 0.5 \ 53 | --pipeline.model.cull-screen-size 0.5 \ 54 | --pipeline.model.reset-alpha-every 15 \ 55 | --pipeline.model.refine-every 100 \ 56 | --pipeline.model.use_scale_regularization True \ 57 | --pipeline.model.max-gauss-ratio 6 \ 58 | --pipeline.model.apply-annealing False \ 59 | --pipeline.model.stop-screen-size-at 4000 \ 60 | --pipeline.model.stop-split-at 2000 \ 61 | --pipeline.model.sh-degree 3 \ 62 | --pipeline.model.sh-degree-interval 800 \ 63 | --pipeline.model.accumulation-lambda 0.5 \ 64 | --pipeline.model.full-accumulation-lambda 5.0 \ 65 | --pipeline.model.start-full-accumulation 1500 \ 66 | --pipeline.model.ssim-lambda 0.2 \ 67 | --pipeline.model.lpips-lambda 20.0 \ 68 | --pipeline.model.l1-lambda-on-captured-views 20.0 \ 69 | --pipeline.model.l1-lambda-on-generation-views 1.0 \ 70 | --pipeline.model.rasterize-mode antialiased \ 71 | --pipeline.model.use-absgrad True \ 72 | --pipeline.model.lpips-downsample 4 \ 73 | --pipeline.model.lpips-min-img-size 256 \ 74 | --pipeline.model.lpips-patch-size 512 \ 75 | --pipeline.model.lpips-no-resize True \ 76 | --pipeline.model.depth-l1-lambda 10.0 \ 77 | --pipeline.model.depth-ranking-lambda 20.0 \ 78 | --pipeline.model.output-depth-during-training True \ 79 | --pipeline.model.use-bilateral-grid False \ 80 | nerfstudio-data --center-method none --orientation-method none --auto-scale-poses False --train-split-fraction 1.0 --load-3D-points True --depth-unit-scale-factor 1.0 81 | # 2. use ns-render to render frames 82 | ns-render dataset --load-config outputs/splatfacto_matrix3d/exps/config.yml --image-format png --split=train --output-path renders 83 | # 3. 
write frames into videos 84 | python $REPO_DIR/utils/write_videos.py --render_root renders --type scene --num_splines 3 -------------------------------------------------------------------------------- /scripts/unposed_fewshot_to_3d_co3dv2.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 5 | # 6 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 7 | REPO_DIR=$(dirname "$SCRIPT_DIR") 8 | export NERFSTUDIO_METHOD_CONFIGS="splatfacto_matrix3d=splatfacto_matrix3d.splatfacto_configs:splatfacto_method" 9 | export PYTHONPATH=$PYTHONPATH:$REPO_DIR 10 | 11 | EXP_NAME=$1 12 | INPUT_PATH=$2 13 | NAME_EXT=$(basename "$INPUT_PATH") 14 | NAME="${NAME_EXT%.*}" 15 | 16 | ### Step 1: Generation: Create novel view observations 17 | CUDA_VISIBLE_DEVICES=0 python pipeline_unposed_few_shot_to_3d.py \ 18 | --config configs/config_stage3.yaml \ 19 | --exp_name $EXP_NAME \ 20 | --data_path $INPUT_PATH \ 21 | --spline_scales 3 \ 22 | --num_samples 80 \ 23 | --num_depth_runs_for_init_depth 15 \ 24 | --checkpoint_path checkpoints/matrix3d_512.pt \ 25 | --mixed_precision fp16 \ 26 | --random_seed 1 \ 27 | --use_loop_traj 0 \ 28 | --dataset co3dv2 29 | 30 | 31 | ### Step 2: Reconstruction: 3DGS optimization 32 | cd results/$EXP_NAME/$NAME 33 | 34 | # 1. optimization 35 | ITERS=3000 36 | NUM_IMG=5 37 | ns-train splatfacto_matrix3d \ 38 | --data transforms_train.json \ 39 | --mixed-precision False \ 40 | --output-dir outputs \ 41 | --timestamp exps \ 42 | --viewer.quit-on-train-completion True \ 43 | --max-num-iterations $ITERS \ 44 | --steps-per-save 1000 \ 45 | --pipeline.model.num-downscales 0 \ 46 | --pipeline.model.resolution-schedule 500 \ 47 | --pipeline.datamanager.max-num-iterations $ITERS \ 48 | --pipeline.datamanager.num_image_each_iteration $NUM_IMG \ 49 | --pipeline.model.warmup-length 500 \ 50 | --pipeline.model.densify-grad-thresh 0.0008 \ 51 | --pipeline.model.cull-alpha-thresh 0.2 \ 52 | --pipeline.model.cull-scale-thresh 0.5 \ 53 | --pipeline.model.cull-screen-size 0.5 \ 54 | --pipeline.model.reset-alpha-every 15 \ 55 | --pipeline.model.refine-every 100 \ 56 | --pipeline.model.use_scale_regularization True \ 57 | --pipeline.model.max-gauss-ratio 6 \ 58 | --pipeline.model.apply-annealing False \ 59 | --pipeline.model.stop-screen-size-at 4000 \ 60 | --pipeline.model.stop-split-at 2000 \ 61 | --pipeline.model.sh-degree 3 \ 62 | --pipeline.model.sh-degree-interval 800 \ 63 | --pipeline.model.accumulation-lambda 0.5 \ 64 | --pipeline.model.full-accumulation-lambda 5.0 \ 65 | --pipeline.model.start-full-accumulation 1500 \ 66 | --pipeline.model.ssim-lambda 0.2 \ 67 | --pipeline.model.lpips-lambda 20.0 \ 68 | --pipeline.model.l1-lambda-on-captured-views 20.0 \ 69 | --pipeline.model.l1-lambda-on-generation-views 1.0 \ 70 | --pipeline.model.rasterize-mode antialiased \ 71 | --pipeline.model.use-absgrad True \ 72 | --pipeline.model.lpips-downsample 4 \ 73 | --pipeline.model.lpips-min-img-size 256 \ 74 | --pipeline.model.lpips-patch-size 512 \ 75 | --pipeline.model.lpips-no-resize True \ 76 | --pipeline.model.depth-l1-lambda 10.0 \ 77 | --pipeline.model.depth-ranking-lambda 20.0 \ 78 | --pipeline.model.output-depth-during-training True \ 79 | --pipeline.model.use-bilateral-grid False \ 80 | nerfstudio-data --center-method none --orientation-method none --auto-scale-poses False --train-split-fraction 1.0 --load-3D-points True --depth-unit-scale-factor 1.0 81 
| # 2. use ns-render to render frames 82 | ns-render dataset --load-config outputs/splatfacto_matrix3d/exps/config.yml --image-format png --split=train --output-path renders 83 | # 3. write frames into videos 84 | python $REPO_DIR/utils/write_videos.py --render_root renders --type scene --num_splines 3 -------------------------------------------------------------------------------- /splatfacto_matrix3d/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # -------------------------------------------------------------------------------- /splatfacto_matrix3d/splatfacto_configs.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | from __future__ import annotations 6 | 7 | from nerfstudio.configs.base_config import ViewerConfig 8 | from nerfstudio.data.datamanagers.base_datamanager import VanillaDataManager, VanillaDataManagerConfig 9 | from nerfstudio.data.dataparsers.nerfstudio_dataparser import NerfstudioDataParserConfig 10 | from nerfstudio.data.datasets.depth_dataset import DepthDataset 11 | from nerfstudio.engine.optimizers import AdamOptimizerConfig 12 | from nerfstudio.engine.schedulers import ( 13 | CosineDecaySchedulerConfig, 14 | ExponentialDecaySchedulerConfig, 15 | MultiStepSchedulerConfig, 16 | ) 17 | from nerfstudio.engine.trainer import TrainerConfig 18 | from nerfstudio.pipelines.base_pipeline import VanillaPipelineConfig 19 | from nerfstudio.plugins.types import MethodSpecification 20 | 21 | 22 | from .batch_full_images_datamanager import FullImageDatamanager, BatchFullImageDatamanagerConfig 23 | from .splatfacto import SplatfactoModelConfig 24 | 25 | 26 | splatfacto_method = MethodSpecification( 27 | config=TrainerConfig( 28 | method_name="splatfacto_matrix3d", 29 | steps_per_eval_image=100, 30 | steps_per_eval_batch=0, 31 | steps_per_save=200, 32 | steps_per_eval_all_images=1000, 33 | max_num_iterations=1000, 34 | mixed_precision=False, 35 | pipeline=VanillaPipelineConfig( 36 | datamanager=BatchFullImageDatamanagerConfig( 37 | _target=FullImageDatamanager[DepthDataset], 38 | # dataparser=NerfstudioDataParserConfig(load_3D_points=True), 39 | cache_images_type="uint8", 40 | ), 41 | model=SplatfactoModelConfig(), 42 | ), 43 | optimizers={ 44 | "means": { 45 | "optimizer": AdamOptimizerConfig(lr=1.6e-4, eps=1e-15), 46 | "scheduler": ExponentialDecaySchedulerConfig( 47 | lr_final=1.6e-6, 48 | max_steps=30000, 49 | ), 50 | }, 51 | "features_dc": { 52 | "optimizer": AdamOptimizerConfig(lr=0.0025, eps=1e-15), 53 | "scheduler": None, 54 | }, 55 | "features_rest": { 56 | "optimizer": AdamOptimizerConfig(lr=0.0025 / 20, eps=1e-15), 57 | "scheduler": None, 58 | }, 59 | "opacities": { 60 | "optimizer": AdamOptimizerConfig(lr=0.05, eps=1e-15), 61 | "scheduler": None, 62 | }, 63 | "scales": { 64 | "optimizer": AdamOptimizerConfig(lr=0.005, eps=1e-15), 65 | "scheduler": None, 66 | }, 67 | "quats": {"optimizer": AdamOptimizerConfig(lr=0.001, eps=1e-15), "scheduler": None}, 68 | "camera_opt": { 69 | "optimizer": AdamOptimizerConfig(lr=1e-4, eps=1e-15), 70 | "scheduler": ExponentialDecaySchedulerConfig( 71 | lr_final=5e-7, max_steps=30000, warmup_steps=1000, lr_pre_warmup=0 72 | ), 73 | }, 74 | "bilateral_grid": { 75 | "optimizer": AdamOptimizerConfig(lr=2e-3, eps=1e-15), 76 | 
"scheduler": ExponentialDecaySchedulerConfig( 77 | lr_final=1e-4, max_steps=30000, warmup_steps=1000, lr_pre_warmup=0 78 | ), 79 | }, 80 | }, 81 | viewer=ViewerConfig(num_rays_per_chunk=1 << 15), 82 | vis="viewer", 83 | ), 84 | description="Matrix3D modified Gaussian-Splatting model for 3D reconstruction" 85 | ) 86 | -------------------------------------------------------------------------------- /utils/camera_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import numpy as np 7 | import torch 8 | import json 9 | import splines 10 | import splines.quaternion 11 | from pytorch3d.renderer import PerspectiveCameras 12 | from pytorch3d.renderer.cameras import look_at_view_transform 13 | from pytorch3d.transforms import quaternion_to_matrix, matrix_to_quaternion 14 | from pytorch3d.utils import opencv_from_cameras_projection 15 | 16 | 17 | def fov_to_focal(fov, size): 18 | # convert fov angle in degree to focal 19 | return size / np.tan(fov * np.pi / 180.0 / 2.0) / 2.0 20 | 21 | 22 | def focal_to_fov(focal, size): 23 | # convert focal to fov angle in degree 24 | return 2.0 * np.arctan(size / (2.0 * focal)) * 180.0 / np.pi 25 | 26 | 27 | def set_pytorch3d_cameras_eye_at_up(azimuths, elevations, distance=1.0): 28 | nv = azimuths.shape[0] 29 | azimuths, elevations = np.deg2rad(azimuths), np.deg2rad(elevations) 30 | x = distance * np.sin(azimuths) * np.cos(elevations) 31 | y = distance * np.sin(elevations) 32 | z = distance * np.cos(azimuths) * np.cos(elevations) * -1 33 | 34 | at = torch.tensor([[0., 0., 0.]]).repeat(nv, 1).float() 35 | up = torch.tensor([[0., 1., 0.]]).repeat(nv, 1).float() 36 | eye = torch.tensor([x, y, z]).T.float() 37 | 38 | R, T = look_at_view_transform(eye=eye, at=at, up=up) 39 | 40 | return R, T 41 | 42 | 43 | 44 | def fit_spline_given_pyt3d_cameras(pyt3d_camera, n_frames=80, scales=8, tension=0.5, 45 | continuity=0.0, bias=0.0, is_loop=True): 46 | num_keyframes = len(pyt3d_camera) 47 | end_frame = num_keyframes if is_loop else num_keyframes - 1 48 | timestamps = np.linspace(0, end_frame, n_frames, endpoint=False, ) 49 | quaternions_wxyz = matrix_to_quaternion(pyt3d_camera.R).numpy() 50 | positions = pyt3d_camera.get_camera_center().numpy() 51 | focals = pyt3d_camera.focal_length.numpy() 52 | orientation_spline = splines.quaternion.KochanekBartels( 53 | [ 54 | splines.quaternion.UnitQuaternion.from_unit_xyzw(np.roll(wxyz, shift=-1)) 55 | for wxyz in quaternions_wxyz 56 | ], 57 | tcb=(tension, continuity, bias), 58 | endconditions="closed" if is_loop else "natural", 59 | ) 60 | position_spline = splines.KochanekBartels( 61 | [position for position in positions], 62 | tcb=(tension, continuity, bias), 63 | endconditions="closed" if is_loop else "natural", 64 | ) 65 | focal_spline = splines.KochanekBartels( 66 | [foc for foc in focals], 67 | tcb=(tension, continuity, bias), 68 | endconditions="closed" if is_loop else "natural", 69 | ) 70 | quats = orientation_spline.evaluate(timestamps) 71 | quat_array = np.array([[quat.scalar, *quat.vector] for quat in quats], dtype=np.float32) 72 | points_array = position_spline.evaluate(timestamps).astype(np.float32) 73 | focal_array = focal_spline.evaluate(timestamps).astype(np.float32) 74 | 75 | # convert back to pyt3d 76 | R = quaternion_to_matrix(torch.from_numpy(quat_array)) 77 | points = torch.from_numpy(points_array).float() 78 | T = torch.bmm(-R.permute(0, 
2, 1), points[..., None])[..., 0] 79 | spline_focal = torch.from_numpy(focal_array) 80 | spline_p0 = pyt3d_camera.principal_point[0].unsqueeze(0).repeat(n_frames, 1) 81 | image_size = pyt3d_camera.image_size[0].unsqueeze(0).repeat(n_frames, 1) 82 | 83 | # scale the cameras based on the scales 84 | if scales == 1: 85 | scales_values = torch.Tensor([1.0 + 0.0 * s for s in range(scales)]) 86 | elif scales == 2: 87 | scales_values = torch.Tensor([1.0 + 0.05 * s for s in range(scales)]) 88 | elif scales == 3: 89 | scales_values = torch.Tensor([0.8 + 0.2 * s for s in range(scales)]) 90 | elif scales == 8: 91 | scales_values = torch.Tensor([0.9 + 0.05 * s for s in range(scales)]) 92 | else: 93 | raise NotImplementedError("Unsupported number of scales for spline fitting. Please configure it manually.") 94 | R_matrices = R[None].repeat(scales, 1, 1, 1) 95 | T_matrices = T[None].repeat(scales, 1, 1) * scales_values.unsqueeze(-1).unsqueeze(-1).repeat(1, n_frames, 1) 96 | 97 | 98 | new_R_matrices = [] 99 | new_T_matrices = [] 100 | from scipy.spatial.transform import Rotation 101 | for i in range(scales): 102 | new_T = T_matrices[i] 103 | # quat = Rotation.from_matrix(R_matrices[i].cpu().numpy()).as_quat() 104 | # rotation_matrix = Rotation.from_quat(quat).as_matrix() 105 | new_R_matrices.append(R_matrices[i]) 106 | new_T_matrices.append(new_T) 107 | new_R_matrices = torch.stack(new_R_matrices).flatten(0, 1) 108 | new_T_matrices = torch.stack(new_T_matrices).flatten(0, 1) 109 | 110 | 111 | spline_focal = spline_focal.repeat(scales, 1) 112 | spline_p0 = spline_p0.repeat(scales, 1) 113 | image_size = image_size.repeat(scales, 1) 114 | 115 | spline_cam = PerspectiveCameras( 116 | R=new_R_matrices, 117 | T=new_T_matrices, 118 | focal_length=spline_focal, 119 | principal_point=spline_p0, 120 | image_size=image_size, 121 | device=R.device, 122 | ) 123 | return spline_cam 124 | 125 | 126 | def write_pyt3d_camera_to_nerfstudio_json(folder, ref_camera, gen_camera, eval_camera=None, has_ply=False, has_mask=False, has_depth=False): 127 | # train jsons 128 | transform = {} 129 | frames_list = [] 130 | # reference_cameras 131 | num_ref_frames = len(ref_camera) 132 | camera_centers = ref_camera.get_camera_center() 133 | R_cv_w2c, tvec_cv, Ks = opencv_from_cameras_projection(ref_camera, image_size=ref_camera.image_size) 134 | for i in range(num_ref_frames): 135 | frame = {} 136 | R_c2w = ref_camera.R[i] 137 | R_c2w_blender = R_c2w.clone() 138 | # convert pytorch3d camera to blender/opengl camera 139 | R_c2w_blender[:, [0, 2]] *= -1 140 | # R_c2w = R_cv_w2c[i]#.T 141 | T_c2w = camera_centers[i].unsqueeze(-1) 142 | c2w = torch.cat([R_c2w_blender, T_c2w], dim=-1) 143 | c2w_homo = torch.cat([c2w, torch.Tensor([[0, 0, 0, 1]])]).float() 144 | frame["file_path"] = f"images/ref_frame_{i:04d}.png" 145 | frame["transform_matrix"] = c2w_homo.tolist() 146 | frame["fl_x"] = Ks[i][0, 0].item() 147 | frame["fl_y"] = Ks[i][1, 1].item() 148 | frame["cx"] = Ks[i][0, 2].item() 149 | frame["cy"] = Ks[i][1, 2].item() 150 | frame["w"] = ref_camera.image_size[0, 1].item() 151 | frame["h"] = ref_camera.image_size[0, 0].item() 152 | if has_mask: 153 | frame["mask_path"] = f"masks/ref_frame_{i:04d}.png" 154 | if has_depth: 155 | frame["depth_file_path"] = f"depths/ref_frame_{i:04d}.npy" 156 | frames_list.append(frame) 157 | # generation cameras 158 | num_gen_frames = len(gen_camera) 159 | camera_centers = gen_camera.get_camera_center() 160 | R_cv_w2c, tvec_cv, Ks = opencv_from_cameras_projection(gen_camera, image_size=gen_camera.image_size) 
161 | for i in range(num_gen_frames): 162 | frame = {} 163 | R_c2w = gen_camera.R[i] 164 | R_c2w_blender = R_c2w.clone() 165 | # convert pytorch3d camera to blender/opengl camera 166 | R_c2w_blender[:, [0, 2]] *= -1 167 | # R_c2w = R_cv_w2c[i]#.T 168 | T_c2w = camera_centers[i].unsqueeze(-1) 169 | c2w = torch.cat([R_c2w_blender, T_c2w], dim=-1) 170 | c2w_homo = torch.cat([c2w, torch.Tensor([[0, 0, 0, 1]])]).float() 171 | frame["file_path"] = f"images/frame_{i:04d}.png" 172 | frame["transform_matrix"] = c2w_homo.tolist() 173 | frame["fl_x"] = Ks[i][0, 0].item() 174 | frame["fl_y"] = Ks[i][1, 1].item() 175 | frame["cx"] = Ks[i][0, 2].item() 176 | frame["cy"] = Ks[i][1, 2].item() 177 | frame["w"] = gen_camera.image_size[0, 1].item() 178 | frame["h"] = gen_camera.image_size[0, 0].item() 179 | if has_mask: 180 | frame["mask_path"] = f"masks/frame_{i:04d}.png" 181 | if has_depth: 182 | frame["depth_file_path"] = f"depths/frame_{i:04d}.npy" 183 | frames_list.append(frame) 184 | transform["frames"] = frames_list 185 | if has_ply: 186 | transform["ply_file_path"] = "ref_pred_pointcloud.ply" 187 | with open(os.path.join(folder, 'transforms_train.json'), 'w') as json_file: 188 | json.dump(transform, json_file, indent=4) 189 | 190 | # test jsons 191 | if eval_camera is not None: 192 | transform = {} 193 | frames_list = [] 194 | # evaluation_cameras 195 | num_eval_frames = len(eval_camera) 196 | camera_centers = eval_camera.get_camera_center() 197 | R_cv_w2c, tvec_cv, Ks = opencv_from_cameras_projection(eval_camera, image_size=eval_camera.image_size) 198 | for i in range(num_eval_frames): 199 | frame = {} 200 | R_c2w = eval_camera.R[i] 201 | R_c2w_blender = R_c2w.clone() 202 | # convert pytorch3d camera to blender/opengl camera 203 | R_c2w_blender[:, [0, 2]] *= -1 204 | # R_c2w = R_cv_w2c[i]#.T 205 | T_c2w = camera_centers[i].unsqueeze(-1) 206 | c2w = torch.cat([R_c2w_blender, T_c2w], dim=-1) 207 | c2w_homo = torch.cat([c2w, torch.Tensor([[0, 0, 0, 1]])]).float() 208 | frame["file_path"] = f"images/eval_frame_{i:04d}.png" 209 | frame["transform_matrix"] = c2w_homo.tolist() 210 | frame["fl_x"] = Ks[i][0, 0].item() 211 | frame["fl_y"] = Ks[i][1, 1].item() 212 | frame["cx"] = Ks[i][0, 2].item() 213 | frame["cy"] = Ks[i][1, 2].item() 214 | frame["w"] = eval_camera.image_size[0, 1].item() 215 | frame["h"] = eval_camera.image_size[0, 0].item() 216 | if has_mask: 217 | frame["mask_path"] = f"masks/eval_frame_{i:04d}.png" 218 | if has_depth: 219 | frame["depth_file_path"] = f"depths/eval_frame_{i:04d}.npy" 220 | frames_list.append(frame) 221 | transform["frames"] = frames_list 222 | with open(os.path.join(folder, 'transforms_test.json'), 'w') as json_file: 223 | json.dump(transform, json_file, indent=4) 224 | -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | import torch 6 | import numpy as np 7 | import os 8 | import cv2 9 | from pytorch3d.renderer import PerspectiveCameras 10 | from pytorch3d.implicitron.tools.point_cloud_utils import ( 11 | render_point_cloud_pytorch3d, 12 | get_rgbd_point_cloud, 13 | ) 14 | 15 | 16 | MOD_FLAG_TABLE = { 17 | 'c': 0, 18 | 'g': 1, 19 | 'x': -1, 20 | } 21 | 22 | class DataHandler(): 23 | '''DataHandler for multi-view multi-modal data''' 24 | def __init__(self, data: dict, pad_length=None, except_keys=None): 25 | if not isinstance(data, dict): 26 | raise ValueError("Input data must be a dictionary.") 27 | self.data = data 28 | self.batch_size, self.num_view_raw = data['view_id'].shape 29 | if pad_length: 30 | self.pad_batch_data_using_first_value(pad_length, except_keys) 31 | 32 | 33 | def pad_batch_data_using_first_value(self, 34 | target_length, 35 | except_keys=['scene_id', 'global_caption', 'num_views', 'train_ids', 'test_ids', 'scene_scale']): 36 | # pad every value to target length 37 | for key in self.data.keys(): 38 | if key in except_keys: continue 39 | elif type(self.data[key]) == dict: 40 | if key == 'mods_flags': 41 | # use -1 (not used flag) for all mod flags 42 | for sub_key in self.data[key].keys(): 43 | current_length = self.data[key][sub_key].size(1) 44 | padding_size = target_length - current_length 45 | padding = torch.ones([1])[None].repeat(self.batch_size, padding_size) * -1 46 | self.data[key][sub_key] = torch.cat([self.data[key][sub_key], padding], dim=1) 47 | else: 48 | raise NotImplementedError() 49 | elif isinstance(self.data[key], torch.Tensor): 50 | current_length = self.data[key].size(1) 51 | if current_length < target_length: 52 | padding_size = target_length - current_length 53 | first_value = self.data[key][:, :1, ...] 54 | padding = first_value.repeat(1, padding_size, *[1] * (self.data[key].dim() - 2)) 55 | self.data[key] = torch.cat([self.data[key], padding], dim=1) 56 | elif isinstance(self.data[key], list): 57 | for i in range(len(self.data[key])): 58 | if isinstance(self.data[key][i], list): 59 | current_length = len(self.data[key][i]) 60 | self.data[key][i].extend([self.data[key][i][0] for _ in range(target_length - current_length)]) 61 | elif isinstance(self.data[key][i], PerspectiveCameras): 62 | current_length = len(self.data[key][i]) 63 | padding_size = target_length - current_length 64 | indices = [k for k in range(current_length)] + [0 for j in range(padding_size)] 65 | self.data[key][i] = self.data[key][i][indices] 66 | # hard code pass pytorch3d camera 67 | elif isinstance(self.data[key][i], str): continue 68 | # hard code pass global caption 69 | else: raise NotImplementedError(f'meet type {type(self.data[key])} not implemented! 
key={key}') 70 | 71 | 72 | def select_via_indices(self, 73 | indices=np.array([0, 1]), 74 | except_keys=['scene_id', 'global_caption', 'num_views', 'train_ids', 'test_ids', 'scene_scale'], 75 | reset_viewid=True): 76 | new_data = {} 77 | for key, value in self.data.items(): 78 | if key in except_keys: 79 | new_data[key] = value 80 | elif isinstance(value, dict): 81 | if key == 'mods_flags': 82 | new_data[key] = {} 83 | for sub_key in value.keys(): 84 | new_data[key][sub_key] = value[sub_key][:, indices].clone() 85 | else: 86 | raise NotImplementedError() 87 | elif isinstance(value, torch.Tensor): 88 | new_data[key] = value[:, indices].clone() 89 | elif isinstance(value, list): 90 | new_list = [] 91 | for item in value: 92 | if isinstance(item, list): 93 | new_list.append([item[idx] for idx in indices]) 94 | elif isinstance(item, PerspectiveCameras): 95 | new_list.append(item[indices.tolist()]) 96 | elif isinstance(item, str): 97 | new_list.append(item) 98 | else: 99 | raise NotImplementedError(f'meet type {type(item)} not implemented! key={key}') 100 | new_data[key] = new_list 101 | elif isinstance(value, bool): 102 | new_data[key] = value 103 | else: 104 | raise NotImplementedError(f'meet type {type(value)} not implemented! key={key}') 105 | 106 | if reset_viewid and 'view_id' in new_data: 107 | bs, num_view = new_data['view_id'].shape 108 | new_data['view_id'] = torch.arange(num_view)[None].repeat(bs, 1) 109 | return new_data 110 | 111 | @staticmethod 112 | def mod_flags_update(batch, mod_flags): 113 | num_view = batch['view_id'].shape[1] 114 | for mod_name, mod_flags in zip(['rgb', 'ray', 'depth'], mod_flags.split(',')): 115 | for view_i, mod_flag in enumerate(mod_flags): 116 | if view_i < int(num_view): 117 | batch['mods_flags'][mod_name][:, view_i] = MOD_FLAG_TABLE[mod_flag] 118 | # force set first-view pose flag as condition 119 | if mod_name == 'ray' and view_i == 0: 120 | batch['mods_flags'][mod_name][:, view_i] = MOD_FLAG_TABLE['c'] 121 | return batch 122 | 123 | def update(self, key, indices, values): 124 | if key in self.data: 125 | self.data[key][:, indices] = values 126 | 127 | def __call__(self, key): 128 | return self.data[key] if key in self.data else None 129 | 130 | 131 | 132 | 133 | def tensor_recursive_to(d: dict, func): 134 | if isinstance(d, (list)): 135 | iterator = range(len(d)) 136 | elif isinstance(d, dict): 137 | iterator = d.keys() 138 | for it in iterator: 139 | if isinstance(d[it], (list, dict, tuple)): 140 | if isinstance(d[it], tuple): 141 | d[it] = list(d[it]) 142 | tensor_recursive_to(d[it], func) 143 | elif isinstance(d[it], (int, float, str, np.ndarray, PerspectiveCameras)): 144 | pass 145 | elif d[it] == None: 146 | pass 147 | else: 148 | d[it] = func(d[it]) 149 | 150 | 151 | def save_compare_image(np_image, path): 152 | N, H, W, C = np_image.shape 153 | np_image = np_image.transpose(1, 0, 2, 3).reshape(H, N * W, C) 154 | os.makedirs(os.path.dirname(path), exist_ok=True) 155 | cv2.imwrite(path, np_image[..., ::-1]) 156 | 157 | 158 | def get_rgbd_point_cloud_numpy(cam, images, depths, depth_masks=None, mask_thr=None): 159 | point_cloud = get_rgbd_point_cloud(cam, images, depths, mask=depth_masks, mask_thr=mask_thr) 160 | points, colors = point_cloud.points_list()[0].detach().numpy(), point_cloud.features_list()[0].detach().numpy() 161 | # remove invalid points 162 | valid_mask = np.isfinite(points).all(axis=1) 163 | points, colors = points[valid_mask], colors[valid_mask] 164 | 165 | return points, colors 
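As a quick orientation to the DataHandler API above, here is a hedged usage sketch with a toy batch; the tensor shapes and the 'cond_image' key are placeholders chosen for illustration and are not taken from the repository's data format.

import numpy as np
import torch
from utils.data_utils import DataHandler

# Toy batch: one scene (B=1) with three views; 'view_id' and 'mods_flags' are the
# keys DataHandler relies on, the image tensor is only a dummy placeholder.
batch = {
    'view_id': torch.arange(3)[None],
    'cond_image': torch.zeros(1, 3, 3, 64, 64),
    'mods_flags': {m: torch.zeros(1, 3) for m in ['rgb', 'ray', 'depth']},
}
handler = DataHandler(batch)

# Mark RGB and pose as conditions and depth as generation (extra flag letters are ignored).
DataHandler.mod_flags_update(handler.data, 'cccccccc,cccccccc,gggggggg')

# Keep only views 0 and 2; view ids are re-indexed to 0..N-1 by default.
subset = handler.select_via_indices(indices=np.array([0, 2]))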
-------------------------------------------------------------------------------- /utils/vis.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import io 6 | import os 7 | import os.path as osp 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import torch 11 | from PIL import Image 12 | 13 | 14 | def unnormalize_image(image): 15 | if isinstance(image, torch.Tensor): 16 | image = image.cpu().numpy() 17 | if image.shape[0] == 3: 18 | image = image.transpose(1, 2, 0) 19 | mean = np.array([0.5, 0.5, 0.5]) 20 | std = np.array([0.5, 0.5, 0.5]) 21 | image = image * std + mean 22 | return (image * 255.0).astype(np.uint8) 23 | 24 | 25 | def plot_to_image(figure, dpi=100): 26 | """Converts matplotlib fig to a png for logging with tf.summary.image.""" 27 | buffer = io.BytesIO() 28 | figure.savefig(buffer, format="raw", dpi=dpi) 29 | plt.close(figure) 30 | buffer.seek(0) 31 | image = np.reshape( 32 | np.frombuffer(buffer.getvalue(), dtype=np.uint8), 33 | newshape=(int(figure.bbox.bounds[3]), int(figure.bbox.bounds[2]), -1), 34 | ) 35 | return image[..., :3] 36 | 37 | 38 | def view_color_coded_images_from_path(image_dir): 39 | cmap = plt.get_cmap("hsv") 40 | num_rows = 2 41 | num_cols = 4 42 | figsize = (num_cols * 2, num_rows * 2) 43 | fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize) 44 | axs = axs.flatten() 45 | 46 | def hidden(x): 47 | return not x.startswith(".") 48 | 49 | image_paths = sorted(os.listdir(image_dir)) 50 | image_paths = list(filter(hidden, image_paths)) 51 | image_paths = image_paths[0 : (min(len(image_paths), 8))] 52 | num_frames = len(image_paths) 53 | 54 | for i in range(num_rows * num_cols): 55 | if i < num_frames: 56 | img = np.asarray(Image.open(osp.join(image_dir, image_paths[i]))) 57 | print(img.shape) 58 | axs[i].imshow(img) 59 | for s in ["bottom", "top", "left", "right"]: 60 | axs[i].spines[s].set_color(cmap(i / (num_frames))) 61 | axs[i].spines[s].set_linewidth(5) 62 | axs[i].set_xticks([]) 63 | axs[i].set_yticks([]) 64 | else: 65 | axs[i].axis("off") 66 | plt.tight_layout() 67 | return fig, num_frames 68 | 69 | 70 | def view_color_coded_images_from_tensor(images): 71 | num_frames = images.shape[0] 72 | cmap = plt.get_cmap("hsv") 73 | num_rows = 2 74 | num_cols = 4 75 | figsize = (num_cols * 2, num_rows * 2) 76 | fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize) 77 | axs = axs.flatten() 78 | for i in range(num_rows * num_cols): 79 | if i < num_frames: 80 | axs[i].imshow(unnormalize_image(images[i])) 81 | for s in ["bottom", "top", "left", "right"]: 82 | axs[i].spines[s].set_color(cmap(i / (num_frames))) 83 | axs[i].spines[s].set_linewidth(5) 84 | axs[i].set_xticks([]) 85 | axs[i].set_yticks([]) 86 | else: 87 | axs[i].axis("off") 88 | plt.tight_layout() 89 | -------------------------------------------------------------------------------- /utils/write_videos.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | import argparse 6 | import os 7 | import cv2 8 | import imageio 9 | 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--render_root', type=str, default='logs/exp-xxxxx/renders/xxxxx-train-set') 14 | parser.add_argument('--num_samples', type=int, default=80) 15 | parser.add_argument('--num_splines', type=int, default=3) 16 | parser.add_argument('--type', type=str, default='scene') 17 | 18 | args = parser.parse_args() 19 | render_folder = os.path.join(args.render_root, 'train') 20 | pred_root = os.path.join(render_folder, 'rgb') 21 | num_frames = args.num_samples 22 | output_folder = os.path.dirname(os.path.dirname(args.render_root)) 23 | # scene_id = args.render_root.split('/')[-1] 24 | 25 | if args.type == 'scene': 26 | all_frames = sorted(os.listdir(pred_root)) 27 | for i in range(args.num_splines): 28 | st_id, ed_id = i * num_frames, (i + 1) * num_frames 29 | img_list = [] 30 | for j in range(st_id, ed_id): 31 | file = os.path.join(pred_root, f'frame_{j:04d}.png') 32 | img_list.append(cv2.imread(file)[..., ::-1]) 33 | video_file = os.path.join(output_folder, f'3DGS-render-traj{i}.mp4') 34 | imageio.mimsave(video_file, img_list, fps=30) 35 | elif args.type == 'object': 36 | all_frames = sorted(os.listdir(pred_root)) 37 | for i in range(1): 38 | st_id, ed_id = i * num_frames, (i + 1) * num_frames 39 | img_list = [] 40 | first_view_file = os.path.join(pred_root, 'ref_frame_0000.png') 41 | img_list.append(cv2.imread(first_view_file)[..., ::-1]) 42 | for j in range(st_id, ed_id - 1): 43 | file = os.path.join(pred_root, f'frame_{j:04d}.png') 44 | img_list.append(cv2.imread(file)[..., ::-1]) 45 | video_file = os.path.join(output_folder, f'3DGS-render-traj.mp4') 46 | imageio.mimsave(video_file, img_list, fps=30) --------------------------------------------------------------------------------
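For reference, a small sketch of the frame layout write_videos.py above consumes, assuming the defaults used by the scripts (ns-render writing to renders/train/rgb, --num_samples 80); it only lists file names and is not part of the repository.

import os

render_root = 'renders'            # value passed as --render_root in the scripts above
num_samples, num_splines = 80, 3   # defaults of write_videos.py
rgb_dir = os.path.join(render_root, 'train', 'rgb')

# 'scene' mode: num_splines trajectories of num_samples frames each
for i in range(num_splines):
    frames = [f'frame_{j:04d}.png' for j in range(i * num_samples, (i + 1) * num_samples)]
    print(f'trajectory {i}: {os.path.join(rgb_dir, frames[0])} .. {frames[-1]} -> 3DGS-render-traj{i}.mp4')

# 'object' mode: ref_frame_0000.png followed by frame_0000.png .. frame_0078.png -> 3DGS-render-traj.mp4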