├── .gitignore ├── ACKNOWLEDGMENTS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── IS_Net ├── Inference.py ├── __init__.py ├── basics.py ├── data_loader_cache.py ├── hce_metric_main.py ├── models │ ├── __init__.py │ └── isnet.py ├── pytorch18.yml ├── requirements.txt └── train_valid_inference_main.py ├── LICENSE ├── MODEL_LICENSE ├── README.md ├── configs └── config_stage3.yaml ├── data ├── __init__.py └── data_preprocessor.py ├── docs └── inference-pipe.png ├── examples ├── co3dv2-samples │ ├── 195_20989_41543 │ │ ├── 047.jpg │ │ ├── 047.txt │ │ ├── 047_depth.png │ │ ├── 047_ext.txt │ │ ├── 051.jpg │ │ ├── 051.txt │ │ ├── 051_depth.png │ │ ├── 051_ext.txt │ │ ├── 060.jpg │ │ ├── 060.txt │ │ ├── 060_depth.png │ │ ├── 060_ext.txt │ │ ├── 084.jpg │ │ ├── 084.txt │ │ ├── 084_depth.png │ │ ├── 084_ext.txt │ │ ├── 122.jpg │ │ ├── 122.txt │ │ ├── 122_depth.png │ │ ├── 122_ext.txt │ │ ├── 126.jpg │ │ ├── 126.txt │ │ ├── 126_depth.png │ │ ├── 126_ext.txt │ │ ├── 161.jpg │ │ ├── 161.txt │ │ ├── 161_depth.png │ │ ├── 161_ext.txt │ │ ├── 164.jpg │ │ ├── 164.txt │ │ ├── 164_depth.png │ │ └── 164_ext.txt │ ├── 198_21285_41285 │ │ ├── 000.jpg │ │ ├── 000.txt │ │ ├── 000_depth.png │ │ ├── 000_ext.txt │ │ ├── 038.jpg │ │ ├── 038.txt │ │ ├── 038_depth.png │ │ ├── 038_ext.txt │ │ ├── 051.jpg │ │ ├── 051.txt │ │ ├── 051_depth.png │ │ ├── 051_ext.txt │ │ ├── 074.jpg │ │ ├── 074.txt │ │ ├── 074_depth.png │ │ ├── 074_ext.txt │ │ ├── 121.jpg │ │ ├── 121.txt │ │ ├── 121_depth.png │ │ ├── 121_ext.txt │ │ ├── 122.jpg │ │ ├── 122.txt │ │ ├── 122_depth.png │ │ ├── 122_ext.txt │ │ ├── 155.jpg │ │ ├── 155.txt │ │ ├── 155_depth.png │ │ ├── 155_ext.txt │ │ ├── 176.jpg │ │ ├── 176.txt │ │ ├── 176_depth.png │ │ └── 176_ext.txt │ ├── 201_21613_43652 │ │ ├── 000.jpg │ │ ├── 000.txt │ │ ├── 000_depth.png │ │ ├── 000_ext.txt │ │ ├── 027.jpg │ │ ├── 027.txt │ │ ├── 027_depth.png │ │ ├── 027_ext.txt │ │ ├── 038.jpg │ │ ├── 038.txt │ │ ├── 038_depth.png │ │ ├── 038_ext.txt │ │ ├── 051.jpg │ │ ├── 051.txt │ │ ├── 051_depth.png │ │ ├── 051_ext.txt │ │ ├── 060.jpg │ │ ├── 060.txt │ │ ├── 060_depth.png │ │ ├── 060_ext.txt │ │ ├── 067.jpg │ │ ├── 067.txt │ │ ├── 067_depth.png │ │ ├── 067_ext.txt │ │ ├── 099.jpg │ │ ├── 099.txt │ │ ├── 099_depth.png │ │ ├── 099_ext.txt │ │ ├── 143.jpg │ │ ├── 143.txt │ │ ├── 143_depth.png │ │ └── 143_ext.txt │ ├── 31_1359_4114 │ │ ├── 010.jpg │ │ ├── 010.txt │ │ ├── 010_depth.png │ │ ├── 010_ext.txt │ │ ├── 017.jpg │ │ ├── 017.txt │ │ ├── 017_depth.png │ │ ├── 017_ext.txt │ │ ├── 041.jpg │ │ ├── 041.txt │ │ ├── 041_depth.png │ │ ├── 041_ext.txt │ │ ├── 072.jpg │ │ ├── 072.txt │ │ ├── 072_depth.png │ │ ├── 072_ext.txt │ │ ├── 099.jpg │ │ ├── 099.txt │ │ ├── 099_depth.png │ │ ├── 099_ext.txt │ │ ├── 107.jpg │ │ ├── 107.txt │ │ ├── 107_depth.png │ │ ├── 107_ext.txt │ │ ├── 118.jpg │ │ ├── 118.txt │ │ ├── 118_depth.png │ │ ├── 118_ext.txt │ │ ├── 130.jpg │ │ ├── 130.txt │ │ ├── 130_depth.png │ │ └── 130_ext.txt │ └── 422_58670_113666 │ │ ├── 000.jpg │ │ ├── 000.txt │ │ ├── 000_depth.png │ │ ├── 000_ext.txt │ │ ├── 028.jpg │ │ ├── 028.txt │ │ ├── 028_depth.png │ │ ├── 028_ext.txt │ │ ├── 044.jpg │ │ ├── 044.txt │ │ ├── 044_depth.png │ │ ├── 044_ext.txt │ │ ├── 062.jpg │ │ ├── 062.txt │ │ ├── 062_depth.png │ │ ├── 062_ext.txt │ │ ├── 068.jpg │ │ ├── 068.txt │ │ ├── 068_depth.png │ │ ├── 068_ext.txt │ │ ├── 074.jpg │ │ ├── 074.txt │ │ ├── 074_depth.png │ │ ├── 074_ext.txt │ │ ├── 098.jpg │ │ ├── 098.txt │ │ ├── 098_depth.png │ │ ├── 098_ext.txt │ │ ├── 101.jpg │ │ ├── 101.txt │ │ ├── 101_depth.png │ │ └── 
101_ext.txt ├── single-view │ ├── armor.png │ ├── armor.txt │ ├── ghost.png │ ├── ghost.txt │ ├── jacket.png │ ├── pile.png │ ├── pile.txt │ └── skull.png └── unposed-samples │ ├── arkitscenes │ ├── 41069043 │ │ ├── 061.png │ │ ├── 061.txt │ │ ├── 072.png │ │ ├── 072.txt │ │ ├── 081.png │ │ └── 081.txt │ └── 41125709 │ │ ├── 052.png │ │ ├── 052.txt │ │ ├── 053.png │ │ ├── 053.txt │ │ ├── 054.png │ │ └── 054.txt │ └── co3dv2 │ ├── 195_20989_41543 │ ├── 051.jpg │ ├── 051.txt │ ├── 084.jpg │ ├── 084.txt │ ├── 126.jpg │ └── 126.txt │ ├── 198_21285_41285 │ ├── 000.jpg │ ├── 000.txt │ ├── 038.jpg │ ├── 038.txt │ ├── 074.jpg │ └── 074.txt │ ├── 201_21613_43652 │ ├── 000.jpg │ ├── 000.txt │ ├── 038.jpg │ ├── 038.txt │ ├── 067.jpg │ └── 067.txt │ ├── 31_1359_4114 │ ├── 010.jpg │ ├── 010.txt │ ├── 041.jpg │ ├── 041.txt │ ├── 072.jpg │ └── 072.txt │ └── 422_58670_113666 │ ├── 000.jpg │ ├── 000.txt │ ├── 062.jpg │ ├── 062.txt │ ├── 101.jpg │ └── 101.txt ├── model ├── dinov2.py ├── dinov2_adaln │ ├── __init__.py │ └── adaln.py ├── dit.py ├── feature_extractors.py ├── hunyuan.py ├── inference │ ├── __init__.py │ └── ddpm.py ├── load.py └── utils │ ├── __init__.py │ ├── nn.py │ ├── normalize.py │ ├── pos_encoder.py │ └── rays.py ├── pipeline_depth_prediction.py ├── pipeline_novel_view_synthesis.py ├── pipeline_pose_estimation.py ├── pipeline_single_to_3d.py ├── pipeline_unposed_few_shot_to_3d.py ├── requirements.txt ├── scripts ├── depth_prediction.sh ├── novel_view_synthesis.sh ├── pose_estimation.sh ├── single_view_to_3d.sh ├── unposed_fewshot_to_3d_arkitscenes.sh └── unposed_fewshot_to_3d_co3dv2.sh ├── splatfacto_matrix3d ├── __init__.py ├── batch_full_images_datamanager.py ├── splatfacto.py ├── splatfacto_configs.py └── strategy.py └── utils ├── camera_utils.py ├── data_utils.py ├── train_utils.py ├── vis.py └── write_videos.py /.gitignore: -------------------------------------------------------------------------------- 1 | # debug files 2 | debug/ 3 | sd-model-finetuned 4 | logs 5 | outputs 6 | checkpoints 7 | results 8 | 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | .DS_Store 141 | .vscode 142 | 143 | __MACOSX 144 | -------------------------------------------------------------------------------- /ACKNOWLEDGMENTS: -------------------------------------------------------------------------------- 1 | Acknowledgements 2 | Portions of this Matrix3D Software may utilize the following copyrighted 3 | material, the use of which is hereby acknowledged. 4 | 5 | _____________________ 6 | 7 | The HuggingFace Team (https://github.com/huggingface/diffusers) 8 | This Software uses code from the diffusers library from the HuggingFace 9 | Team, which is distributed under Apache license. 10 | 11 | Copyright 2024 The HuggingFace Team. All rights reserved. 12 | 13 | Licensed under the Apache License, Version 2.0 (the "License"); 14 | you may not use this file except in compliance with the License. 15 | You may obtain a copy of the License at 16 | 17 | http://www.apache.org/licenses/LICENSE-2.0 18 | 19 | Unless required by applicable law or agreed to in writing, software 20 | distributed under the License is distributed on an "AS IS" BASIS, 21 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | See the License for the specific language governing permissions and 23 | limitations under the License. 24 | 25 | The Nerfstudio Team (https://github.com/nerfstudio-project/nerfstudio) 26 | This Software uses code from the nerfstudio library, which is distributed 27 | under Apache license. 28 | 29 | Copyright 2023 The Nerfstudio Team 30 | 31 | Licensed under the Apache License, Version 2.0 (the "License"); 32 | you may not use this file except in compliance with the License. 
33 | You may obtain a copy of the License at 34 | 35 | http://www.apache.org/licenses/LICENSE-2.0 36 | 37 | Unless required by applicable law or agreed to in writing, software 38 | distributed under the License is distributed on an "AS IS" BASIS, 39 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | See the License for the specific language governing permissions and 41 | limitations under the License. 42 | 43 | Xuebin Qin (https://github.com/xuebinqin/DIS) 44 | This Software uses code from the DIS library, which is distributed under Apache 45 | license. 46 | 47 | Copyright 2024 Xuebin Qin. All rights reserved. 48 | 49 | Licensed under the Apache License, Version 2.0 (the "License"); 50 | you may not use this file except in compliance with the License. 51 | You may obtain a copy of the License at 52 | 53 | http://www.apache.org/licenses/LICENSE-2.0 54 | 55 | Unless required by applicable law or agreed to in writing, software 56 | distributed under the License is distributed on an "AS IS" BASIS, 57 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 58 | See the License for the specific language governing permissions and 59 | limitations under the License. 60 | 61 | Jason Y. Zhang (https://github.com/jasonyzhang/RayDiffusion) 62 | This Software uses code from the RayDiffusion library, 63 | which is distributed under MIT license. 64 | 65 | Copyright (c) 2024 Jason Y. Zhang 66 | 67 | Permission is hereby granted, free of charge, to any person obtaining a copy 68 | of this software and associated documentation files (the "Software"), to deal 69 | in the Software without restriction, including without limitation the rights 70 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 71 | copies of the Software, and to permit persons to whom the Software is 72 | furnished to do so, subject to the following conditions: 73 | 74 | The above copyright notice and this permission notice shall be included in all 75 | copies or substantial portions of the Software. 76 | 77 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 78 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 79 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 80 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 81 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 82 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 83 | SOFTWARE. 84 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, 71 | available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html) -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | Thanks for your interest in contributing. 
This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository. 4 | 5 | While we welcome new pull requests and issues please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged. 6 | 7 | ## Before you get started 8 | 9 | By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE). 10 | 11 | We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md). 12 | -------------------------------------------------------------------------------- /IS_Net/Inference.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import time 7 | import numpy as np 8 | from skimage import io 9 | import time 10 | from glob import glob 11 | from tqdm import tqdm 12 | 13 | import torch, gc 14 | import torch.nn as nn 15 | from torch.autograd import Variable 16 | import torch.optim as optim 17 | import torch.nn.functional as F 18 | from torchvision.transforms.functional import normalize 19 | 20 | from models import * 21 | 22 | 23 | if __name__ == "__main__": 24 | dataset_path="../demo_datasets/your_dataset" #Your dataset path 25 | model_path="../saved_models/IS-Net/isnet-general-use.pth" # the model path 26 | result_path="../demo_datasets/your_dataset_result" #The folder path that you want to save the results 27 | input_size=[1024,1024] 28 | net=ISNetDIS() 29 | 30 | if torch.cuda.is_available(): 31 | net.load_state_dict(torch.load(model_path)) 32 | net=net.cuda() 33 | else: 34 | net.load_state_dict(torch.load(model_path,map_location="cpu")) 35 | net.eval() 36 | im_list = glob(dataset_path+"/*.jpg")+glob(dataset_path+"/*.JPG")+glob(dataset_path+"/*.jpeg")+glob(dataset_path+"/*.JPEG")+glob(dataset_path+"/*.png")+glob(dataset_path+"/*.PNG")+glob(dataset_path+"/*.bmp")+glob(dataset_path+"/*.BMP")+glob(dataset_path+"/*.tiff")+glob(dataset_path+"/*.TIFF") 37 | with torch.no_grad(): 38 | for i, im_path in tqdm(enumerate(im_list), total=len(im_list)): 39 | print("im_path: ", im_path) 40 | im = io.imread(im_path) 41 | if len(im.shape) < 3: 42 | im = im[:, :, np.newaxis] 43 | im_shp=im.shape[0:2] 44 | im_tensor = torch.tensor(im, dtype=torch.float32).permute(2,0,1) 45 | im_tensor = F.upsample(torch.unsqueeze(im_tensor,0), input_size, mode="bilinear").type(torch.uint8) 46 | image = torch.divide(im_tensor,255.0) 47 | image = normalize(image,[0.5,0.5,0.5],[1.0,1.0,1.0]) 48 | 49 | if torch.cuda.is_available(): 50 | image=image.cuda() 51 | result=net(image) 52 | result=torch.squeeze(F.upsample(result[0][0],im_shp,mode='bilinear'),0) 53 | ma = torch.max(result) 54 | mi = torch.min(result) 55 | result = (result-mi)/(ma-mi) 56 | im_name=im_path.split('/')[-1].split('.')[0] 57 | io.imsave(os.path.join(result_path,im_name+".png"),(result*255).permute(1,2,0).cpu().data.numpy().astype(np.uint8)) 58 | -------------------------------------------------------------------------------- /IS_Net/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # -------------------------------------------------------------------------------- /IS_Net/basics.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | # os.environ['CUDA_VISIBLE_DEVICES'] = '2' 7 | from skimage import io, transform 8 | import torch 9 | import torchvision 10 | from torch.autograd import Variable 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | from torch.utils.data import Dataset, DataLoader 14 | from torchvision import transforms, utils 15 | import torch.optim as optim 16 | 17 | import matplotlib.pyplot as plt 18 | import numpy as np 19 | from PIL import Image 20 | import glob 21 | 22 | def mae_torch(pred,gt): 23 | 24 | h,w = gt.shape[0:2] 25 | sumError = torch.sum(torch.absolute(torch.sub(pred.float(), gt.float()))) 26 | maeError = torch.divide(sumError,float(h)*float(w)*255.0+1e-4) 27 | 28 | return maeError 29 | 30 | def f1score_torch(pd,gt): 31 | 32 | # print(gt.shape) 33 | gtNum = torch.sum((gt>128).float()*1) ## number of ground truth pixels 34 | 35 | pp = pd[gt>128] 36 | nn = pd[gt<=128] 37 | 38 | pp_hist =torch.histc(pp,bins=255,min=0,max=255) 39 | nn_hist = torch.histc(nn,bins=255,min=0,max=255) 40 | 41 | 42 | pp_hist_flip = torch.flipud(pp_hist) 43 | nn_hist_flip = torch.flipud(nn_hist) 44 | 45 | pp_hist_flip_cum = torch.cumsum(pp_hist_flip, dim=0) 46 | nn_hist_flip_cum = torch.cumsum(nn_hist_flip, dim=0) 47 | 48 | precision = (pp_hist_flip_cum)/(pp_hist_flip_cum + nn_hist_flip_cum + 1e-4)#torch.divide(pp_hist_flip_cum,torch.sum(torch.sum(pp_hist_flip_cum, nn_hist_flip_cum), 1e-4)) 49 | recall = (pp_hist_flip_cum)/(gtNum + 1e-4) 50 | f1 = (1+0.3)*precision*recall/(0.3*precision+recall + 1e-4) 51 | 52 | return torch.reshape(precision,(1,precision.shape[0])),torch.reshape(recall,(1,recall.shape[0])),torch.reshape(f1,(1,f1.shape[0])) 53 | 54 | 55 | def f1_mae_torch(pred, gt, valid_dataset, idx, mybins, hypar): 56 | 57 | import time 58 | tic = time.time() 59 | 60 | if(len(gt.shape)>2): 61 | gt = gt[:,:,0] 62 | 63 | pre, rec, f1 = f1score_torch(pred,gt) 64 | mae = mae_torch(pred,gt) 65 | 66 | 67 | # hypar["valid_out_dir"] = hypar["valid_out_dir"]+"-eval" ### 68 | if(hypar["valid_out_dir"]!=""): 69 | if(not os.path.exists(hypar["valid_out_dir"])): 70 | os.mkdir(hypar["valid_out_dir"]) 71 | dataset_folder = os.path.join(hypar["valid_out_dir"],valid_dataset.dataset["data_name"][idx]) 72 | if(not os.path.exists(dataset_folder)): 73 | os.mkdir(dataset_folder) 74 | io.imsave(os.path.join(dataset_folder,valid_dataset.dataset["im_name"][idx]+".png"),pred.cpu().data.numpy().astype(np.uint8)) 75 | print(valid_dataset.dataset["im_name"][idx]+".png") 76 | print("time for evaluation : ", time.time()-tic) 77 | 78 | return pre.cpu().data.numpy(), rec.cpu().data.numpy(), f1.cpu().data.numpy(), mae.cpu().data.numpy() 79 | -------------------------------------------------------------------------------- /IS_Net/hce_metric_main.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | import numpy as np 6 | from skimage import io 7 | import matplotlib.pyplot as plt 8 | import cv2 as cv 9 | from skimage.morphology import skeletonize 10 | from skimage.morphology import erosion, dilation, disk 11 | from skimage.measure import label 12 | 13 | import os 14 | import sys 15 | from tqdm import tqdm 16 | from glob import glob 17 | import pickle as pkl 18 | 19 | def filter_bdy_cond(bdy_, mask, cond): 20 | 21 | cond = cv.dilate(cond.astype(np.uint8),disk(1)) 22 | labels = label(mask) # find the connected regions 23 | lbls = np.unique(labels) # the indices of the connected regions 24 | indep = np.ones(lbls.shape[0]) # the label of each connected regions 25 | indep[0] = 0 # 0 indicate the background region 26 | 27 | boundaries = [] 28 | h,w = cond.shape[0:2] 29 | ind_map = np.zeros((h,w)) 30 | indep_cnt = 0 31 | 32 | for i in range(0,len(bdy_)): 33 | tmp_bdies = [] 34 | tmp_bdy = [] 35 | for j in range(0,bdy_[i].shape[0]): 36 | r, c = bdy_[i][j,0,1],bdy_[i][j,0,0] 37 | 38 | if(np.sum(cond[r,c])==0 or ind_map[r,c]!=0): 39 | if(len(tmp_bdy)>0): 40 | tmp_bdies.append(tmp_bdy) 41 | tmp_bdy = [] 42 | continue 43 | tmp_bdy.append([c,r]) 44 | ind_map[r,c] = ind_map[r,c] + 1 45 | indep[labels[r,c]] = 0 # indicates part of the boundary of this region needs human correction 46 | if(len(tmp_bdy)>0): 47 | tmp_bdies.append(tmp_bdy) 48 | 49 | # check if the first and the last boundaries are connected 50 | # if yes, invert the first boundary and attach it after the last boundary 51 | if(len(tmp_bdies)>1): 52 | first_x, first_y = tmp_bdies[0][0] 53 | last_x, last_y = tmp_bdies[-1][-1] 54 | if((abs(first_x-last_x)==1 and first_y==last_y) or 55 | (first_x==last_x and abs(first_y-last_y)==1) or 56 | (abs(first_x-last_x)==1 and abs(first_y-last_y)==1) 57 | ): 58 | tmp_bdies[-1].extend(tmp_bdies[0][::-1]) 59 | del tmp_bdies[0] 60 | 61 | for k in range(0,len(tmp_bdies)): 62 | tmp_bdies[k] = np.array(tmp_bdies[k])[:,np.newaxis,:] 63 | if(len(tmp_bdies)>0): 64 | boundaries.extend(tmp_bdies) 65 | 66 | return boundaries, np.sum(indep) 67 | 68 | # this function approximate each boundary by DP algorithm 69 | # https://en.wikipedia.org/wiki/Ramer%E2%80%93Douglas%E2%80%93Peucker_algorithm 70 | def approximate_RDP(boundaries,epsilon=1.0): 71 | 72 | boundaries_ = [] 73 | boundaries_len_ = [] 74 | pixel_cnt_ = 0 75 | 76 | # polygon approximate of each boundary 77 | for i in range(0,len(boundaries)): 78 | boundaries_.append(cv.approxPolyDP(boundaries[i],epsilon,False)) 79 | 80 | # count the control points number of each boundary and the total control points number of all the boundaries 81 | for i in range(0,len(boundaries_)): 82 | boundaries_len_.append(len(boundaries_[i])) 83 | pixel_cnt_ = pixel_cnt_ + len(boundaries_[i]) 84 | 85 | return boundaries_, boundaries_len_, pixel_cnt_ 86 | 87 | 88 | def relax_HCE(gt, rs, gt_ske, relax=5, epsilon=2.0): 89 | # print("max(gt_ske): ", np.amax(gt_ske)) 90 | # gt_ske = gt_ske>128 91 | # print("max(gt_ske): ", np.amax(gt_ske)) 92 | 93 | # Binarize gt 94 | if(len(gt.shape)>2): 95 | gt = gt[:,:,0] 96 | 97 | epsilon_gt = 128#(np.amin(gt)+np.amax(gt))/2.0 98 | gt = (gt>epsilon_gt).astype(np.uint8) 99 | 100 | # Binarize rs 101 | if(len(rs.shape)>2): 102 | rs = rs[:,:,0] 103 | epsilon_rs = 128#(np.amin(rs)+np.amax(rs))/2.0 104 | rs = (rs>epsilon_rs).astype(np.uint8) 105 | 106 | Union = np.logical_or(gt,rs) 107 | TP = np.logical_and(gt,rs) 108 | FP = rs - TP 109 | FN = gt - TP 110 | 111 | # relax the Union of gt and rs 112 | Union_erode = Union.copy() 113 | Union_erode = 
cv.erode(Union_erode.astype(np.uint8),disk(1),iterations=relax) 114 | 115 | # --- get the relaxed False Positive regions for computing the human efforts in correcting them --- 116 | FP_ = np.logical_and(FP,Union_erode) # get the relaxed FP 117 | for i in range(0,relax): 118 | FP_ = cv.dilate(FP_.astype(np.uint8),disk(1)) 119 | FP_ = np.logical_and(FP_, 1-np.logical_or(TP,FN)) 120 | FP_ = np.logical_and(FP, FP_) 121 | 122 | # --- get the relaxed False Negative regions for computing the human efforts in correcting them --- 123 | FN_ = np.logical_and(FN,Union_erode) # preserve the structural components of FN 124 | ## recover the FN, where pixels are not close to the TP borders 125 | for i in range(0,relax): 126 | FN_ = cv.dilate(FN_.astype(np.uint8),disk(1)) 127 | FN_ = np.logical_and(FN_,1-np.logical_or(TP,FP)) 128 | FN_ = np.logical_and(FN,FN_) 129 | FN_ = np.logical_or(FN_, np.logical_xor(gt_ske,np.logical_and(TP,gt_ske))) # preserve the structural components of FN 130 | 131 | ## 2. =============Find exact polygon control points and independent regions============== 132 | ## find contours from FP_ 133 | ctrs_FP, hier_FP = cv.findContours(FP_.astype(np.uint8), cv.RETR_TREE, cv.CHAIN_APPROX_NONE) 134 | ## find control points and independent regions for human correction 135 | bdies_FP, indep_cnt_FP = filter_bdy_cond(ctrs_FP, FP_, np.logical_or(TP,FN_)) 136 | ## find contours from FN_ 137 | ctrs_FN, hier_FN = cv.findContours(FN_.astype(np.uint8), cv.RETR_TREE, cv.CHAIN_APPROX_NONE) 138 | ## find control points and independent regions for human correction 139 | bdies_FN, indep_cnt_FN = filter_bdy_cond(ctrs_FN, FN_, 1-np.logical_or(np.logical_or(TP,FP_),FN_)) 140 | 141 | poly_FP, poly_FP_len, poly_FP_point_cnt = approximate_RDP(bdies_FP,epsilon=epsilon) 142 | poly_FN, poly_FN_len, poly_FN_point_cnt = approximate_RDP(bdies_FN,epsilon=epsilon) 143 | 144 | return poly_FP_point_cnt, indep_cnt_FP, poly_FN_point_cnt, indep_cnt_FN 145 | 146 | def compute_hce(pred_root,gt_root,gt_ske_root): 147 | 148 | gt_name_list = glob(pred_root+'/*.png') 149 | gt_name_list = sorted([x.split('/')[-1] for x in gt_name_list]) 150 | 151 | hces = [] 152 | for gt_name in tqdm(gt_name_list, total=len(gt_name_list)): 153 | gt_path = os.path.join(gt_root, gt_name) 154 | pred_path = os.path.join(pred_root, gt_name) 155 | 156 | gt = cv.imread(gt_path, cv.IMREAD_GRAYSCALE) 157 | pred = cv.imread(pred_path, cv.IMREAD_GRAYSCALE) 158 | 159 | ske_path = os.path.join(gt_ske_root,gt_name) 160 | if os.path.exists(ske_path): 161 | ske = cv.imread(ske_path,cv.IMREAD_GRAYSCALE) 162 | ske = ske>128 163 | else: 164 | ske = skeletonize(gt>128) 165 | 166 | FP_points, FP_indep, FN_points, FN_indep = relax_HCE(gt, pred,ske) 167 | print(gt_path.split('/')[-1],FP_points, FP_indep, FN_points, FN_indep) 168 | hces.append([FP_points, FP_indep, FN_points, FN_indep, FP_points+FP_indep+FN_points+FN_indep]) 169 | 170 | hce_metric ={'names': gt_name_list, 171 | 'hces': hces} 172 | 173 | 174 | file_metric = open(pred_root+'/hce_metric.pkl','wb') 175 | pkl.dump(hce_metric,file_metric) 176 | # file_metrics.write(cmn_metrics) 177 | file_metric.close() 178 | 179 | return np.mean(np.array(hces)[:,-1]) 180 | 181 | def main(): 182 | 183 | gt_root = "../DIS5K/DIS-VD/gt" 184 | gt_ske_root = "" 185 | pred_root = "../Results/isnet(ours)/DIS-VD" 186 | 187 | print("The average HCE metric: ", compute_hce(pred_root,gt_root,gt_ske_root)) 188 | 189 | 190 | if __name__ == '__main__': 191 | main() 192 | 
-------------------------------------------------------------------------------- /IS_Net/models/__init__.py: -------------------------------------------------------------------------------- 1 | from IS_Net.models.isnet import ISNetGTEncoder, ISNetDIS 2 | -------------------------------------------------------------------------------- /IS_Net/pytorch18.yml: -------------------------------------------------------------------------------- 1 | name: pytorch18 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | - pytorch 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=main 9 | - _openmp_mutex=4.5=1_gnu 10 | - blas=1.0=mkl 11 | - brotli=1.0.9=he6710b0_2 12 | - bzip2=1.0.8=h7b6447c_0 13 | - ca-certificates=2022.2.1=h06a4308_0 14 | - certifi=2021.10.8=py37h06a4308_2 15 | - cloudpickle=2.0.0=pyhd3eb1b0_0 16 | - colorama=0.4.4=pyhd3eb1b0_0 17 | - cudatoolkit=10.2.89=hfd86e86_1 18 | - cycler=0.11.0=pyhd3eb1b0_0 19 | - cytoolz=0.11.0=py37h7b6447c_0 20 | - dask-core=2021.10.0=pyhd3eb1b0_0 21 | - ffmpeg=4.3=hf484d3e_0 22 | - fonttools=4.25.0=pyhd3eb1b0_0 23 | - freetype=2.11.0=h70c0345_0 24 | - fsspec=2022.2.0=pyhd3eb1b0_0 25 | - gmp=6.2.1=h2531618_2 26 | - gnutls=3.6.15=he1e5248_0 27 | - imageio=2.9.0=pyhd3eb1b0_0 28 | - intel-openmp=2021.4.0=h06a4308_3561 29 | - jpeg=9b=h024ee3a_2 30 | - kiwisolver=1.3.2=py37h295c915_0 31 | - lame=3.100=h7b6447c_0 32 | - lcms2=2.12=h3be6417_0 33 | - ld_impl_linux-64=2.35.1=h7274673_9 34 | - libffi=3.3=he6710b0_2 35 | - libgcc-ng=9.3.0=h5101ec6_17 36 | - libgfortran-ng=7.5.0=ha8ba4b0_17 37 | - libgfortran4=7.5.0=ha8ba4b0_17 38 | - libgomp=9.3.0=h5101ec6_17 39 | - libiconv=1.15=h63c8f33_5 40 | - libidn2=2.3.2=h7f8727e_0 41 | - libpng=1.6.37=hbc83047_0 42 | - libstdcxx-ng=9.3.0=hd4cf53a_17 43 | - libtasn1=4.16.0=h27cfd23_0 44 | - libtiff=4.2.0=h85742a9_0 45 | - libunistring=0.9.10=h27cfd23_0 46 | - libuv=1.40.0=h7b6447c_0 47 | - libwebp-base=1.2.2=h7f8727e_0 48 | - locket=0.2.1=py37h06a4308_2 49 | - lz4-c=1.9.3=h295c915_1 50 | - matplotlib-base=3.5.1=py37ha18d171_1 51 | - mkl=2021.4.0=h06a4308_640 52 | - mkl-service=2.4.0=py37h7f8727e_0 53 | - mkl_fft=1.3.1=py37hd3c417c_0 54 | - mkl_random=1.2.2=py37h51133e4_0 55 | - munkres=1.1.4=py_0 56 | - ncurses=6.3=h7f8727e_2 57 | - nettle=3.7.3=hbbd107a_1 58 | - networkx=2.6.3=pyhd3eb1b0_0 59 | - ninja=1.10.2=py37hd09550d_3 60 | - numpy=1.21.2=py37h20f2e39_0 61 | - numpy-base=1.21.2=py37h79a1101_0 62 | - olefile=0.46=py37_0 63 | - openh264=2.1.1=h4ff587b_0 64 | - openssl=1.1.1n=h7f8727e_0 65 | - packaging=21.3=pyhd3eb1b0_0 66 | - partd=1.2.0=pyhd3eb1b0_1 67 | - pillow=8.0.0=py37h9a89aac_0 68 | - pip=21.2.2=py37h06a4308_0 69 | - pyparsing=3.0.4=pyhd3eb1b0_0 70 | - python=3.7.11=h12debd9_0 71 | - python-dateutil=2.8.2=pyhd3eb1b0_0 72 | - pytorch=1.8.0=py3.7_cuda10.2_cudnn7.6.5_0 73 | - pywavelets=1.1.1=py37h7b6447c_2 74 | - pyyaml=6.0=py37h7f8727e_1 75 | - readline=8.1.2=h7f8727e_1 76 | - scikit-image=0.15.0=py37hb3f55d8_2 77 | - scipy=1.7.3=py37hc147768_0 78 | - setuptools=58.0.4=py37h06a4308_0 79 | - six=1.16.0=pyhd3eb1b0_1 80 | - sqlite=3.38.0=hc218d9a_0 81 | - tk=8.6.11=h1ccaba5_0 82 | - toolz=0.11.2=pyhd3eb1b0_0 83 | - torchaudio=0.8.0=py37 84 | - torchvision=0.9.0=py37_cu102 85 | - tqdm=4.63.0=pyhd8ed1ab_0 86 | - typing_extensions=3.10.0.2=pyh06a4308_0 87 | - wheel=0.37.1=pyhd3eb1b0_0 88 | - xz=5.2.5=h7b6447c_0 89 | - yaml=0.2.5=h7b6447c_0 90 | - zlib=1.2.11=h7f8727e_4 91 | - zstd=1.4.9=haebb681_0 92 | prefix: /home/solar/anaconda3/envs/pytorch18 93 | 
-------------------------------------------------------------------------------- /IS_Net/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | _libgcc_mutex=0.1=main 5 | _openmp_mutex=4.5=1_gnu 6 | blas=1.0=mkl 7 | brotli=1.0.9=he6710b0_2 8 | bzip2=1.0.8=h7b6447c_0 9 | ca-certificates=2022.2.1=h06a4308_0 10 | certifi=2021.10.8=py37h06a4308_2 11 | cloudpickle=2.0.0=pyhd3eb1b0_0 12 | colorama=0.4.4=pyhd3eb1b0_0 13 | cudatoolkit=10.2.89=hfd86e86_1 14 | cycler=0.11.0=pyhd3eb1b0_0 15 | cytoolz=0.11.0=py37h7b6447c_0 16 | dask-core=2021.10.0=pyhd3eb1b0_0 17 | ffmpeg=4.3=hf484d3e_0 18 | fonttools=4.25.0=pyhd3eb1b0_0 19 | freetype=2.11.0=h70c0345_0 20 | fsspec=2022.2.0=pyhd3eb1b0_0 21 | gmp=6.2.1=h2531618_2 22 | gnutls=3.6.15=he1e5248_0 23 | imageio=2.9.0=pyhd3eb1b0_0 24 | intel-openmp=2021.4.0=h06a4308_3561 25 | jpeg=9b=h024ee3a_2 26 | kiwisolver=1.3.2=py37h295c915_0 27 | lame=3.100=h7b6447c_0 28 | lcms2=2.12=h3be6417_0 29 | ld_impl_linux-64=2.35.1=h7274673_9 30 | libffi=3.3=he6710b0_2 31 | libgcc-ng=9.3.0=h5101ec6_17 32 | libgfortran-ng=7.5.0=ha8ba4b0_17 33 | libgfortran4=7.5.0=ha8ba4b0_17 34 | libgomp=9.3.0=h5101ec6_17 35 | libiconv=1.15=h63c8f33_5 36 | libidn2=2.3.2=h7f8727e_0 37 | libpng=1.6.37=hbc83047_0 38 | libstdcxx-ng=9.3.0=hd4cf53a_17 39 | libtasn1=4.16.0=h27cfd23_0 40 | libtiff=4.2.0=h85742a9_0 41 | libunistring=0.9.10=h27cfd23_0 42 | libuv=1.40.0=h7b6447c_0 43 | libwebp-base=1.2.2=h7f8727e_0 44 | locket=0.2.1=py37h06a4308_2 45 | lz4-c=1.9.3=h295c915_1 46 | matplotlib-base=3.5.1=py37ha18d171_1 47 | mkl=2021.4.0=h06a4308_640 48 | mkl-service=2.4.0=py37h7f8727e_0 49 | mkl_fft=1.3.1=py37hd3c417c_0 50 | mkl_random=1.2.2=py37h51133e4_0 51 | munkres=1.1.4=py_0 52 | ncurses=6.3=h7f8727e_2 53 | nettle=3.7.3=hbbd107a_1 54 | networkx=2.6.3=pyhd3eb1b0_0 55 | ninja=1.10.2=py37hd09550d_3 56 | numpy=1.21.2=py37h20f2e39_0 57 | numpy-base=1.21.2=py37h79a1101_0 58 | olefile=0.46=py37_0 59 | openh264=2.1.1=h4ff587b_0 60 | openssl=1.1.1n=h7f8727e_0 61 | packaging=21.3=pyhd3eb1b0_0 62 | partd=1.2.0=pyhd3eb1b0_1 63 | pillow=8.0.0=py37h9a89aac_0 64 | pip=21.2.2=py37h06a4308_0 65 | pyparsing=3.0.4=pyhd3eb1b0_0 66 | python=3.7.11=h12debd9_0 67 | python-dateutil=2.8.2=pyhd3eb1b0_0 68 | pytorch=1.8.0=py3.7_cuda10.2_cudnn7.6.5_0 69 | pywavelets=1.1.1=py37h7b6447c_2 70 | pyyaml=6.0=py37h7f8727e_1 71 | readline=8.1.2=h7f8727e_1 72 | scikit-image=0.15.0=py37hb3f55d8_2 73 | scipy=1.7.3=py37hc147768_0 74 | setuptools=58.0.4=py37h06a4308_0 75 | six=1.16.0=pyhd3eb1b0_1 76 | sqlite=3.38.0=hc218d9a_0 77 | tk=8.6.11=h1ccaba5_0 78 | toolz=0.11.2=pyhd3eb1b0_0 79 | torchaudio=0.8.0=py37 80 | torchvision=0.9.0=py37_cu102 81 | tqdm=4.63.0=pyhd8ed1ab_0 82 | typing_extensions=3.10.0.2=pyh06a4308_0 83 | wheel=0.37.1=pyhd3eb1b0_0 84 | xz=5.2.5=h7b6447c_0 85 | yaml=0.2.5=h7b6447c_0 86 | zlib=1.2.11=h7f8727e_4 87 | zstd=1.4.9=haebb681_0 88 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2025 Apple Inc. All Rights Reserved. 2 | 3 | IMPORTANT: This Apple software is supplied to you by Apple 4 | Inc. ("Apple") in consideration of your agreement to the following 5 | terms, and your use, installation, modification or redistribution of 6 | this Apple software constitutes acceptance of these terms. 
If you do 7 | not agree with these terms, please do not use, install, modify or 8 | redistribute this Apple software. 9 | 10 | In consideration of your agreement to abide by the following terms, and 11 | subject to these terms, Apple grants you a personal, non-exclusive 12 | license, under Apple's copyrights in this original Apple software (the 13 | "Apple Software"), to use, reproduce, modify and redistribute the Apple 14 | Software, with or without modifications, in source and/or binary forms; 15 | provided that if you redistribute the Apple Software in its entirety and 16 | without modifications, you must retain this notice and the following 17 | text and disclaimers in all such redistributions of the Apple Software. 18 | Neither the name, trademarks, service marks or logos of Apple Inc. may 19 | be used to endorse or promote products derived from the Apple Software 20 | without specific prior written permission from Apple. Except as 21 | expressly stated in this notice, no other rights or licenses, express or 22 | implied, are granted by Apple herein, including but not limited to any 23 | patent rights that may be infringed by your derivative works or by other 24 | works in which the Apple Software may be incorporated. 25 | 26 | The Apple Software is provided by Apple on an "AS IS" basis. APPLE 27 | MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION 28 | THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS 29 | FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND 30 | OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS. 31 | 32 | IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL 33 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 35 | INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, 36 | MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED 37 | AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), 38 | STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE 39 | POSSIBILITY OF SUCH DAMAGE. 40 | -------------------------------------------------------------------------------- /MODEL_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 Apple Inc. All Rights Reserved. 2 | 3 | Disclaimer: IMPORTANT: This Apple Machine Learning Research Model is specifically 4 | developed and released by Apple Inc. ("Apple") for the sole purpose of scientific 5 | research of artificial intelligence and machine-learning technology. “Apple Machine 6 | Learning Research Model” means the model, including but not limited to algorithms, 7 | formulas, trained model weights, parameters, configurations, checkpoints, and any 8 | related materials (including documentation). 9 | 10 | This Apple Machine Learning Research Model is provided to You by Apple in 11 | consideration of your agreement to the following terms, and your use, modification, 12 | creation of Model Derivatives, and or redistribution of the Apple Machine Learning 13 | Research Model constitutes acceptance of this Agreement. If You do not agree with 14 | these terms, please do not use, modify, create Model Derivatives of, or distribute this 15 | Apple Machine Learning Research Model or Model Derivatives. 16 | 17 | 1. 
License Scope: In consideration of your agreement to abide by the following 18 | terms, and subject to these terms, Apple hereby grants you a personal, non- 19 | exclusive, worldwide, non-transferable, royalty-free, revocable, and limited 20 | license, to use, copy, modify, distribute, and create Model Derivatives (defined 21 | below) of the Apple Machine Learning Research Model exclusively for Research 22 | Purposes. You agree that any Model Derivatives You may create or that may be 23 | created for You will be limited to Research Purposes as well. “Research 24 | Purposes” means non-commercial scientific research and academic 25 | development activities, such as experimentation, analysis, testing conducted by 26 | You with the sole intent to advance scientific knowledge and research. 27 | “Research Purposes” does not include any commercial exploitation, product 28 | development or use in any commercial product or service. 29 | 30 | 2. Distribution of Apple Machine Learning Research Model and Model Derivatives: 31 | If you choose to redistribute Apple Machine Learning Research Model or its 32 | Model Derivatives, you must provide a copy of this Agreement to such third 33 | party, and ensure that the following attribution notice be provided: “Apple 34 | Machine Learning Research Model is licensed under the Apple Machine 35 | Learning Research Model License Agreement.” Additionally, all Model 36 | Derivatives must clearly be identified as such, including disclosure of 37 | modifications and changes made to the Apple Machine Learning Research 38 | Model. The name, trademarks, service marks or logos of Apple may not be used 39 | to endorse or promote Model Derivatives or the relationship between You and 40 | Apple. “Model Derivatives” means any models or any other artifacts created by 41 | modifications, improvements, adaptations, alterations to the architecture, 42 | algorithm or training processes of the Apple Machine Learning Research Model, 43 | or by any retraining, fine-tuning of the Apple Machine Learning Research 44 | Model. 45 | 46 | 3. No Other License: Except as expressly stated in this notice, no other rights or 47 | licenses, express or implied, are granted by Apple herein, including but not 48 | limited to any patent, trademark, and similar intellectual property rights 49 | worldwide that may be infringed by the Apple Machine Learning Research 50 | Model, the Model Derivatives or by other works in which the Apple Machine 51 | Learning Research Model may be incorporated. 52 | 53 | 4. Compliance with Laws: Your use of Apple Machine Learning Research Model 54 | must be in compliance with all applicable laws and regulations. 55 | 56 | 5. Term and Termination: The term of this Agreement will begin upon your 57 | acceptance of this Agreement or use of the Apple Machine Learning Research 58 | Model and will continue until terminated in accordance with the following terms. 59 | Apple may terminate this Agreement at any time if You are in breach of any term 60 | or condition of this Agreement. Upon termination of this Agreement, You must 61 | cease to use all Apple Machine Learning Research Models and Model 62 | Derivatives and permanently delete any copy thereof. Sections 3, 6 and 7 will 63 | survive termination. 64 | 65 | 6. Disclaimer and Limitation of Liability: This Apple Machine Learning Research 66 | Model and any outputs generated by the Apple Machine Learning Research 67 | Model are provided on an “AS IS” basis. 
APPLE MAKES NO WARRANTIES, 68 | EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED 69 | WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS 70 | FOR A PARTICULAR PURPOSE, REGARDING THE APPLE MACHINE 71 | LEARNING RESEARCH MODEL OR OUTPUTS GENERATED BY THE APPLE 72 | MACHINE LEARNING RESEARCH MODEL. You are solely responsible for 73 | determining the appropriateness of using or redistributing the Apple Machine 74 | Learning Research Model and any outputs of the Apple Machine Learning 75 | Research Model and assume any risks associated with Your use of the Apple 76 | Machine Learning Research Model and any output and results. IN NO EVENT 77 | SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR 78 | CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE, 79 | REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION OF THE APPLE 80 | MACHINE LEARNING RESEARCH MODEL AND ANY OUTPUTS OF THE APPLE 81 | MACHINE LEARNING RESEARCH MODEL, HOWEVER CAUSED AND 82 | WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING 83 | NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN 84 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 85 | 86 | 7. Governing Law: This Agreement will be governed by and construed under the 87 | laws of the State of California without regard to its choice of law principles. The 88 | Convention on Contracts for the International Sale of Goods shall not apply to 89 | the Agreement except that the arbitration clause and any arbitration hereunder 90 | shall be governed by the Federal Arbitration Act, Chapters 1 and 2. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Matrix3D: Large Photogrammetry Model All-in-One 2 | Yuanxun Lu1\*, Jingyang Zhang2\*, Tian Fang2, Jean-Daniel Nahmias2, Yanghai Tsin2, Long Quan3, Xun Cao1, Yao Yao1†, Shiwei Li2 3 | 1Nanjing University, 2Apple, 3HKUST 4 | \*Equal contribution Corresponding author 5 | 6 | ### [Project Page](https://nju-3dv.github.io/projects/matrix3d/) | [Paper](https://arxiv.org/abs/2502.07685) | [Weights](#environment-setup) 7 | 8 | This is the official implementation of Matrix3D, a unified model that performs several photogrammetry subtasks, including pose estimation, depth prediction, and novel view synthesis using the same model. 9 | 10 | This repository includes the model inference pipeline and the modified 3DGS reconstruction pipeline for 3D reconstruction. 11 | 12 |
13 | ![Matrix3D inference pipeline](docs/inference-pipe.png)
15 |

16 | Matrix3D supports various photogrammetry tasks via masked inference. 17 |
18 | 19 | ## Environment Setup 20 | 21 | - This project is successfully tested on Ubuntu 20.04 with PyTorch 2.4 (Python 3.10). We recommend creating a new environment and install necessary dependencies: 22 | 23 | ``` 24 | conda create -y -n matrix3d python=3.10 25 | conda activate matrix3d 26 | # Here we take Pytorch 2.4 with cuda 11.8 as an example 27 | # If you install a different PyTorch version, please select a matched xformers/pytorch3d version 28 | pip install torch==2.4.0 torchvision==0.19.0 xformers==0.0.27.post2 --index-url https://download.pytorch.org/whl/cu118 29 | pip install --extra-index-url https://miropsota.github.io/torch_packages_builder pytorch3d==0.7.7+pt2.4.0cu118 30 | pip install -r requirements.txt 31 | # fixed the requirement conflicts from nerfstudio 32 | pip install timm==1.0.11 33 | ``` 34 | Some dependencies may require CUDA with the same version used by `torch` in your system, and the installation may not work out of the box. Please refer to their official repo for troubleshooting. 35 | 36 | * Download the Pre-trained model: 37 | * Download the checkpoints: [matrix3d_512.pt](https://ml-site.cdn-apple.com/models/matrix3d/matrix3d_512.pt) 38 | * Create a `checkpoints` folder and put the pre-trained model into it. 39 | * (Optional) Download `IS-Net` checkpoint if you would like to use single-view to 3d reconstruction: 40 | * Download the pre-trained model `isnet-general-use.pth` from the [DIS official repo](https://github.com/xuebinqin/DIS) and also put it into the `checkpoints` folder. 41 | 42 | ## Run Demo 43 | 44 | - Matrix3D supports several photogrammetry tasks and their dynamic compositions via masked inference. Here we provide several example scripts on the CO3Dv2 dataset. All results will be saved to the `results` folder by default. 45 | 46 | - **Novel View Synthesis** 47 | 48 | ``` 49 | sh scripts/novel_view_synthesis.sh examples/co3dv2-samples/31_1359_4114 50 | ``` 51 | 52 | This script demonstrates the usage of novel view synthesis from single-view image input. 53 | 54 | For all diffusion sampling tasks, we use indicators `mod_flags` and `view_ids` to control the input states in `L48-L56`. You could try to set a different modality flag or view numbers to achieve different tasks, such as predict novel views from 2 posed RGB images. 55 | 56 | - **Pose Estimation** 57 | 58 | ``` 59 | sh scripts/pose_estimation.sh examples/co3dv2-samples/31_1359_4114 60 | ``` 61 | 62 | This script demonstrates the usage of pose prediction from images. The saved `*.png` and `*.html` file demonstrates a visual comparison between predictions and groundtruth values. 63 | 64 | Replace the data root to an unposed data folder like `examples/unposed-samples/co3dv2/201_21613_43652` would generate the results without comparisons to groundtruth poses. 65 | 66 | It is **strongly recommended** to provide the camera intrinsics saved in the .txt files since the model is trained with known camera intrinsics. If not, the processor would set a default fov=60 and performance may degrade. You could also change the default Fov value by passing `--default_fov`. 67 | 68 | - **Depth Prediction** 69 | 70 | ``` 71 | sh scripts/depth_prediction.sh examples/co3dv2-samples/31_1359_4114 72 | ``` 73 | 74 | This script demonstrates the usage of depth prediction from several posed images. The back-projected groundtruth and prediction point clouds can be found in the folder. 
75 | 76 | - By dynamically combining the above tasks, one could later apply a modified 3DGS pipeline to achieve 3D reconstruction from various inputs, even with unknown camera parameters. In the following, we provide two specific examples: 77 | 78 | - **Single-view to 3D** 79 | 80 | ``` 81 | sh scripts/single_view_to_3d.sh single-view-to-3d examples/single-view/skull.png 82 | ``` 83 | 84 | The 3DGS rendering results are saved in `results/single-view-to-3d/skull/3DGS-render-traj.mp4`. 85 | 86 | In this task, camera Fov is set to 60 by default, while you could also manually set it by creating a `$name.txt` file along with the image. The dataprocessor would automatically load it. For example, you could replace the `skull.png` with `ghost.png`. 87 | 88 | Please check the `examples/single-view` folder for more examples. 89 | 90 | - **Unposed Few-shot to 3D** 91 | 92 | ``` 93 | sh scripts/unposed_fewshot_to_3d_co3dv2.sh unposed-fewshot-to-3d examples/unposed-samples/co3dv2/31_1359_4114 94 | ``` 95 | 96 | This script demonstrates a reconstruction process from unposed images in CO3Dv2 dataset. Note that the camera trajectories of novel views are sampled on fitted splines from predicted poses and designed to work under object-centric scenes. The specific interpolation video is saved as `3DGS-render-traj1.mp4` by default. You could also change to apply reconstruction on arkitscenes data as follows: 97 | 98 | ``` 99 | sh scripts/unposed_fewshot_to_3d_arkitscenes.sh unposed-fewshot-to-3d examples/unposed-samples/arkitscenes/41069043 100 | ``` 101 | 102 | The only difference lies in the splined camera generation while the 3DGS part is exactly same. You may need to tune the parameters of trajectory generation and 3DGS reconstruction for different datasets to achieve higher performance. 103 | 104 | - Based on the examples above, you can flexibly define specifically tailored tasks by combining different inputs. 105 | 106 | - Notes: 107 | 108 | - When trying on the diffusion process, please carefully assign the values of indicators `mods_flags` and `view_ids`. Besides, the model is trained with a maximum view number of 8, so do not set `view_ids` larger than 8 views. 109 | - The example data in `examples/co3dv2-samples` and `examples/unposed-samples` are part of CO3Dv2 and ARKitScenes datasets. The camera extrinsic is saved in FOV values or Blender camera coordinates. In processing, we would convert them into PyTorch3D cameras, and these part codes could be found in `L654-659` from `data/data_preprocessor.py`. Therefore, it is easy for users to change to different camera representations, e.g., you could apply the official Pytorch3D conversion function `pytorch3d.utils.cameras_from_opencv_projection` to convert OpenCV cameras into Pytorch3D cameras. 110 | 111 | 112 | ## License 113 | 114 | This sample code is released under the [LICENSE](LICENSE) terms. 
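A side note on camera formats, expanding on the last bullet in the Notes above: if your own captures come with OpenCV-convention poses and intrinsics, the PyTorch3D conversion function mentioned there can be applied directly. The snippet below is only an illustrative sketch with placeholder values; it is not part of this repository, and tensor shapes follow the PyTorch3D documentation.

```python
import torch
from pytorch3d.utils import cameras_from_opencv_projection

# Placeholder single-camera example (N = 1); replace with your real values.
R = torch.eye(3)[None]                          # (N, 3, 3) OpenCV world-to-camera rotation
tvec = torch.zeros(1, 3)                        # (N, 3) OpenCV translation
K = torch.tensor([[[1000.0,    0.0, 256.0],
                   [   0.0, 1000.0, 256.0],
                   [   0.0,    0.0,   1.0]]])   # (N, 3, 3) pinhole intrinsics
image_size = torch.tensor([[512.0, 512.0]])     # (N, 2) as (height, width)

cameras = cameras_from_opencv_projection(R, tvec, K, image_size)
```

The returned `PerspectiveCameras` object matches the PyTorch3D camera representation that, per the note above, the data preprocessor ultimately works with.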
115 | 116 | ## Citation 117 | ``` 118 | @article{lu2025matrix3d, 119 | title={Matrix3D: Large Photogrammetry Model All-in-One}, 120 | author={Lu, Yuanxun and Zhang, Jingyang and Fang, Tian and Nahmias, Jean-Daniel and Tsin, Yanghai and Quan, Long and Cao, Xun and Yao, Yao and Li, Shiwei}, 121 | journal={Computer Vision and Pattern Recognition (CVPR)}, 122 | year={2025} 123 | } 124 | ``` 125 | -------------------------------------------------------------------------------- /configs/config_stage3.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | val_inference_steps: 50 3 | val_scheduler: DDPM 4 | val_height: 512 5 | val_width: 512 6 | 7 | eval: 8 | val_inference_steps: 50 9 | scheduler: DDPM 10 | save_image: True 11 | 12 | modalities: 13 | rgb: 14 | dimensions: 2 15 | height: 64 16 | width: 64 17 | patch_size: 2 18 | dense: True 19 | gen_channel: 20 | latent: 4 21 | cond_channel: 22 | dino: 768 23 | latent: 4 24 | ray: 25 | dimensions: 2 26 | height: 32 27 | width: 32 28 | patch_size: 1 29 | dense: True 30 | gen_channel: 31 | dir: 3 32 | moment: 3 33 | cond_channel: 34 | dir: 3 35 | moment: 3 36 | depth: 37 | dimensions: 2 38 | height: &depth_size 128 39 | width: 128 40 | patch_size: 4 41 | dense: False 42 | seq_len: &depth_token_num 1024 43 | gen_channel: 44 | disparity: 1 45 | gen_aux_channel: 46 | valid: 1 47 | cond_channel: 48 | disparity: 1 49 | valid: 1 50 | local_caption: 51 | dimensions: 1 52 | cond_channel: 53 | latent: 1024 54 | global_caption: 55 | dimensions: 1 56 | cond_channel: 57 | latent: 1024 58 | 59 | data: 60 | shuffle: True 61 | modalities: ['rgb', 'ray', 'depth', 'local_caption', 'global_caption'] 62 | modalities_probs: [[2, 2, 1], [2, 2, 1], [2, 2, 1], [0, 1, 0], [0, 1, 1]] # each modality contains three probs: gens/conds/not_used. no need to sum to 1, would be normalized automatically 63 | dataset_supported_modalities: ['rgb', 'ray', 'depth', 'local_caption', 'global_caption'] 64 | dataset_type: 'object-centric' # 'object-centric', 'scenes' 65 | shift_scales: # mean/scale 66 | rgb: [0.000, 1.000] 67 | ray: 68 | dirs: [0.000, 1.800] 69 | moms: [-0.100, 2.700] 70 | origins: [-0.145, 1.700] # origin & direction are only used when use_plucker==False 71 | directions: [0.145, 1.715] 72 | depth: [1.100, 2.000] 73 | num_view: [2, 8] 74 | num_batch_per_scene: null 75 | cond_size: 896 76 | gen_size: 512 77 | raymap_size: 32 # this should be consistent to the model rays config! 78 | use_plucker: True # if False, use ray origins and directions instead 79 | use_background: True 80 | use_depth_valid_only: True 81 | background_color: "white" 82 | # relative_pose: "raydiffusion_refcam" 83 | pose_trans_jitter: 0.0 84 | relative_rot: True 85 | relative_trans: 1.0 86 | pre_resize: 512 # always pre-resize images to 512, and predict the KRT of 512-sized images 87 | depth_size: *depth_size # TODO: this is a temp setting, should align with the patch size in model config! 
88 | depth_samples_per_images: *depth_token_num 89 | center_crop_min_scale: 1.0 # in [0.0, 1.0], 0.6 90 | center_crop_max_jitter: 0.0 # in pixel 15 91 | per_sample_aug_enable: False # For vae training only, set this to false for all other paras 92 | per_sample_aug: 93 | depth: 94 | rotate: [-30.0, 30.0] 95 | scale: [0.5, 1.0] 96 | value_scales: [0.5, 2.0] 97 | raydiffusion_official: False 98 | dataset_overwrite: 99 | mvimgnet: 100 | dataset_supported_modalities: ['rgb', 'ray', 'local_caption', 'global_caption'] 101 | use_background: False 102 | co3dv2: 103 | use_background: False 104 | realestate10k: 105 | dataset_supported_modalities: ['rgb', 'ray', 'local_caption', 'global_caption'] 106 | dataset_type: 'scenes' 107 | use_background: False 108 | hypersim: 109 | dataset_type: 'scenes' 110 | use_background: False 111 | arkitscenes: 112 | dataset_supported_modalities: ['rgb', 'depth', 'global_caption', 'local_caption'] 113 | dataset_type: 'scenes' 114 | use_background: False 115 | dtu: 116 | dataset_supported_modalities: ['rgb', 'ray', 'depth'] 117 | use_background: False 118 | mipnerf360: 119 | dataset_supported_modalities: ['rgb', 'ray'] 120 | use_background: False 121 | llff: 122 | dataset_supported_modalities: ['rgb', 'ray'] 123 | dataset_type: 'scenes' 124 | use_background: False 125 | validation_overwrite: 126 | num_batch_per_scene: 1 127 | center_crop_min_scale: 1.0 128 | center_crop_max_jitter: 0.0 129 | pose_trans_jitter: 0.0 130 | modalities: ['rgb', 'ray', 'depth', 'local_caption', 'global_caption'] 131 | modalities_probs: [[2, 2, 0], [2, 2, 0], [2, 2, 0], [0, 1, 0], [0, 1, 0]] # each modality contains three probs: gens/conds/not_used. no need to sum to 1, would be normalized automatically 132 | num_view: [2, 8] 133 | evaluation_overwrite: 134 | shuffle: False 135 | center_crop_min_scale: 1.0 136 | center_crop_max_jitter: 0.0 137 | pose_trans_jitter: 0.0 138 | modalities_probs: [[2, 2, 0], [2, 2, 0], [2, 2, 0], [0, 1, 0], [0, 1, 0]] 139 | inference_overwrite: 140 | shuffle: False 141 | num_batch_per_scene: 1 142 | num_view: 8 143 | 144 | model: 145 | model_type: dit 146 | hidden_size: 1024 147 | depth: 40 148 | encoder: True 149 | encoder_depth: 20 150 | decoder_url: Tencent-Hunyuan/HunyuanDiT-Diffusers 151 | scheduler_url: Tencent-Hunyuan/HunyuanDiT-Diffusers 152 | peripheral_url: Tencent-Hunyuan/HunyuanDiT-Diffusers 153 | qk_norm: rmsnorm # "layernorm" or None 154 | mod_norm: True # modality-specific normalization 155 | pe_config: # choose from 'sinusoid', 'sinusoid_all' and 'rope' 156 | pos: 157 | type: rope 158 | base_size: 64 159 | view: 160 | type: sinusoid_all 161 | base: 70007 162 | max: 8 163 | zero_init: False 164 | mod: 165 | type: sinusoid_all 166 | base: 30003 167 | max: 5 # Local (view-dependent): RGB + Ray + Disparity + Text Global (view-independent): global text description 168 | zero_init: False -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | from .data_preprocessor import Preprocessor -------------------------------------------------------------------------------- /docs/inference-pipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/docs/inference-pipe.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/047.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/047.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/047.txt: -------------------------------------------------------------------------------- 1 | 1202.11938 2 | 1202.11938 3 | 340.00000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/047_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/047_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/047_ext.txt: -------------------------------------------------------------------------------- 1 | 0.03058 2 | -0.93401 3 | 0.35594 4 | 0.72073 5 | -0.13265 6 | 0.34916 7 | 0.92763 8 | 1.12437 9 | -0.99069 10 | -0.07559 11 | -0.11322 12 | -11.29426 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/051.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/051.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/051.txt: -------------------------------------------------------------------------------- 1 | 1206.45020 2 | 1206.45020 3 | 340.00000 4 | 605.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/051_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/051_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/051_ext.txt: -------------------------------------------------------------------------------- 1 | -0.16954 2 | -0.97649 3 | 0.13312 4 | 0.47076 5 | -0.12253 6 | 0.15491 7 | 0.98030 8 | 1.05989 9 | -0.97788 10 | 0.14989 11 | -0.14591 12 | -12.12915 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/060.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/060.jpg 
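A note on the per-view camera files in `examples/co3dv2-samples/` (the pattern above repeats for every sample below): each `NNN.txt` holds four floats and each `NNN_ext.txt` holds sixteen floats ending in `0 0 0 1`, i.e. a flattened 4x4 homogeneous matrix. The sketch below shows one plausible way to read them; the ordering of the intrinsics as `fx, fy, cx, cy` and the row-major layout of the extrinsic matrix are assumptions inferred from the values, and the repository's own loader (`data/data_preprocessor.py`) remains the source of truth for the exact camera convention (e.g. world-to-camera vs. camera-to-world).

```python
# Sketch only: parse one example view's camera files.
# Assumptions (not taken from the repo's loader): intrinsics are fx, fy, cx, cy
# in pixels; the extrinsic file is a row-major 4x4 homogeneous matrix.
import numpy as np

def load_intrinsics(path):
    fx, fy, cx, cy = np.loadtxt(path)
    return np.array([[fx, 0.0, cx],
                     [0.0, fy, cy],
                     [0.0, 0.0, 1.0]])

def load_extrinsics(path):
    return np.loadtxt(path).reshape(4, 4)   # last row is 0 0 0 1

K = load_intrinsics("examples/co3dv2-samples/195_20989_41543/047.txt")
M = load_extrinsics("examples/co3dv2-samples/195_20989_41543/047_ext.txt")
R, t = M[:3, :3], M[:3, 3]                  # rotation block and translation
```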
-------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/060.txt: -------------------------------------------------------------------------------- 1 | 1196.35779 2 | 1196.35779 3 | 340.00000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/060_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/060_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/060_ext.txt: -------------------------------------------------------------------------------- 1 | -0.55893 2 | -0.75033 3 | -0.35298 4 | 0.22116 5 | -0.23823 6 | -0.26243 7 | 0.93508 8 | 0.66358 9 | -0.79425 10 | 0.60674 11 | -0.03207 12 | -13.40600 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/084.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/084.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/084.txt: -------------------------------------------------------------------------------- 1 | 1206.86255 2 | 1206.86255 3 | 339.00000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/084_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/084_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/084_ext.txt: -------------------------------------------------------------------------------- 1 | -0.58065 2 | 0.63914 3 | -0.50432 4 | 0.05580 5 | -0.81253 6 | -0.49396 7 | 0.30951 8 | 0.52403 9 | -0.05130 10 | 0.58949 11 | 0.80614 12 | -11.59393 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/122.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/122.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/122.txt: -------------------------------------------------------------------------------- 1 | 1214.67126 2 | 1214.67126 3 | 340.00000 4 | 605.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/122_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/122_depth.png 
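The `*_depth.png` files above store per-view depth for the CO3D samples, and configs/config_stage3.yaml models this modality as a `disparity` channel plus a `valid` mask. The mapping from stored pixel values to metric depth (scale, units, and how invalid pixels are flagged) is not spelled out in these listings, so the loader below is only illustrative: the `scale` constant and the zero-means-invalid rule are placeholder assumptions, and `data/data_preprocessor.py` should be treated as authoritative.

```python
# Illustrative only: turn a depth PNG into the disparity + validity pair used
# by the "depth" modality in configs/config_stage3.yaml.
# The raw-value-to-metric conversion (scale=1000.0) and the zero-as-invalid
# rule are placeholder assumptions, not the repository's actual convention.
import numpy as np
from PIL import Image

def load_depth_as_disparity(path, scale=1000.0):
    raw = np.asarray(Image.open(path)).astype(np.float32)
    depth = raw / scale                     # assumed metric conversion
    valid = depth > 0.0                     # assumed invalid-pixel rule
    disparity = np.zeros_like(depth)
    disparity[valid] = 1.0 / depth[valid]
    return disparity, valid
```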
-------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/122_ext.txt: -------------------------------------------------------------------------------- 1 | 0.51593 2 | 0.49504 3 | 0.69911 4 | 0.39027 5 | -0.78450 6 | 0.60082 7 | 0.15351 8 | 0.70223 9 | -0.34405 10 | -0.62765 11 | 0.69834 12 | -11.89899 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/126.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/126.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/126.txt: -------------------------------------------------------------------------------- 1 | 1238.41101 2 | 1238.41101 3 | 340.00000 4 | 605.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/126_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/126_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/126_ext.txt: -------------------------------------------------------------------------------- 1 | 0.55931 2 | 0.14791 3 | 0.81565 4 | 0.43170 5 | -0.68172 6 | 0.64188 7 | 0.35107 8 | 1.23148 9 | -0.47162 10 | -0.75241 11 | 0.45985 12 | -12.58318 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/161.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/161.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/161.txt: -------------------------------------------------------------------------------- 1 | 1202.45288 2 | 1202.45288 3 | 339.50000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/161_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/161_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/161_ext.txt: -------------------------------------------------------------------------------- 1 | -0.63609 2 | -0.63176 3 | -0.44302 4 | 0.61886 5 | -0.28926 6 | -0.33704 7 | 0.89595 8 | 0.64184 9 | -0.71534 10 | 0.69806 11 | 0.03165 12 | -13.78141 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/164.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/164.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/164.txt: -------------------------------------------------------------------------------- 1 | 1205.65637 2 | 1205.65637 3 | 339.50000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/164_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/195_20989_41543/164_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/195_20989_41543/164_ext.txt: -------------------------------------------------------------------------------- 1 | -0.68144 2 | -0.54741 3 | -0.48578 4 | 0.28494 5 | -0.32165 6 | -0.37220 7 | 0.87064 8 | 0.67835 9 | -0.65740 10 | 0.74954 11 | 0.07756 12 | -13.76035 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/000.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/000.txt: -------------------------------------------------------------------------------- 1 | 2601.26636 2 | 2601.26636 3 | 945.50000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/000_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/000_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/000_ext.txt: -------------------------------------------------------------------------------- 1 | -0.00096 2 | 1.00000 3 | 0.00071 4 | -0.33653 5 | -1.00000 6 | -0.00096 7 | -0.00079 8 | 0.69979 9 | -0.00079 10 | -0.00071 11 | 1.00000 12 | -15.21722 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/038.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/038.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/038.txt: -------------------------------------------------------------------------------- 1 | 2602.71191 2 | 2602.71191 3 | 944.50000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/038_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/038_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/038_ext.txt: -------------------------------------------------------------------------------- 1 | 0.52150 2 | -0.42915 3 | 0.73748 4 | -0.02598 5 | -0.47699 6 | 0.57002 7 | 0.66900 8 | 1.60313 9 | -0.70747 10 | -0.70065 11 | 0.09257 12 | -16.45008 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/051.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/051.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/051.txt: -------------------------------------------------------------------------------- 1 | 2619.51001 2 | 2619.51001 3 | 945.00000 4 | 531.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/051_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/051_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/051_ext.txt: -------------------------------------------------------------------------------- 1 | 0.08393 2 | -0.98197 3 | 0.16938 4 | -1.09742 5 | -0.27862 6 | 0.14008 7 | 0.95013 8 | 1.32587 9 | -0.95673 10 | -0.12694 11 | -0.26184 12 | -16.52538 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/074.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/074.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/074.txt: -------------------------------------------------------------------------------- 1 | 2551.71631 2 | 2551.71631 3 | 945.50000 4 | 531.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/074_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/074_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/074_ext.txt: -------------------------------------------------------------------------------- 1 | -0.53447 2 | -0.30849 3 | -0.78688 4 | 0.88837 5 | -0.53093 6 | -0.60184 7 | 0.59657 8 | 1.00150 9 | -0.65761 10 | 0.73663 11 | 0.15787 12 | -14.44483 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/121.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/121.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/121.txt: -------------------------------------------------------------------------------- 1 | 2577.85205 2 | 2577.85205 3 | 945.50000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/121_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/121_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/121_ext.txt: -------------------------------------------------------------------------------- 1 | 0.45418 2 | 0.63051 3 | 0.62943 4 | 0.15200 5 | -0.85992 6 | 0.49495 7 | 0.12471 8 | 1.70349 9 | -0.23291 10 | -0.59790 11 | 0.76699 12 | -15.90632 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/122.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/122.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/122.txt: -------------------------------------------------------------------------------- 1 | 2582.54541 2 | 2582.54541 3 | 945.00000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/122_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/122_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/122_ext.txt: -------------------------------------------------------------------------------- 1 | 0.52242 2 | 0.40567 3 | 0.75001 4 | 0.08422 5 | -0.78922 6 | 0.56304 7 | 0.24519 8 | 1.42875 9 | -0.32282 10 | -0.72001 11 | 0.61431 12 | -16.41819 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/155.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/155.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/155.txt: -------------------------------------------------------------------------------- 1 | 2578.92993 2 | 2578.92993 3 | 945.00000 4 | 531.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/155_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/155_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/155_ext.txt: -------------------------------------------------------------------------------- 1 | -0.04384 2 | -0.99894 3 | 0.01393 4 | -0.39690 5 | -0.28022 6 | 0.02567 7 | 0.95959 8 | 1.39255 9 | -0.95894 10 | 0.03817 11 | -0.28105 12 | -16.14845 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/176.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/176.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/176.txt: -------------------------------------------------------------------------------- 1 | 2588.79712 2 | 2588.79712 3 | 945.00000 4 | 530.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/176_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/198_21285_41285/176_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/198_21285_41285/176_ext.txt: -------------------------------------------------------------------------------- 1 | -0.58317 2 | -0.17790 3 | -0.79263 4 | 0.00340 5 | -0.56393 6 | -0.61365 7 | 0.55264 8 | 0.67918 9 | -0.58472 10 | 0.76927 11 | 0.25754 12 | -14.50829 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/000.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/000.txt: -------------------------------------------------------------------------------- 1 | 811.81531 2 | 811.81531 3 | 237.50000 4 | 418.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/000_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/000_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/000_ext.txt: -------------------------------------------------------------------------------- 1 | 0.04156 2 | 0.99838 3 | 0.03880 4 | -0.47825 5 | -0.99905 6 | 0.04204 7 | -0.01169 8 | 1.66573 9 | -0.01330 10 | -0.03828 11 | 0.99918 12 | -17.26369 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/027.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/027.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/027.txt: -------------------------------------------------------------------------------- 1 | 813.34485 2 | 813.34485 3 | 237.50000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/027_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/027_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/027_ext.txt: -------------------------------------------------------------------------------- 1 | -0.76536 2 | 0.21894 3 | -0.60522 4 | -0.97550 5 | -0.52489 6 | -0.75651 7 | 0.39010 8 | 0.31043 9 | -0.37245 10 | 0.61624 11 | 0.69392 12 | -16.39164 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/038.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/038.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/038.txt: -------------------------------------------------------------------------------- 1 | 812.97998 2 | 812.97998 3 | 237.00000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/038_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/038_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/038_ext.txt: -------------------------------------------------------------------------------- 1 | -0.67144 2 | -0.47797 3 | -0.56632 4 | -1.71187 5 | -0.12918 6 | -0.67701 7 | 0.72455 8 | 0.03485 9 | -0.72972 10 | 0.55964 11 | 0.39282 12 | -16.93951 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/051.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/051.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/051.txt: -------------------------------------------------------------------------------- 1 | 818.33954 2 | 818.33954 3 | 237.50000 4 | 419.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/051_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/051_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/051_ext.txt: -------------------------------------------------------------------------------- 1 | -0.06066 2 | -0.99810 3 | -0.01033 4 | -1.12151 5 | 0.17837 6 | -0.02102 7 | 0.98374 8 | 1.85056 9 | -0.98209 10 | 0.05783 11 | 0.17931 12 | -18.70874 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/060.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/060.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/060.txt: -------------------------------------------------------------------------------- 1 | 821.59253 2 | 821.59253 3 | 237.50000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/060_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/060_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/060_ext.txt: -------------------------------------------------------------------------------- 1 | 0.43974 2 | -0.78924 3 | 0.42863 4 | -0.50657 5 | 0.09524 6 | 0.51554 7 | 0.85156 8 | 1.77140 9 | -0.89306 10 | -0.33364 11 | 0.30187 12 | -18.07548 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/067.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/067.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/067.txt: -------------------------------------------------------------------------------- 1 | 814.11646 2 | 814.11646 3 | 237.50000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/067_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/067_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/067_ext.txt: -------------------------------------------------------------------------------- 1 | 0.65844 2 | -0.47740 3 | 0.58184 4 | -0.69980 5 | -0.07103 6 | 0.73021 7 | 0.67952 8 | 1.48863 9 | -0.74927 10 | -0.48875 11 | 0.44689 12 | -16.04611 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/099.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/099.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/099.txt: -------------------------------------------------------------------------------- 1 | 817.50549 2 | 817.50549 3 | 237.50000 4 | 419.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/099_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/099_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/099_ext.txt: -------------------------------------------------------------------------------- 1 | -0.33144 2 | 0.91811 3 | -0.21731 4 | -0.77130 5 | -0.94027 6 | -0.34041 7 | -0.00411 8 | -0.49282 9 | -0.07775 10 | 0.20297 11 | 0.97609 12 | -14.93493 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/143.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/143.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/143.txt: -------------------------------------------------------------------------------- 1 | 813.87244 2 | 813.87244 3 | 236.50000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/143_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/201_21613_43652/143_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/201_21613_43652/143_ext.txt: -------------------------------------------------------------------------------- 1 | -0.41307 2 | -0.83543 3 | -0.36253 4 | -0.94164 5 | 0.04783 6 | -0.41743 7 | 0.90745 8 | -0.04272 9 | -0.90944 10 | 0.35750 11 | 0.21239 12 | -18.61402 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/010.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/010.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/010.txt: -------------------------------------------------------------------------------- 1 | 1894.62366 2 | 1894.62366 3 | 899.50000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/010_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/010_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/010_ext.txt: -------------------------------------------------------------------------------- 1 | -0.48597 2 | 0.78452 3 | -0.38519 4 | -0.33164 5 | -0.87386 6 | -0.44330 7 | 0.19961 8 | 1.50121 9 | -0.01416 10 | 0.43361 11 | 0.90099 12 | -14.69950 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/017.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/017.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/017.txt: -------------------------------------------------------------------------------- 1 | 1877.22815 2 | 1877.22815 3 | 899.00000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/017_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/017_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/017_ext.txt: -------------------------------------------------------------------------------- 1 | -0.67710 2 | 0.43300 3 | -0.59501 4 | -0.31781 5 | -0.69160 6 | -0.65071 7 | 0.31349 8 | 1.46675 9 | -0.25144 10 | 0.62377 11 | 0.74006 12 | -13.76519 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/041.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/041.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/041.txt: -------------------------------------------------------------------------------- 1 | 1841.41174 2 | 1841.41174 3 | 900.00000 4 | 506.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/041_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/041_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/041_ext.txt: -------------------------------------------------------------------------------- 1 | -0.09783 2 | -0.96769 3 | -0.23238 4 | -0.89872 5 | 0.15031 6 | -0.24519 7 | 0.95775 8 | 1.10148 9 | -0.98379 10 | 0.05877 11 | 0.16945 12 | -12.55235 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/072.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/072.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/072.txt: -------------------------------------------------------------------------------- 1 | 1810.31189 2 | 1810.31189 3 | 899.50000 4 | 506.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/072_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/072_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/072_ext.txt: -------------------------------------------------------------------------------- 1 | 0.71123 2 | 0.32405 3 | 0.62381 4 | -1.13456 5 | -0.64928 6 | 0.64295 7 | 0.40628 8 | 1.63326 9 | -0.26942 10 | -0.69398 11 | 0.66768 12 | -14.39440 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/099.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/099.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/099.txt: -------------------------------------------------------------------------------- 1 | 1877.19104 2 | 1877.19104 3 | 899.50000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/099_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/099_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/099_ext.txt: -------------------------------------------------------------------------------- 1 | -0.27407 2 | 0.94088 3 | -0.19909 4 | -0.06255 5 | -0.95632 6 | -0.24475 7 | 0.15986 8 | 1.67176 9 | 0.10169 10 | 0.23420 11 | 0.96686 12 | -15.96998 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/107.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/107.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/107.txt: -------------------------------------------------------------------------------- 1 | 1880.61340 2 | 1880.61340 3 | 899.50000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/107_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/107_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/107_ext.txt: -------------------------------------------------------------------------------- 1 | -0.60230 2 | 0.62224 3 | -0.50005 4 | -0.95324 5 | -0.78889 6 | -0.55970 7 | 0.25374 8 | 1.60947 9 | -0.12199 10 | 0.54731 11 | 0.82799 12 | -14.19761 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/118.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/118.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/118.txt: -------------------------------------------------------------------------------- 1 | 1869.02002 2 | 1869.02002 3 | 899.50000 4 | 506.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/118_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/118_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/118_ext.txt: -------------------------------------------------------------------------------- 1 | -0.70884 2 | -0.21607 3 | -0.67147 4 | -0.24353 5 | -0.34049 6 | -0.72887 7 | 0.59398 8 | 1.05765 9 | -0.61775 10 | 0.64966 11 | 0.44308 12 | -12.96866 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/130.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/130.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/130.txt: -------------------------------------------------------------------------------- 1 | 1849.60815 2 | 1849.60815 3 | 899.00000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/130_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/31_1359_4114/130_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/31_1359_4114/130_ext.txt: -------------------------------------------------------------------------------- 1 | -0.24049 2 | -0.90529 3 | -0.35018 4 | -0.82363 5 | 0.09862 6 | -0.38168 7 | 0.91902 8 | 1.25895 9 | -0.96563 10 | 0.18648 11 | 0.18107 12 | -12.66461 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/000.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/000.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/000.txt: -------------------------------------------------------------------------------- 1 | 556.14813 2 | 556.14813 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/000_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/000_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/000_ext.txt: -------------------------------------------------------------------------------- 1 | -0.88550 2 | 0.35273 3 | -0.30244 4 | 0.10619 5 | -0.42134 6 | -0.88396 7 | 0.20269 8 | 0.85745 9 | -0.19585 10 | 0.30691 11 | 0.93137 12 | -6.91243 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/028.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/028.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/028.txt: -------------------------------------------------------------------------------- 1 | 533.91620 2 | 533.91620 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/028_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/028_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/028_ext.txt: -------------------------------------------------------------------------------- 1 | -0.19513 2 | 0.97940 3 | -0.05199 4 | -0.13463 5 | -0.98037 6 | -0.19630 7 | -0.01825 8 | 0.63380 9 | -0.02808 10 | 0.04741 11 | 0.99848 12 | -6.94058 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/044.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/044.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/044.txt: -------------------------------------------------------------------------------- 1 | 549.66217 2 | 549.66217 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/044_depth.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/044_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/044_ext.txt: -------------------------------------------------------------------------------- 1 | 0.34252 2 | 0.92934 3 | 0.13789 4 | -0.14764 5 | -0.93949 6 | 0.33970 7 | 0.04425 8 | 0.57568 9 | -0.00572 10 | -0.14470 11 | 0.98946 12 | -7.51767 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/062.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/062.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/062.txt: -------------------------------------------------------------------------------- 1 | 544.20050 2 | 544.20050 3 | 175.00000 4 | 319.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/062_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/062_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/062_ext.txt: -------------------------------------------------------------------------------- 1 | 0.86101 2 | 0.36287 3 | 0.35635 4 | -0.27652 5 | -0.44818 6 | 0.87255 7 | 0.19439 8 | 1.03958 9 | -0.24039 10 | -0.32708 11 | 0.91391 12 | -7.43995 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/068.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/068.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/068.txt: -------------------------------------------------------------------------------- 1 | 542.48267 2 | 542.48267 3 | 175.00000 4 | 319.00000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/068_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/068_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/068_ext.txt: -------------------------------------------------------------------------------- 1 | 0.89591 2 | 0.11736 3 | 0.42844 4 | -0.26967 5 | -0.25885 6 | 0.92173 7 | 0.28880 8 | 0.92617 9 | -0.36101 10 | -0.36964 11 | 0.85617 12 | -7.90157 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | 
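A quick, convention-agnostic consistency check that is handy when adapting the parsing sketch above to new data: the upper-left 3x3 block of every `*_ext.txt` matrix should be numerically close to a rotation matrix (orthonormal, determinant 1), assuming the row-major 4x4 layout suggested by the fixed `0 0 0 1` bottom row.

```python
# Sanity check on an extrinsics file (assumes row-major 4x4 layout).
import numpy as np

M = np.loadtxt("examples/co3dv2-samples/422_58670_113666/068_ext.txt").reshape(4, 4)
R = M[:3, :3]
assert np.allclose(R @ R.T, np.eye(3), atol=1e-3)   # orthonormal rows/columns
assert abs(np.linalg.det(R) - 1.0) < 1e-3            # proper rotation, det ~ 1
```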
-------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/074.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/074.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/074.txt: -------------------------------------------------------------------------------- 1 | 558.13892 2 | 558.13892 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/074_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/074_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/074_ext.txt: -------------------------------------------------------------------------------- 1 | 0.88700 2 | -0.14835 3 | 0.43729 4 | -0.21878 5 | -0.04606 6 | 0.91384 7 | 0.40345 8 | 0.57693 9 | -0.45947 10 | -0.37800 11 | 0.80374 12 | -7.53494 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/098.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/098.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/098.txt: -------------------------------------------------------------------------------- 1 | 560.99487 2 | 560.99487 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/098_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/098_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/098_ext.txt: -------------------------------------------------------------------------------- 1 | -0.07326 2 | -0.99731 3 | 0.00211 4 | -0.38476 5 | 0.70560 6 | -0.05034 7 | 0.70682 8 | 0.43857 9 | -0.70482 10 | 0.05327 11 | 0.70739 12 | -7.58088 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/101.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/101.jpg -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/101.txt: -------------------------------------------------------------------------------- 1 | 553.15106 2 | 553.15106 3 | 175.00000 4 | 319.00000 5 | -------------------------------------------------------------------------------- 
/examples/co3dv2-samples/422_58670_113666/101_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/co3dv2-samples/422_58670_113666/101_depth.png -------------------------------------------------------------------------------- /examples/co3dv2-samples/422_58670_113666/101_ext.txt: -------------------------------------------------------------------------------- 1 | -0.16198 2 | -0.98575 3 | -0.04545 4 | -0.64704 5 | 0.69978 6 | -0.14722 7 | 0.69903 8 | 0.52612 9 | -0.69576 10 | 0.08143 11 | 0.71365 12 | -7.49813 13 | 0.00000 14 | 0.00000 15 | 0.00000 16 | 1.00000 17 | -------------------------------------------------------------------------------- /examples/single-view/armor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/single-view/armor.png -------------------------------------------------------------------------------- /examples/single-view/armor.txt: -------------------------------------------------------------------------------- 1 | 55 2 | -------------------------------------------------------------------------------- /examples/single-view/ghost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/single-view/ghost.png -------------------------------------------------------------------------------- /examples/single-view/ghost.txt: -------------------------------------------------------------------------------- 1 | 60 2 | -------------------------------------------------------------------------------- /examples/single-view/jacket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/single-view/jacket.png -------------------------------------------------------------------------------- /examples/single-view/pile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/single-view/pile.png -------------------------------------------------------------------------------- /examples/single-view/pile.txt: -------------------------------------------------------------------------------- 1 | 65 2 | -------------------------------------------------------------------------------- /examples/single-view/skull.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/single-view/skull.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/061.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41069043/061.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/061.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 
| 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/072.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41069043/072.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/072.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 | 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/081.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41069043/081.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41069043/081.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 | 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/052.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41125709/052.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/052.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 | 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/053.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41125709/053.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/053.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 | 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/054.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/arkitscenes/41125709/054.png -------------------------------------------------------------------------------- /examples/unposed-samples/arkitscenes/41125709/054.txt: -------------------------------------------------------------------------------- 1 | 1579.02808 2 | 1579.02808 3 | 717.76849 4 | 957.03693 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/051.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/195_20989_41543/051.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/051.txt: -------------------------------------------------------------------------------- 1 | 1206.45020 2 | 1206.45020 3 | 340.00000 4 | 605.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/084.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/195_20989_41543/084.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/084.txt: -------------------------------------------------------------------------------- 1 | 1206.86255 2 | 1206.86255 3 | 339.00000 4 | 604.50000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/126.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/195_20989_41543/126.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/195_20989_41543/126.txt: -------------------------------------------------------------------------------- 1 | 1238.41101 2 | 1238.41101 3 | 340.00000 4 | 605.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/198_21285_41285/000.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/000.txt: -------------------------------------------------------------------------------- 1 | 2601.26636 2 | 2601.26636 3 | 945.50000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/038.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/198_21285_41285/038.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/038.txt: -------------------------------------------------------------------------------- 1 | 2602.71191 2 | 2602.71191 3 | 944.50000 4 | 531.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/074.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/198_21285_41285/074.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/198_21285_41285/074.txt: 
-------------------------------------------------------------------------------- 1 | 2551.71631 2 | 2551.71631 3 | 945.50000 4 | 531.50000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/201_21613_43652/000.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/000.txt: -------------------------------------------------------------------------------- 1 | 811.81531 2 | 811.81531 3 | 237.50000 4 | 418.50000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/038.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/201_21613_43652/038.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/038.txt: -------------------------------------------------------------------------------- 1 | 812.97998 2 | 812.97998 3 | 237.00000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/067.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/201_21613_43652/067.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/201_21613_43652/067.txt: -------------------------------------------------------------------------------- 1 | 814.11646 2 | 814.11646 3 | 237.50000 4 | 419.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/010.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/31_1359_4114/010.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/010.txt: -------------------------------------------------------------------------------- 1 | 1894.62366 2 | 1894.62366 3 | 899.50000 4 | 505.50000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/041.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/31_1359_4114/041.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/041.txt: -------------------------------------------------------------------------------- 1 | 1841.41174 2 | 1841.41174 3 | 900.00000 4 | 506.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/072.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/31_1359_4114/072.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/31_1359_4114/072.txt: -------------------------------------------------------------------------------- 1 | 1810.31189 2 | 1810.31189 3 | 899.50000 4 | 506.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/422_58670_113666/000.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/000.txt: -------------------------------------------------------------------------------- 1 | 556.14813 2 | 556.14813 3 | 175.50000 4 | 318.50000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/062.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/422_58670_113666/062.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/062.txt: -------------------------------------------------------------------------------- 1 | 544.20050 2 | 544.20050 3 | 175.00000 4 | 319.00000 5 | -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/101.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-matrix3d/26faf1aac8994ce5e0ac3fc3ed0fd29acde6c3ce/examples/unposed-samples/co3dv2/422_58670_113666/101.jpg -------------------------------------------------------------------------------- /examples/unposed-samples/co3dv2/422_58670_113666/101.txt: -------------------------------------------------------------------------------- 1 | 553.15106 2 | 553.15106 3 | 175.00000 4 | 319.00000 5 | -------------------------------------------------------------------------------- /model/dinov2.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | from typing import Optional, Tuple, Union, Dict, List 6 | import torch 7 | from torch import nn 8 | from torch.nn import functional as F 9 | from transformers.modeling_outputs import BaseModelOutput 10 | from transformers.models.dinov2.configuration_dinov2 import Dinov2Config 11 | from transformers.models.dinov2.modeling_dinov2 import ( 12 | BaseModelOutput, 13 | BaseModelOutputWithPooling, 14 | Dinov2SelfAttention, 15 | Dinov2Layer, 16 | Dinov2PreTrainedModel, 17 | Dinov2Embeddings, 18 | Dinov2PatchEmbeddings, 19 | ) 20 | 21 | from .dinov2_adaln.adaln import AdaLayerNorm 22 | 23 | class Dinov2SelfAttentionSDP(Dinov2SelfAttention): 24 | def __init__(self, config: Dinov2Config) -> None: 25 | super().__init__(config) 26 | self.dropout_prob = config.attention_probs_dropout_prob 27 | assert self.dropout_prob == 0.0 28 | 29 | def forward( 30 | self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False 31 | ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: 32 | assert head_mask is None 33 | assert not output_attentions 34 | 35 | mixed_query_layer = self.query(hidden_states) 36 | 37 | key_layer = self.transpose_for_scores(self.key(hidden_states)) 38 | value_layer = self.transpose_for_scores(self.value(hidden_states)) 39 | query_layer = self.transpose_for_scores(mixed_query_layer) 40 | 41 | context_layer = F.scaled_dot_product_attention(query_layer, key_layer, value_layer, dropout_p=self.dropout_prob) 42 | 43 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() 44 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) 45 | context_layer = context_layer.view(new_context_layer_shape) 46 | 47 | outputs = (context_layer,) 48 | 49 | return outputs 50 | 51 | class AdaDinov2Layer(Dinov2Layer): 52 | def __init__(self, config: Dinov2Config) -> None: 53 | super().__init__(config) 54 | self.norm1 = AdaLayerNorm(config.hidden_size, eps=config.layer_norm_eps, mod_act=config.hidden_act) 55 | self.norm2 = AdaLayerNorm(config.hidden_size, eps=config.layer_norm_eps, mod_act=config.hidden_act) 56 | self.attention.attention = Dinov2SelfAttentionSDP(config) 57 | 58 | def forward( 59 | self, 60 | hidden_states: torch.Tensor, 61 | head_mask: Optional[torch.Tensor] = None, 62 | modulation: Optional[torch.Tensor] = None, 63 | output_attentions: bool = False, 64 | ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: 65 | self_attention_outputs = self.attention( 66 | self.norm1(hidden_states, modulation), # in Dinov2, layernorm is applied before self-attention 67 | head_mask, 68 | output_attentions=output_attentions, 69 | ) 70 | attention_output = self_attention_outputs[0] 71 | 72 | attention_output = self.layer_scale1(attention_output) 73 | outputs = self_attention_outputs[1:] # add self attentions if we output attention weights 74 | 75 | # first residual connection 76 | hidden_states = attention_output + hidden_states 77 | 78 | # in Dinov2, layernorm is also applied after self-attention 79 | layer_output = self.norm2(hidden_states, modulation) 80 | layer_output = self.mlp(layer_output) 81 | layer_output = self.layer_scale2(layer_output) 82 | 83 | # second residual connection 84 | layer_output = layer_output + hidden_states 85 | 86 | outputs = (layer_output,) + outputs 87 | 88 | return outputs 89 | 90 | class AdaDinov2Encoder(nn.Module): 91 | def __init__(self, config: Dinov2Config) -> None: 92 | super().__init__() 93 | self.config = config 94 | self.layer = nn.ModuleList([AdaDinov2Layer(config) for _ in 
range(config.num_hidden_layers)]) 95 | self.gradient_checkpointing = False 96 | 97 | def forward( 98 | self, 99 | hidden_states: torch.Tensor, 100 | head_mask: Optional[torch.Tensor] = None, 101 | modulation: Optional[torch.Tensor] = None, 102 | output_attentions: bool = False, 103 | output_hidden_states: bool = False, 104 | return_dict: bool = True, 105 | ) -> Union[tuple, BaseModelOutput]: 106 | all_hidden_states = () if output_hidden_states else None 107 | all_self_attentions = () if output_attentions else None 108 | 109 | for i, layer_module in enumerate(self.layer): 110 | if output_hidden_states: 111 | all_hidden_states = all_hidden_states + (hidden_states,) 112 | 113 | layer_head_mask = head_mask[i] if head_mask is not None else None 114 | layer_inputs = (hidden_states, layer_head_mask, modulation, output_attentions) 115 | 116 | if self.gradient_checkpointing and self.training: 117 | layer_outputs = self._gradient_checkpointing_func( 118 | layer_module.__call__, 119 | *layer_inputs 120 | ) 121 | else: 122 | layer_outputs = layer_module(*layer_inputs) 123 | 124 | hidden_states = layer_outputs[0] 125 | 126 | if output_attentions: 127 | all_self_attentions = all_self_attentions + (layer_outputs[1],) 128 | 129 | if output_hidden_states: 130 | all_hidden_states = all_hidden_states + (hidden_states,) 131 | 132 | if not return_dict: 133 | return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) 134 | return BaseModelOutput( 135 | last_hidden_state=hidden_states, 136 | hidden_states=all_hidden_states, 137 | attentions=all_self_attentions, 138 | ) 139 | 140 | class AdaDinov2PreTrainedModel(Dinov2PreTrainedModel): 141 | def _init_weights(self, module: nn.Linear | nn.Conv2d | nn.LayerNorm | AdaLayerNorm) -> None: 142 | super()._init_weights(module) 143 | if isinstance(module, AdaLayerNorm): 144 | module.mod_init() 145 | 146 | class AdaDinov2Model(AdaDinov2PreTrainedModel): 147 | def __init__(self, config: Dinov2Config): 148 | super().__init__(config) 149 | self.config = config 150 | 151 | self.embeddings = Dinov2Embeddings(config) 152 | self.encoder = AdaDinov2Encoder(config) 153 | 154 | self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) # TODO whether change this to ada 155 | 156 | # Initialize weights and apply final processing 157 | self.post_init() 158 | 159 | def get_input_embeddings(self) -> Dinov2PatchEmbeddings: 160 | return self.embeddings.patch_embeddings 161 | 162 | def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: 163 | """ 164 | Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base 165 | class PreTrainedModel 166 | """ 167 | for layer, heads in heads_to_prune.items(): 168 | self.encoder.layer[layer].attention.prune_heads(heads) 169 | 170 | def set_patch_size(self, new_size): 171 | # NOTE call immediately after from_pretrained 172 | # NOTE error is large (~0.5), not used for now, input size is 224 or 448, patch size remain 14 173 | num_patch = self.config.image_size // self.config.patch_size 174 | self.config.patch_size = new_size 175 | self.config.image_size = new_size * num_patch 176 | new_projection = nn.Conv2d( 177 | self.config.num_channels, 178 | self.config.hidden_size, 179 | kernel_size=new_size, 180 | stride=new_size).eval() 181 | with torch.no_grad(): 182 | new_projection.bias[:] = self.embeddings.patch_embeddings.projection.bias 183 | new_projection.weight[:] = F.interpolate( 184 | self.embeddings.patch_embeddings.projection.weight, 185 | new_size, mode='bilinear', align_corners=False, 186 | ) 187 | self.embeddings.patch_embeddings.projection = new_projection 188 | 189 | def forward( 190 | self, 191 | pixel_values: Optional[torch.Tensor] = None, 192 | bool_masked_pos: Optional[torch.Tensor] = None, 193 | head_mask: Optional[torch.Tensor] = None, 194 | modulation: Optional[torch.Tensor] = None, 195 | output_attentions: Optional[bool] = None, 196 | output_hidden_states: Optional[bool] = None, 197 | return_dict: Optional[bool] = None, 198 | ) -> Union[Tuple, BaseModelOutputWithPooling]: 199 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions 200 | output_hidden_states = ( 201 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 202 | ) 203 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 204 | 205 | if pixel_values is None: 206 | raise ValueError("You have to specify pixel_values") 207 | 208 | # Prepare head mask if needed 209 | # 1.0 in head_mask indicate we keep the head 210 | # attention_probs has shape bsz x n_heads x N x N 211 | # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] 212 | # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] 213 | head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) 214 | 215 | embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) 216 | 217 | encoder_outputs = self.encoder( 218 | embedding_output, 219 | head_mask=head_mask, 220 | modulation=modulation, 221 | output_attentions=output_attentions, 222 | output_hidden_states=output_hidden_states, 223 | return_dict=return_dict, 224 | ) 225 | sequence_output = encoder_outputs[0] 226 | sequence_output = self.layernorm(sequence_output) 227 | pooled_output = sequence_output[:, 0, :] 228 | 229 | if not return_dict: 230 | head_outputs = (sequence_output, pooled_output) 231 | return head_outputs + encoder_outputs[1:] 232 | 233 | return BaseModelOutputWithPooling( 234 | last_hidden_state=sequence_output, 235 | pooler_output=pooled_output, 236 | hidden_states=encoder_outputs.hidden_states, 237 | attentions=encoder_outputs.attentions, 238 | ) 239 | -------------------------------------------------------------------------------- /model/dinov2_adaln/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # -------------------------------------------------------------------------------- /model/dinov2_adaln/adaln.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import torch 6 | from torch import Tensor 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.nn.modules.normalization import _shape_t 10 | from transformers.activations import get_activation 11 | 12 | # https://github.com/yenchenlin/dinov2-adaLN/commit/b195e7b7ebeefc0b249173b23734b1fb64227a9f 13 | class AdaLayerNorm(nn.LayerNorm): 14 | def __init__(self, normalized_shape: _shape_t, mod_shape: _shape_t = None, eps: float = 0.00001, mod_act='gelu', elementwise_affine: bool = True, device=None, dtype=None) -> None: 15 | super().__init__(normalized_shape, eps, elementwise_affine, device, dtype) 16 | if mod_shape is None: 17 | mod_shape = normalized_shape 18 | self.mod_linear = nn.Sequential( 19 | get_activation(mod_act), 20 | nn.Linear(mod_shape, 2 * normalized_shape, bias=True) 21 | ) if mod_shape > 0 else None 22 | 23 | def modulate(self, x, scale, shift): 24 | return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) 25 | 26 | def mod_init(self): 27 | nn.init.zeros_(self.mod_linear[-1].weight) # TODO why cannot use .zeros_() here 28 | nn.init.zeros_(self.mod_linear[-1].bias) 29 | 30 | def forward(self, input: Tensor, modulation: Tensor = None) -> Tensor: 31 | normed = super().forward(input) 32 | if modulation is None or self.mod_linear is None: 33 | return normed 34 | scale, shift = self.mod_linear(modulation).chunk(2, dim=1) 35 | return self.modulate(normed, scale, shift) 36 | -------------------------------------------------------------------------------- /model/feature_extractors.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | def resize(image, size=None, scale_factor=None): 10 | return nn.functional.interpolate( 11 | image, 12 | size=size, 13 | scale_factor=scale_factor, 14 | mode="bilinear", 15 | align_corners=False, 16 | ) 17 | 18 | 19 | class SpatialDino(nn.Module): 20 | def __init__( 21 | self, 22 | freeze_weights=True, 23 | model_type="dinov2_vits14", 24 | num_patches_x=16, 25 | num_patches_y=16, 26 | ): 27 | super().__init__() 28 | self.model = torch.hub.load("facebookresearch/dinov2", model_type) 29 | self.feature_dim = self.model.embed_dim 30 | self.num_patches_x = num_patches_x 31 | self.num_patches_y = num_patches_y 32 | if freeze_weights: 33 | for param in self.model.parameters(): 34 | param.requires_grad = False 35 | 36 | def forward(self, x, autoresize=False): 37 | """ 38 | Spatial dimensions of output will be H // 14, W // 14. If autoresize is True, 39 | then the output will be resized to the correct dimensions. 40 | 41 | Args: 42 | x (torch.Tensor): Images (B, C, H, W). Should be ImageNet normalized. 43 | autoresize (bool): Whether to resize the input to match the num_patch 44 | dimensions. 
45 | 46 | Returns: 47 | feature_map (torch.tensor): (B, C, h, w) 48 | """ 49 | *B, c, h, w = x.shape 50 | 51 | x = x.reshape(-1, c, h, w) 52 | 53 | # Output will be (B, H * W, C) 54 | features = self.model.forward_features(x)["x_norm_patchtokens"] 55 | features = features.permute(0, 2, 1) 56 | features = features.reshape( # (B, C, H, W) 57 | -1, self.feature_dim, h // 14, w // 14 58 | ) 59 | if autoresize: 60 | features = resize(features, size=(self.num_patches_y, self.num_patches_x)) 61 | 62 | features = features.reshape( 63 | *B, self.feature_dim, self.num_patches_y, self.num_patches_x 64 | ) 65 | return features 66 | -------------------------------------------------------------------------------- /model/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # -------------------------------------------------------------------------------- /model/inference/ddpm.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import ipdb # noqa: F401 6 | import torch 7 | from tqdm.auto import tqdm 8 | from typing import Any, Callable, Dict, List, Optional, Union 9 | import inspect 10 | 11 | rescale_fn = { 12 | "zero": lambda x: 0, 13 | "identity": lambda x: x, 14 | "square": lambda x: x**2, 15 | "square_root": lambda x: torch.sqrt(x), 16 | } 17 | 18 | def retrieve_timesteps( 19 | scheduler, 20 | num_inference_steps: Optional[int] = None, 21 | device: Optional[Union[str, torch.device]] = None, 22 | timesteps: Optional[List[int]] = None, 23 | **kwargs, 24 | ): 25 | """ 26 | Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles 27 | custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. 28 | 29 | Args: 30 | scheduler (`SchedulerMixin`): 31 | The scheduler to get timesteps from. 32 | num_inference_steps (`int`): 33 | The number of diffusion steps used when generating samples with a pre-trained model. If used, 34 | `timesteps` must be `None`. 35 | device (`str` or `torch.device`, *optional*): 36 | The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 37 | timesteps (`List[int]`, *optional*): 38 | Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default 39 | timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` 40 | must be `None`. 41 | 42 | Returns: 43 | `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the 44 | second element is the number of inference steps. 45 | """ 46 | if timesteps is not None: 47 | accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) 48 | if not accepts_timesteps: 49 | raise ValueError( 50 | f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" 51 | f" timestep schedules. Please check whether you are using the correct scheduler." 
52 | ) 53 | scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) 54 | timesteps = scheduler.timesteps 55 | num_inference_steps = len(timesteps) 56 | else: 57 | scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) 58 | timesteps = scheduler.timesteps 59 | return timesteps, num_inference_steps 60 | 61 | 62 | def shift_scale_denormalize(x, shift, scale): 63 | '''denormalize the normalized data using the shfit/mean and scale/std.''' 64 | return x / scale + shift 65 | 66 | 67 | def inference_ddpm_call_varmod( 68 | model, 69 | scheduler, 70 | device, 71 | data=None, 72 | num_inference_steps=1000, 73 | guidance_scale=1.0, 74 | cfg=None, 75 | ): 76 | """ 77 | Implements DDPM-style inference. 78 | 79 | To get multiple samples, batch the images multiple times. 80 | 81 | Args: 82 | model: Ray Diffuser. 83 | images (torch.Tensor): (B, N, C, H, W). 84 | crop_parameters (torch.Tensor): (B, N, 4) or None. 85 | pbar (bool): If True, shows a progress bar. 86 | """ 87 | # batch_size, num_images, num_channel, num_patches_x, num_patches_y = data['rgb']['data'].shape 88 | batch_size, num_images = data['view_id'].shape[:2] 89 | scheduler.set_timesteps(num_inference_steps, device=device) 90 | timesteps = scheduler.timesteps 91 | 92 | cond_mods = [mod for mod in model.modalities if mod in data['conds']] 93 | gen_mods = [mod for mod in model.modalities if mod in data['gens']] 94 | use_rgb, use_ray, use_depth = 'rgb' in gen_mods, 'ray' in gen_mods, 'depth' in gen_mods 95 | 96 | x_t = data 97 | with torch.no_grad(): 98 | for t in tqdm(timesteps): 99 | # predict the noise residual 100 | mmod_preds = model( 101 | t=t.repeat(batch_size)-1, 102 | data={**x_t, 'uncond': False}, 103 | ) 104 | noise_pred_rgb = mmod_preds['rgb'] if use_rgb else None 105 | noise_pred_ray = mmod_preds['ray'] if use_ray else None 106 | noise_pred_depth = mmod_preds['depth'] if use_depth else None 107 | 108 | if guidance_scale > 1.0: 109 | mmod_preds_uncond = model( 110 | t=t.repeat(batch_size)-1, 111 | data={**x_t, 'uncond': True}, 112 | ) 113 | noise_pred_rgb_uncond = mmod_preds_uncond['rgb'] if use_rgb else None 114 | noise_pred_ray_uncond = mmod_preds_uncond['ray'] if use_ray else None 115 | noise_pred_depth_uncond = mmod_preds_uncond['depth'] if use_depth else None 116 | noise_pred_rgb = noise_pred_rgb_uncond + guidance_scale * (noise_pred_rgb - noise_pred_rgb_uncond) if use_rgb else None 117 | noise_pred_ray = noise_pred_ray_uncond + guidance_scale * (noise_pred_ray - noise_pred_ray_uncond) if use_ray else None 118 | noise_pred_depth = noise_pred_depth_uncond + guidance_scale * (noise_pred_depth - noise_pred_depth_uncond) if use_depth else None 119 | 120 | # compute the previous noisy sample x_t -> x_t-1 121 | if use_rgb: 122 | x_t_rgb = scheduler.step(noise_pred_rgb.flatten(0, 1).float(), t-1, x_t['gens']['rgb'].flatten(0, 1).float(), return_dict=False)[0].to(noise_pred_rgb.dtype) 123 | x_t_rgb = x_t_rgb.reshape((batch_size, num_images) + x_t_rgb.shape[1:]) 124 | x_t['gens']['rgb'] = x_t_rgb 125 | x_t['gens']['rgb_mask'] = mmod_preds['rgb_mask'] 126 | if use_ray: 127 | x_t_ray = scheduler.step(noise_pred_ray.flatten(0, 1).float(), t-1, x_t['gens']['ray'].flatten(0, 1).float(), return_dict=False)[0].to(noise_pred_ray.dtype) 128 | x_t_ray = x_t_ray.reshape((batch_size, num_images) + x_t_ray.shape[1:]) 129 | x_t['gens']['ray'] = x_t_ray 130 | x_t['gens']['ray_mask'] = mmod_preds['ray_mask'] 131 | if use_depth: 132 | x_t_depth = scheduler.step(noise_pred_depth.flatten(0, 1).float(), t-1, 
x_t['gens']['depth'][:, :, 0:1].flatten(0, 1).float(), return_dict=False)[0].to(noise_pred_depth.dtype) 133 | x_t_depth = x_t_depth.reshape((batch_size, num_images) + x_t_depth.shape[1:]) 134 | x_t['gens']['depth'] = torch.cat([x_t_depth, x_t['gens']['depth'][:, :, 1:2]], dim=2) 135 | x_t['gens']['depth_mask'] = mmod_preds['depth_mask'] 136 | 137 | # shift-scale denormalize 138 | if use_rgb: 139 | x_t['gens']['rgb'] = shift_scale_denormalize(x_t['gens']['rgb'], cfg.data.shift_scales.rgb[0], cfg.data.shift_scales.rgb[1]) 140 | if use_ray: 141 | if cfg.data.use_plucker: 142 | x_t['gens']['ray'][:, :, :3] = shift_scale_denormalize(x_t['gens']['ray'][:, :, :3], cfg.data.shift_scales.ray.dirs[0], cfg.data.shift_scales.ray.dirs[1]) 143 | x_t['gens']['ray'][:, :, 3:] = shift_scale_denormalize(x_t['gens']['ray'][:, :, 3:], cfg.data.shift_scales.ray.moms[0], cfg.data.shift_scales.ray.moms[1]) 144 | else: 145 | x_t['gens']['ray'][:, :, :3] = shift_scale_denormalize(x_t['gens']['ray'][:, :, :3], cfg.data.shift_scales.ray.origins[0], cfg.data.shift_scales.ray.origins[1]) 146 | x_t['gens']['ray'][:, :, 3:] = shift_scale_denormalize(x_t['gens']['ray'][:, :, 3:], cfg.data.shift_scales.ray.directions[0], cfg.data.shift_scales.ray.directions[1]) 147 | if use_depth: 148 | x_t['gens']['depth'][:, :, :1] = shift_scale_denormalize(x_t['gens']['depth'][:, :, :1], cfg.data.shift_scales.depth[0], cfg.data.shift_scales.depth[1]) 149 | 150 | 151 | return x_t -------------------------------------------------------------------------------- /model/load.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import safetensors 7 | import torch 8 | import torch.utils.checkpoint 9 | from diffusers import DDPMScheduler, DDIMScheduler 10 | from diffusers import HunyuanDiTPipeline 11 | 12 | from model.feature_extractors import SpatialDino 13 | from model.dit import DiT 14 | 15 | def load_model(cfg, checkpoint_path, device='cuda:0', weight_dtype=torch.float16): 16 | # Load scheduler, tokenizer and models. 
17 | if cfg.eval.scheduler == "DDPM": 18 | noise_scheduler = DDPMScheduler.from_pretrained(cfg.model.scheduler_url, subfolder="scheduler") 19 | elif cfg.eval.scheduler == "DDIM": 20 | noise_scheduler = DDIMScheduler.from_pretrained(cfg.model.scheduler_url, subfolder="scheduler") 21 | 22 | # Freeze vae and text_encoder 23 | feature_extractor = SpatialDino( 24 | freeze_weights=True, 25 | model_type="dinov2_vitb14", 26 | num_patches_x=cfg.modalities.rgb.width, 27 | num_patches_y=cfg.modalities.rgb.width, 28 | ) 29 | hunyuan_pipe = HunyuanDiTPipeline.from_pretrained(cfg.model.decoder_url, torch_dtype=torch.float16) 30 | tokenizer, text_encoder = hunyuan_pipe.tokenizer, hunyuan_pipe.text_encoder 31 | tokenizer_2, text_encoder_2 = hunyuan_pipe.tokenizer_2, hunyuan_pipe.text_encoder_2 32 | vae = hunyuan_pipe.vae 33 | del hunyuan_pipe 34 | 35 | # Build model and load from checkpoint 36 | cfg.used_modalities = {key: cfg.modalities[key] for key in ['rgb', 'ray', 'depth', 'local_caption', 'global_caption']} 37 | model = DiT(modalities=cfg.used_modalities, **cfg.model) 38 | if os.path.splitext(checkpoint_path)[-1] == '.safetensors': 39 | state_dict = safetensors.torch.load_file(checkpoint_path) 40 | else: 41 | state_dict = torch.load(checkpoint_path)['module'] 42 | missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) 43 | model.eval() 44 | print('Loaded model from:', checkpoint_path) 45 | print('missing_keys', missing_keys) 46 | print('unexpected_keys', unexpected_keys) 47 | 48 | # Move non-trainables and cast to weight_dtype 49 | vae.to(device, dtype=weight_dtype) 50 | feature_extractor.to(device, dtype=weight_dtype) 51 | text_encoder.to(device, dtype=weight_dtype) 52 | text_encoder_2.to(device, dtype=weight_dtype) 53 | model.to(device, dtype=weight_dtype) 54 | 55 | # Package all components into one dict 56 | models = { 57 | 'model': model, 58 | 'noise_scheduler': noise_scheduler, 59 | 'tokenizer': tokenizer, 60 | 'text_encoder': text_encoder, 61 | 'tokenizer_2': tokenizer_2, 62 | 'text_encoder_2': text_encoder_2, 63 | 'vae': vae, 64 | 'feature_extractor': feature_extractor 65 | } 66 | 67 | return models -------------------------------------------------------------------------------- /model/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # -------------------------------------------------------------------------------- /model/utils/nn.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | from typing import Optional 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | 10 | import xformers.ops as xops 11 | from diffusers.models.attention_processor import Attention 12 | 13 | from .pos_encoder import FeaturePositionalEncoding 14 | 15 | 16 | def modulate(x, shift, scale): 17 | if x.is_nested: 18 | return x * (1 + scale) + shift 19 | else: 20 | return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) 21 | 22 | 23 | def convert_tensor_to_nested_tensor(tensor_list, in_nt_tensor): 24 | '''convert tensor to nested tensor''' 25 | batch_size = in_nt_tensor.size(0) 26 | out_nt_tensor = [] 27 | for tensor in tensor_list: 28 | nt_tensor = torch.nested.as_nested_tensor([tensor[i].unsqueeze(0).repeat(in_nt_tensor[i].shape[0], 1) for i in range(batch_size)]) 29 | out_nt_tensor.append(nt_tensor) 30 | 31 | return out_nt_tensor 32 | 33 | 34 | def restore_nested_tensor_to_tensor(nt_tensor, orig_shape, mask, value=0.): 35 | restored_tensor = torch.ones(orig_shape, dtype=nt_tensor.dtype, device=nt_tensor.device) * value 36 | for i, m in enumerate(mask): 37 | restored_tensor[i][m] = nt_tensor[i] 38 | 39 | return restored_tensor 40 | 41 | 42 | def full_to_packed(data, mask): 43 | seqlist = [seq[m] for seq, m in zip(data, mask)] 44 | seqlen = [seq.shape[0] for seq in seqlist] 45 | packed = torch.cat(seqlist, dim=0).unsqueeze(0) 46 | return packed, seqlen 47 | 48 | 49 | def packed_to_nested(data, seqlen): 50 | data = torch.nested.as_nested_tensor(list(data[0].split(seqlen, dim=0))) 51 | return data 52 | 53 | 54 | def nested_to_packed(data): 55 | seqlist = [seq for seq in data] 56 | seqlen = [seq.shape[0] for seq in seqlist] 57 | packed = torch.cat(seqlist, dim=0).unsqueeze(0) 58 | return packed, seqlen 59 | 60 | 61 | def packed_to_padded(data, seqlen, total=None, fill=0.): 62 | if total is None: 63 | total = max(seqlen) 64 | return torch.stack([ 65 | torch.cat([ 66 | seq, 67 | torch.full((total-seq.shape[0], *seq.shape[1:]), fill, dtype=seq.dtype, device=seq.device) 68 | ], dim=0) 69 | for seq in data[0].split(seqlen, dim=0) 70 | ], dim=0), seqlen 71 | 72 | 73 | def padded_to_packed(data, seqlen): 74 | return torch.cat([ 75 | seq[:l] for seq, l in zip(data, seqlen) 76 | ], dim=0).unsqueeze(0), seqlen 77 | 78 | 79 | def packed_to_full(data, mask, fill=0.): 80 | out = torch.full((*mask.shape, *data.shape[2:]), fill, dtype=data.dtype, device=data.device) 81 | out[mask] = data[0] 82 | return out, mask 83 | 84 | 85 | def full_to_padded(data, mask, total=None, fill=0.): 86 | return packed_to_padded(*full_to_packed(data, mask), total=total, fill=fill) 87 | 88 | 89 | def padded_to_full(data, seqlen, mask, fill=0.): 90 | return packed_to_full(padded_to_packed(data, seqlen)[0], mask, fill=fill) 91 | 92 | 93 | # modified from https://github.com/meta-llama/llama/blob/main/llama/model.py 94 | class RMSNorm(torch.nn.Module): 95 | def __init__(self, dim: int, eps: float = 1e-6): 96 | """ 97 | Initialize the RMSNorm normalization layer. 98 | 99 | Args: 100 | dim (int): The dimension of the input tensor. 101 | eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. 102 | 103 | Attributes: 104 | eps (float): A small value added to the denominator for numerical stability. 105 | weight (nn.Parameter): Learnable scaling parameter. 
106 | 107 | """ 108 | super().__init__() 109 | self.eps = eps 110 | self.dim = dim 111 | self.weight = nn.Parameter(torch.zeros(dim)) 112 | 113 | def _norm(self, x): 114 | """ 115 | Apply the RMSNorm normalization to the input tensor. 116 | 117 | Args: 118 | x (torch.Tensor): The input tensor. 119 | 120 | Returns: 121 | torch.Tensor: The normalized tensor. 122 | 123 | """ 124 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 125 | 126 | def forward(self, x): 127 | """ 128 | Forward pass through the RMSNorm layer. 129 | 130 | Args: 131 | x (torch.Tensor): The input tensor. 132 | 133 | Returns: 134 | torch.Tensor: The output tensor after applying RMSNorm. 135 | 136 | """ 137 | nested = False 138 | if x.is_nested: # NOTE assume seq dim is 1 139 | nested = True 140 | x, seqlen = nested_to_packed(x) 141 | x = self._norm(x) * (1 + self.weight) 142 | if nested: 143 | x = packed_to_nested(x, seqlen) 144 | return x 145 | 146 | 147 | class HolisticAttnProcessor: 148 | def __call__( 149 | self, 150 | attn: Attention, 151 | hidden_states: torch.Tensor, 152 | encoder_hidden_states: Optional[torch.Tensor] = None, 153 | attention_mask: Optional[torch.Tensor] = None, 154 | query_pos_s=None, key_pos_s=None, query_pos_r=None, key_pos_r=None, seqlen_q=None, seqlen_kv=None, 155 | ) -> torch.Tensor: 156 | query = hidden_states 157 | key = value = encoder_hidden_states 158 | 159 | if query_pos_s is not None: 160 | query = query + query_pos_s 161 | if key_pos_s is not None: 162 | key = key + key_pos_s 163 | 164 | query, _ = padded_to_packed(query, seqlen_q) 165 | key, _ = padded_to_packed(key, seqlen_kv) 166 | value, _ = padded_to_packed(value, seqlen_kv) 167 | 168 | q = attn.to_q(query) 169 | k = attn.to_k(key) 170 | v = attn.to_v(value) 171 | 172 | assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] 173 | inner_dim = k.shape[-1] 174 | head_dim = inner_dim // attn.heads 175 | 176 | q = q.reshape(1, -1, attn.heads, head_dim) # batch_size=1 because it's packed 177 | k = k.reshape(1, -1, attn.heads, head_dim) 178 | v = v.reshape(1, -1, attn.heads, head_dim) 179 | 180 | q = attn.norm_q(q).to(q.dtype) 181 | k = attn.norm_k(k).to(k.dtype) 182 | 183 | if query_pos_r is not None: 184 | query_pos_r, _ = padded_to_packed(query_pos_r, seqlen_q) 185 | q = FeaturePositionalEncoding.apply_rotary_emb(q, query_pos_r) 186 | if key_pos_r is not None: 187 | key_pos_r, _ = padded_to_packed(key_pos_r, seqlen_kv) 188 | k = FeaturePositionalEncoding.apply_rotary_emb(k, key_pos_r) 189 | 190 | x = xops.memory_efficient_attention( 191 | q, k, v, 192 | attn_bias=xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(seqlen_q, seqlen_kv), 193 | p=attn.attn_drop.p if attn.training and hasattr(attn, 'attn_drop') else 0., 194 | ) 195 | x = x.reshape(1, -1, inner_dim) 196 | x = attn.to_out[0](x) # linear 197 | x = attn.to_out[1](x) # dropout 198 | 199 | x, _ = packed_to_padded(x, seqlen_q, max(seqlen_q)) 200 | 201 | return x 202 | 203 | 204 | 205 | class FinalLayer(nn.Module): 206 | """ 207 | The final layer of DiT. 
208 | """ 209 | 210 | def __init__(self, hidden_size, patch_size, out_channels): 211 | super().__init__() 212 | self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 213 | self.linear = nn.Linear( 214 | hidden_size, patch_size * patch_size * out_channels, bias=True 215 | ) 216 | self.adaLN_modulation = nn.Sequential( 217 | nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True) 218 | ) 219 | 220 | def initialize_weights(self): 221 | nn.init.constant_(self.linear.weight, 0) 222 | nn.init.constant_(self.linear.bias, 0) 223 | nn.init.constant_(self.adaLN_modulation[-1].weight, 0) 224 | nn.init.constant_(self.adaLN_modulation[-1].bias, 0) 225 | 226 | def forward(self, x, c): 227 | shift, scale = self.adaLN_modulation(c).chunk(2, dim=1) 228 | x = modulate(self.norm_final(x), shift, scale) 229 | x = self.linear(x) 230 | return x 231 | 232 | 233 | class MultiLayerPatchEmbed(nn.Module): 234 | def __init__(self, img_size, patch_size, in_chans, embed_dim): 235 | super().__init__() 236 | assert patch_size in [2, 4, 6, 8, 16] 237 | self.img_size = img_size 238 | self.patch_size = patch_size 239 | self.in_chans = in_chans 240 | self.embed_dim = embed_dim 241 | 242 | n_down = round(math.log2(patch_size)) 243 | self.proj = [nn.Conv2d(in_chans, embed_dim, 3, 1, 1)] 244 | for i in range(n_down): 245 | self.proj.append(nn.SiLU(inplace=True)) 246 | self.proj.append(nn.Conv2d(embed_dim, embed_dim, 2, 2, 0)) 247 | self.proj.append(nn.SiLU(inplace=True)) 248 | self.proj.append(nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)) 249 | self.proj = nn.Sequential(*self.proj) 250 | 251 | def initialize_weights(self): 252 | for m in self.proj.modules(): 253 | if isinstance(m, nn.Conv2d): 254 | w = m.weight.data 255 | nn.init.xavier_uniform_(w.view([w.shape[0], -1])) 256 | nn.init.constant_(m.bias, 0) 257 | 258 | def forward(self, x): 259 | ''' 260 | x: [N, C, H, W] 261 | out: [N, C, H, W] 262 | ''' 263 | return self.proj(x) 264 | 265 | class MultiLayerFinalLayer(nn.Module): 266 | def __init__(self, hidden_size, patch_size, out_channels): 267 | super().__init__() 268 | assert patch_size in [2, 4, 6, 8, 16] 269 | self.hidden_size = hidden_size 270 | self.patch_size = patch_size 271 | self.out_channels = out_channels 272 | 273 | self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 274 | 275 | n_up = round(math.log2(patch_size)) 276 | self.proj = [] 277 | for i in range(n_up - 1): 278 | self.proj.append(nn.ConvTranspose2d(hidden_size, hidden_size, 2, 2, 0, 0)) 279 | self.proj.append(nn.SiLU(inplace=True)) 280 | self.proj.append(nn.Conv2d(hidden_size, hidden_size, 3, 1, 1)) 281 | self.proj.append(nn.SiLU(inplace=True)) 282 | self.proj.append(nn.ConvTranspose2d(hidden_size, out_channels, 2, 2, 0, 0)) 283 | self.proj = nn.Sequential(*self.proj) 284 | 285 | self.adaLN_modulation = nn.Sequential( 286 | nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True) 287 | ) 288 | 289 | def initialize_weights(self): 290 | nn.init.constant_(self.adaLN_modulation[-1].weight, 0) 291 | nn.init.constant_(self.adaLN_modulation[-1].bias, 0) 292 | for m in self.proj.modules(): 293 | if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)): 294 | w = m.weight.data 295 | nn.init.xavier_uniform_(w.view([w.shape[0], -1])) 296 | nn.init.constant_(m.bias, 0) 297 | nn.init.constant_(self.proj[-1].weight, 0) 298 | 299 | def forward(self, x, c): 300 | ''' 301 | x: [B, N, H, W, D] 302 | c: [B, D] 303 | out: [B, N, D, H, W] 304 | ''' 305 | shift, scale = self.adaLN_modulation(c).chunk(2, dim=1) 306 | 
seq_dims = x.shape[1:4] 307 | x = x.flatten(1, 3) # [B, L, D] 308 | x = modulate(self.norm_final(x), shift, scale) 309 | x = x.unflatten(1, seq_dims) # [B, N, H, W, D] 310 | x = x.permute(0, 1, 4, 2, 3) # [B, N, D, H, W] 311 | batch_dims = x.shape[0:2] 312 | x = x.flatten(0, 1) # [BN, D, H, W] 313 | x = self.proj(x) 314 | return x.unflatten(0, batch_dims) 315 | -------------------------------------------------------------------------------- /model/utils/normalize.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | """ 7 | Adapted from code originally written by David Novotny. 8 | """ 9 | 10 | import ipdb # noqa: F401 11 | import torch 12 | from pytorch3d.transforms import Rotate, Translate 13 | 14 | 15 | def intersect_skew_line_groups(p, r, mask): 16 | # p, r both of shape (B, N, n_intersected_lines, 3) 17 | # mask of shape (B, N, n_intersected_lines) 18 | p_intersect, r = intersect_skew_lines_high_dim(p, r, mask=mask) 19 | if p_intersect is None: 20 | return None, None, None, None 21 | _, p_line_intersect = point_line_distance( 22 | p, r, p_intersect[..., None, :].expand_as(p) 23 | ) 24 | intersect_dist_squared = ((p_line_intersect - p_intersect[..., None, :]) ** 2).sum( 25 | dim=-1 26 | ) 27 | return p_intersect, p_line_intersect, intersect_dist_squared, r 28 | 29 | 30 | def intersect_skew_lines_high_dim(p, r, mask=None): 31 | # Implements https://en.wikipedia.org/wiki/Skew_lines In more than two dimensions 32 | dim = p.shape[-1] 33 | # make sure the heading vectors are l2-normed 34 | if mask is None: 35 | mask = torch.ones_like(p[..., 0]) 36 | r = torch.nn.functional.normalize(r, dim=-1) 37 | 38 | eye = torch.eye(dim, device=p.device, dtype=p.dtype)[None, None] 39 | I_min_cov = (eye - (r[..., None] * r[..., None, :])) * mask[..., None, None] 40 | sum_proj = I_min_cov.matmul(p[..., None]).sum(dim=-3) 41 | 42 | # I_eps = torch.zeros_like(I_min_cov.sum(dim=-3)) + 1e-10 43 | # p_intersect = torch.pinverse(I_min_cov.sum(dim=-3) + I_eps).matmul(sum_proj)[..., 0] 44 | p_intersect = torch.linalg.lstsq(I_min_cov.sum(dim=-3), sum_proj).solution[..., 0] 45 | 46 | # I_min_cov.sum(dim=-3): torch.Size([1, 1, 3, 3]) 47 | # sum_proj: torch.Size([1, 1, 3, 1]) 48 | 49 | # p_intersect = np.linalg.lstsq(I_min_cov.sum(dim=-3).numpy(), sum_proj.numpy(), rcond=None)[0] 50 | 51 | if torch.any(torch.isnan(p_intersect)): 52 | print(p_intersect) 53 | return None, None 54 | ipdb.set_trace() 55 | assert False 56 | return p_intersect, r 57 | 58 | 59 | def point_line_distance(p1, r1, p2): 60 | df = p2 - p1 61 | proj_vector = df - ((df * r1).sum(dim=-1, keepdim=True) * r1) 62 | line_pt_nearest = p2 - proj_vector 63 | d = (proj_vector).norm(dim=-1) 64 | return d, line_pt_nearest 65 | 66 | 67 | def compute_optical_axis_intersection(cameras): 68 | centers = cameras.get_camera_center() 69 | principal_points = cameras.principal_point 70 | 71 | one_vec = torch.ones((len(cameras), 1), device=centers.device) 72 | optical_axis = torch.cat((principal_points, one_vec), -1) 73 | 74 | # optical_axis = torch.cat( 75 | # (principal_points, cameras.focal_length[:, 0].unsqueeze(1)), -1 76 | # ) 77 | 78 | pp = cameras.unproject_points(optical_axis, from_ndc=True, world_coordinates=True) 79 | pp2 = torch.diagonal(pp, dim1=0, dim2=1).T 80 | 81 | directions = pp2 - centers 82 | centers = centers.unsqueeze(0).unsqueeze(0) 83 | directions = directions.unsqueeze(0).unsqueeze(0) 84 | 
85 | p_intersect, p_line_intersect, _, r = intersect_skew_line_groups( 86 | p=centers, r=directions, mask=None 87 | ) 88 | 89 | if p_intersect is None: 90 | dist = None 91 | else: 92 | p_intersect = p_intersect.squeeze().unsqueeze(0) 93 | dist = (p_intersect - centers).norm(dim=-1) 94 | 95 | return p_intersect, dist, p_line_intersect, pp2, r 96 | 97 | class IntersectionException(Exception): 98 | pass 99 | 100 | def normalize_cameras(cameras, scale=1.0, add_cameras=False): 101 | """ 102 | Normalizes cameras such that the optical axes point to the origin, the rotation is 103 | identity, and the norm of the translation of the first camera is 1. 104 | 105 | Args: 106 | cameras (pytorch3d.renderer.cameras.CamerasBase). 107 | scale (float): Norm of the translation of the first camera. 108 | 109 | Returns: 110 | new_cameras (pytorch3d.renderer.cameras.CamerasBase): Normalized cameras. 111 | undo_transform (function): Function that undoes the normalization. 112 | """ 113 | 114 | # Let distance from first camera to origin be unit 115 | new_cameras = cameras.clone() 116 | new_transform = ( 117 | new_cameras.get_world_to_view_transform() 118 | ) # potential R is not valid matrix 119 | p_intersect, dist, p_line_intersect, pp, r = compute_optical_axis_intersection( 120 | cameras 121 | ) 122 | 123 | if p_intersect is None: 124 | raise IntersectionException 125 | 126 | d = dist.squeeze(dim=1).squeeze(dim=0)[0] 127 | # Degenerate case 128 | if d == 0: 129 | # print(cameras.T) 130 | # print(new_transform.get_matrix()[:, 3, :3]) 131 | raise IntersectionException 132 | 133 | # Can't figure out how to make scale part of the transform too without messing up R. 134 | # Ideally, we would just wrap it all in a single Pytorch3D transform so that it 135 | # would work with any structure (eg PointClouds, Meshes). 136 | tR = Rotate(new_cameras.R[0].unsqueeze(0)).inverse() 137 | tT = Translate(p_intersect) 138 | t = tR.compose(tT) 139 | 140 | new_transform2 = t.compose(new_transform) # = t.get_matrix() @ new_transform.get_matrix() 141 | new_cameras.R = new_transform2.get_matrix()[:, :3, :3] 142 | new_cameras.T = new_transform2.get_matrix()[:, 3, :3] / d * scale 143 | 144 | scene_scale = scale / d 145 | 146 | def undo_transform(cameras): 147 | cameras_copy = cameras.clone() 148 | cameras_copy.T *= d / scale 149 | new_t = ( 150 | t.inverse().compose(cameras_copy.get_world_to_view_transform()).get_matrix() 151 | ) 152 | cameras_copy.R = new_t[:, :3, :3] 153 | cameras_copy.T = new_t[:, 3, :3] 154 | return cameras_copy 155 | 156 | if add_cameras: 157 | return new_cameras, undo_transform, scene_scale, t 158 | else: 159 | return new_cameras, undo_transform, scene_scale 160 | 161 | 162 | def normalize_cameras_inference(cameras, scale=1.0): 163 | """ 164 | Normalizes cameras such that the optical axes point to the origin, the rotation is 165 | identity, and the norm of the translation of the first camera is 1. 166 | 167 | Args: 168 | cameras (pytorch3d.renderer.cameras.CamerasBase). 169 | scale (float): Norm of the translation of the first camera. 170 | 171 | Returns: 172 | new_cameras (pytorch3d.renderer.cameras.CamerasBase): Normalized cameras. 173 | undo_transform (function): Function that undoes the normalization. 
174 | """ 175 | 176 | # Let distance from first camera to origin be unit 177 | new_cameras = cameras.clone() 178 | new_transform = ( 179 | new_cameras.get_world_to_view_transform() 180 | ) # potential R is not valid matrix 181 | 182 | # Can't figure out how to make scale part of the transform too without messing up R. 183 | # Ideally, we would just wrap it all in a single Pytorch3D transform so that it 184 | # would work with any structure (eg PointClouds, Meshes). 185 | tR = Rotate(new_cameras.R[0].unsqueeze(0)).inverse() 186 | d = new_cameras.T[0].norm() 187 | T = torch.Tensor([0., 0., 1.]) - new_cameras.T[0] 188 | tT = Translate(T.unsqueeze(0)) 189 | t = tR.compose(tT) 190 | 191 | new_transform = t.compose(new_transform) 192 | new_cameras.R = new_transform.get_matrix()[:, :3, :3] 193 | new_cameras.T = new_transform.get_matrix()[:, 3, :3] / d * scale 194 | 195 | def undo_transform(cameras): 196 | cameras_copy = cameras.clone() 197 | cameras_copy.T *= d / scale 198 | new_t = ( 199 | t.inverse().compose(cameras_copy.get_world_to_view_transform()).get_matrix() 200 | ) 201 | cameras_copy.R = new_t[:, :3, :3] 202 | cameras_copy.T = new_t[:, 3, :3] 203 | return cameras_copy 204 | 205 | return new_cameras, undo_transform 206 | 207 | 208 | def first_camera_transform(cameras, rotation_only=True): 209 | new_cameras = cameras.clone() 210 | new_transform = new_cameras.get_world_to_view_transform() 211 | tR = Rotate(new_cameras.R[0].unsqueeze(0)) 212 | if rotation_only: 213 | t = tR.inverse() 214 | else: 215 | tT = Translate(new_cameras.T[0].unsqueeze(0)) 216 | t = tR.compose(tT).inverse() 217 | 218 | new_transform = t.compose(new_transform) 219 | new_cameras.R = new_transform.get_matrix()[:, :3, :3] 220 | new_cameras.T = new_transform.get_matrix()[:, 3, :3] 221 | 222 | return new_cameras 223 | -------------------------------------------------------------------------------- /pipeline_depth_prediction.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import cv2 7 | import argparse 8 | import numpy as np 9 | import torch 10 | import trimesh 11 | from omegaconf import OmegaConf 12 | from model.load import load_model 13 | from utils.train_utils import model_inference 14 | from utils.data_utils import DataHandler, get_rgbd_point_cloud_numpy 15 | from data import Preprocessor 16 | 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser(description="Simple example of a test script.") 20 | parser.add_argument("--gpu", type=int, default=0, help="which GPU to use.") 21 | parser.add_argument("--exp_name", type=str, default='depth-prediction') 22 | parser.add_argument("--data_path", type=str, default=None, help="examples/co3dv2-samples/31_1359_4114") 23 | parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"]) 24 | parser.add_argument("--config", type=str, default="configs/config_stage3.yaml", help="Path to training config yaml file.") 25 | parser.add_argument("--checkpoint_path", type=str, default=None) 26 | parser.add_argument("--guidance_scale", type=float, default=1.0, help="inference cfg. 
1.0 means classifier-free guidance is disabled") 27 | args = parser.parse_args() 28 | 29 | # Make experiment directory 30 | exp_folder = f'./results/{args.exp_name}' 31 | os.makedirs(exp_folder, exist_ok=True) 32 | 33 | # Set cuda and mixed precision 34 | device = torch.device(f"cuda:{args.gpu}") 35 | if args.mixed_precision == "no": 36 | weight_dtype = torch.float32 37 | elif args.mixed_precision == "fp16": 38 | weight_dtype = torch.float16 39 | elif args.mixed_precision == "bf16": 40 | weight_dtype = torch.bfloat16 41 | 42 | # Load config and model 43 | cfg = OmegaConf.load(args.config) 44 | models = load_model(cfg, args.checkpoint_path, device=device, weight_dtype=weight_dtype) 45 | 46 | # Load data pre-processor 47 | preprocessor = Preprocessor(cfg) 48 | 49 | # Get data 50 | data = preprocessor(args.data_path, input_type='multi-view') 51 | data_handler = DataHandler(data) 52 | 53 | # Hyper-parameters setting & Mod flag editing 54 | # In this example, 3 views (id = 0, 1, 2) are used for inference 55 | # Mod flags such as 'cggggggg,cccccccc,xxxxxxxx' encode the states of 'rgb,pose,depth', 56 | # where 'c' denotes condition, 'g' denotes generation, and 'x' denotes not used 57 | # The i-th letter of each modality string gives the state of view i 58 | # e.g., 'ccggx' marks views 0-1 as condition, views 2-3 as generation, and view 4 as not used 59 | # Letters beyond the number of used views are ignored automatically 60 | used_view_ids = torch.arange(3) 61 | mod_flags = 'cccccccc,cccccccc,gggggggg' 62 | 63 | # Set random seed 64 | SEED = np.random.randint(0, 2147483647) 65 | 66 | # Inference 67 | np_image, pred_rgb, rgb_mask, pred_ray, ray_mask, pred_depth, depth_mask, mmod_preds, batch = \ 68 | model_inference(models, data_handler, used_view_ids, mod_flags, preprocessor, cfg, args, device, weight_dtype, guidance_scale=args.guidance_scale, seed=SEED) 69 | 70 | # Write paired visualizations 71 | num_view = len(used_view_ids) 72 | gt_part = np_image[0][:512*num_view, :512*4] 73 | pred_part = np_image[0][-512*num_view:, 512*3:512*4] 74 | # from left to right: gt_rgb - gt_pose (dir + mom) - gt_depth - pred_depth 75 | concat_images = np.concatenate([gt_part, pred_part], axis=1) 76 | file_name = f"{data_handler('scene_id')}-{SEED}-compare.png" 77 | cv2.imwrite(os.path.join(exp_folder, file_name), concat_images[..., ::-1]) 78 | 79 | # Back-project depth images to point clouds 80 | camera = data_handler('gt_pyt3d_camera')[0][used_view_ids] 81 | mask = depth_mask[0].cpu() 82 | gt_images = data_handler('gen_image')[0][used_view_ids][mask] * 0.5 + 0.5 83 | # write predictions 84 | pred_depths = 1.0 / mmod_preds['gens']['depth'][0].cpu()[:, 0:1] 85 | pred_points, pred_colors = get_rgbd_point_cloud_numpy(camera, gt_images, pred_depths) 86 | output_path = os.path.join(exp_folder, f"{data_handler('scene_id')}-{SEED}-depth-pred.ply") 87 | combined_ply = trimesh.PointCloud(pred_points, pred_colors * 255) 88 | _ = combined_ply.export(output_path) 89 | # write ground truths 90 | gt_depths = 1.0 / data_handler('gen_depth')[0][used_view_ids][mask].cpu() 91 | gt_depth_masks = torch.logical_and(gt_depths > 0, ~torch.isinf(gt_depths)).to(gt_depths) 92 | gt_points, gt_colors = get_rgbd_point_cloud_numpy(camera, gt_images, gt_depths, depth_masks=gt_depth_masks, mask_thr=0.5) 93 | output_path = os.path.join(exp_folder, f"{data_handler('scene_id')}-depth-gt.ply") 94 | combined_ply = trimesh.PointCloud(gt_points, gt_colors * 255) 95 | _ = combined_ply.export(output_path) --------------------------------------------------------------------------------
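The mod-flag convention described in the comments of the pipelines is implemented by DataHandler.mod_flags_update in utils/data_utils.py (shown further below). The following is a minimal standalone sketch of that mapping, not code from the repository; the helper name parse_mod_flags is illustrative only.

MOD_FLAG_TABLE = {'c': 0, 'g': 1, 'x': -1}  # condition / generation / not used

def parse_mod_flags(mod_flags: str, num_view: int) -> dict:
    # Split the flag string into the 'rgb', 'ray' (pose) and 'depth' modalities,
    # keep only the first num_view letters of each, and map them through MOD_FLAG_TABLE.
    states = {}
    for mod_name, flags in zip(['rgb', 'ray', 'depth'], mod_flags.split(',')):
        per_view = [MOD_FLAG_TABLE[f] for f in flags[:num_view]]
        if mod_name == 'ray':
            per_view[0] = MOD_FLAG_TABLE['c']  # the first view's pose is always a condition
        states[mod_name] = per_view
    return states

# Depth-prediction setting above: 3 views, RGB and pose conditioned, depth generated.
print(parse_mod_flags('cccccccc,cccccccc,gggggggg', num_view=3))
# -> {'rgb': [0, 0, 0], 'ray': [0, 0, 0], 'depth': [1, 1, 1]}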
/pipeline_novel_view_synthesis.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import cv2 7 | import argparse 8 | import numpy as np 9 | import torch 10 | from omegaconf import OmegaConf 11 | from model.load import load_model 12 | from utils.train_utils import model_inference 13 | from utils.data_utils import DataHandler 14 | from data import Preprocessor 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser(description="Simple example of a test script.") 19 | parser.add_argument("--gpu", type=int, default=0, help="which GPU to use.") 20 | parser.add_argument("--exp_name", type=str, default='novel-view-synthesis') 21 | parser.add_argument("--data_path", type=str, default=None, help="examples/co3dv2-samples/31_1359_4114") 22 | parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"]) 23 | parser.add_argument("--config", type=str, default="configs/config_stage3.yaml", help="Path to training config yaml file.") 24 | parser.add_argument("--checkpoint_path", type=str, default=None) 25 | parser.add_argument("--guidance_scale", type=float, default=1.5, help="inference cfg. 1.0 means classifier-free guidance is disabled") 26 | args = parser.parse_args() 27 | 28 | # Make experiment directory 29 | exp_folder = f'./results/{args.exp_name}' 30 | os.makedirs(exp_folder, exist_ok=True) 31 | 32 | # Set cuda and mixed precision 33 | device = torch.device(f"cuda:{args.gpu}") 34 | if args.mixed_precision == "no": 35 | weight_dtype = torch.float32 36 | elif args.mixed_precision == "fp16": 37 | weight_dtype = torch.float16 38 | elif args.mixed_precision == "bf16": 39 | weight_dtype = torch.bfloat16 40 | 41 | # Load config and model 42 | cfg = OmegaConf.load(args.config) 43 | models = load_model(cfg, args.checkpoint_path, device=device, weight_dtype=weight_dtype) 44 | 45 | # Load data pre-processor 46 | preprocessor = Preprocessor(cfg) 47 | 48 | # Get data 49 | data = preprocessor(args.data_path, input_type='multi-view') 50 | data_handler = DataHandler(data) 51 | 52 | # Hyper-parameters setting & Mod flag editing 53 | # In this example, 4 views (id = 0, 1, 2, 3) are used for inference 54 | # Mod flags such as 'cggggggg,cccccccc,xxxxxxxx' encode the states of 'rgb,pose,depth', 55 | # where 'c' denotes condition, 'g' denotes generation, and 'x' denotes not used 56 | # The i-th letter of each modality string gives the state of view i 57 | # e.g., 'ccggx' marks views 0-1 as condition, views 2-3 as generation, and view 4 as not used 58 | # Letters beyond the number of used views are ignored automatically 59 | used_view_ids = torch.arange(4) 60 | mod_flags = 'cggggggg,cccccccc,cccccccc' 61 | 62 | # Set random seed 63 | SEED = np.random.randint(0, 2147483647) 64 | 65 | # Inference 66 | np_image, pred_rgb, rgb_mask, pred_ray, ray_mask, pred_depth, depth_mask, mmod_preds, batch = \ 67 | model_inference(models, data_handler, used_view_ids, mod_flags, preprocessor, cfg, args, device, weight_dtype, guidance_scale=args.guidance_scale, seed=SEED) 68 | 69 | # Write paired visualizations 70 | num_view = len(used_view_ids) 71 | gt_part = np_image[0][:512*num_view, :512*3] 72 | pred_part = np_image[0][-512*num_view:, :512] 73 | # from left to right: gt_rgb - gt_pose (dir + mom) - pred_rgb 74 | concat_images = np.concatenate([gt_part, pred_part], axis=1) 75 | file_name = f"{data_handler('scene_id')}-{SEED}-compare.png" 76 |
cv2.imwrite(os.path.join(exp_folder, file_name), concat_images[..., ::-1]) -------------------------------------------------------------------------------- /pipeline_pose_estimation.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import io 7 | import cv2 8 | import base64 9 | import plotly 10 | import argparse 11 | import numpy as np 12 | import torch 13 | import matplotlib 14 | import matplotlib.pyplot as plt 15 | from omegaconf import OmegaConf 16 | from pytorch3d.renderer import PerspectiveCameras 17 | from pytorch3d.vis.plotly_vis import plot_scene 18 | from model.load import load_model 19 | from model.utils.rays import Rays, rays_to_cameras_homography 20 | from utils.train_utils import model_inference 21 | from utils.data_utils import DataHandler, tensor_recursive_to 22 | from utils.vis import view_color_coded_images_from_tensor 23 | from data import Preprocessor 24 | 25 | 26 | HTML_TEMPLATE = """ 27 | 28 | {plotly_html}""" 29 | 30 | 31 | def plotly_scene_visualization_dual(pred_camera, gt_camera, scale=0.03): 32 | num_frames = len(pred_camera) 33 | camera = {} 34 | R_pred, T_pred = pred_camera.R, pred_camera.T 35 | for i in range(num_frames): 36 | camera[i] = PerspectiveCameras(R=R_pred[i, None], T=T_pred[i, None]) 37 | if gt_camera is not None: 38 | R_gt, T_gt = gt_camera.R, gt_camera.T 39 | for i in range(num_frames): 40 | camera[i + num_frames] = PerspectiveCameras(R=R_gt[i, None], T=T_gt[i, None]) 41 | 42 | fig = plot_scene( 43 | {"scene": camera}, 44 | camera_scale=scale, 45 | ) 46 | fig.update_scenes(aspectmode="data") 47 | 48 | cmap = plt.get_cmap("hsv") 49 | for i in range(num_frames): 50 | fig.data[i].line.color = matplotlib.colors.to_hex(cmap(i / (num_frames))) 51 | if gt_camera is not None: 52 | for i in range(num_frames): 53 | fig.data[i + num_frames].line.color = matplotlib.colors.to_hex((0.0, 0.0, 0.0, 1.0)) 54 | return fig 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser(description="Simple example of a test script.") 59 | parser.add_argument("--gpu", type=int, default=0, help="which GPU to use.") 60 | parser.add_argument("--exp_name", type=str, default='pose-estimation') 61 | parser.add_argument("--data_path", type=str, default=None, help="examples/co3dv2-samples/31_1359_4114") 62 | parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"]) 63 | parser.add_argument("--config", type=str, default="configs/config_stage3.yaml", help="Path to training config yaml file.") 64 | parser.add_argument("--checkpoint_path", type=str, default=None) 65 | parser.add_argument("--guidance_scale", type=float, default=1.5, help="inference cfg. 
1.0 means classifier-free guidance is disabled") 66 | parser.add_argument("--default_fov", type=float, default=60.0) 67 | args = parser.parse_args() 68 | 69 | # Make experiment directory 70 | exp_folder = f'./results/{args.exp_name}' 71 | os.makedirs(exp_folder, exist_ok=True) 72 | 73 | # Set cuda and mixed precision 74 | device = torch.device(f"cuda:{args.gpu}") 75 | if args.mixed_precision == "no": 76 | weight_dtype = torch.float32 77 | elif args.mixed_precision == "fp16": 78 | weight_dtype = torch.float16 79 | elif args.mixed_precision == "bf16": 80 | weight_dtype = torch.bfloat16 81 | 82 | # Load config and model 83 | cfg = OmegaConf.load(args.config) 84 | models = load_model(cfg, args.checkpoint_path, device=device, weight_dtype=weight_dtype) 85 | 86 | # Load data pre-processor 87 | preprocessor = Preprocessor(cfg, fov=args.default_fov) 88 | 89 | # Get data 90 | data = preprocessor(args.data_path, input_type='multi-view') 91 | data_handler = DataHandler(data) 92 | 93 | # Hyper-parameters setting & Mod flag editing 94 | # In this example, 8 views (id = 0, 1, 2, 3, 4, 5, 6, 7) are used for inference 95 | # Mod flags such as 'cggggggg,cccccccc,xxxxxxxx' encode the states of 'rgb,pose,depth', 96 | # where 'c' denotes condition, 'g' denotes generation, and 'x' denotes not used 97 | # The i-th letter of each modality string gives the state of view i 98 | # e.g., 'ccggx' marks views 0-1 as condition, views 2-3 as generation, and view 4 as not used 99 | # Letters beyond the number of used views are ignored automatically 100 | used_view_ids = torch.arange(8) 101 | mod_flags = 'cccccccc,cggggggg,xxxxxxxx' 102 | 103 | # Set random seed 104 | SEED = np.random.randint(0, 2147483647) 105 | 106 | # Inference 107 | np_image, pred_rgb, rgb_mask, pred_ray, ray_mask, pred_depth, depth_mask, mmod_preds, batch = \ 108 | model_inference(models, data_handler, used_view_ids, mod_flags, preprocessor, cfg, args, device, weight_dtype, guidance_scale=args.guidance_scale, seed=SEED) 109 | 110 | # Write paired visualizations 111 | num_view = len(used_view_ids) 112 | gt_part = np_image[0][:512*num_view, :512*3] 113 | pred_part = np_image[0][-512*num_view:, 512:512*3] 114 | # from left to right: gt_rgb - gt_pose (dir + mom) - pred_pose (dir + mom) 115 | concat_images = np.concatenate([gt_part, pred_part], axis=1) 116 | file_name = f"{data_handler('scene_id')}-{SEED}-compare.png" 117 | cv2.imwrite(os.path.join(exp_folder, file_name), concat_images[..., ::-1]) 118 | 119 | # Save camera visualization html following RayDiffusion 120 | gt_camera = data_handler('gt_pyt3d_camera')[0][used_view_ids] if data_handler('gt_pyt3d_camera') else None 121 | gt_rays = data_handler('cond_rays')[0][used_view_ids].float().cpu() 122 | pred_ray = mmod_preds['gens']['ray'][0].float().cpu() 123 | pred_ray[0] = gt_rays[0] 124 | # create camera from rays 125 | pred_camera = rays_to_cameras_homography( 126 | Rays.from_spatial(pred_ray), 127 | crop_parameters=None, 128 | num_patches_x=cfg.data.raymap_size, 129 | num_patches_y=cfg.data.raymap_size, 130 | ) 131 | fig = plotly_scene_visualization_dual(pred_camera, gt_camera, scale=0.1) 132 | output_path = os.path.join(exp_folder, f"{data_handler('scene_id')}-{SEED}-cameras-vis.html") 133 | html_plot = plotly.io.to_html(fig, full_html=False, include_plotlyjs="cdn") 134 | s = io.BytesIO() 135 | images = torch.nn.functional.interpolate(data_handler('cond_image')[0][used_view_ids], size=(128, 128), mode='bilinear', align_corners=False).permute(0, 2, 3, 1) 136 | view_color_coded_images_from_tensor(images) 137 | plt.savefig(s, format="png", 
bbox_inches="tight") 138 | plt.close() 139 | image_encoded = base64.b64encode(s.getvalue()).decode("utf-8").replace("\n", "") 140 | with open(output_path, "w") as f: 141 | s = HTML_TEMPLATE.format( 142 | image_encoded=image_encoded, 143 | plotly_html=html_plot, 144 | ) 145 | f.write(s) 146 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ipdb 2 | imageio 3 | imageio-ffmpeg 4 | matplotlib 5 | numpy==1.26 6 | scipy 7 | omegaconf 8 | opencv-python 9 | plotly 10 | transformers 11 | tensorboard 12 | open3d 13 | sentencepiece 14 | deepspeed 15 | torchtyping 16 | diffusers==0.31.0 17 | accelerate 18 | scikit-image 19 | torchmetrics 20 | git+https://github.com/NVlabs/tiny-cuda-nn#subdirectory=bindings/torch 21 | nerfstudio @ git+https://github.com/nerfstudio-project/nerfstudio@fc4fc5cb15ad994ea82d8c651c9d42172d890de1 22 | -------------------------------------------------------------------------------- /scripts/depth_prediction.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 5 | # 6 | data_path=$1 7 | 8 | CUDA_VISIBLE_DEVICES=0 python pipeline_depth_prediction.py \ 9 | --config configs/config_stage3.yaml \ 10 | --data_path $data_path \ 11 | --mixed_precision fp16 \ 12 | --guidance_scale 1.0 \ 13 | --checkpoint_path checkpoints/matrix3d_512.pt 14 | -------------------------------------------------------------------------------- /scripts/novel_view_synthesis.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 5 | # 6 | data_path=$1 7 | 8 | CUDA_VISIBLE_DEVICES=0 python pipeline_novel_view_synthesis.py \ 9 | --config configs/config_stage3.yaml \ 10 | --data_path $data_path \ 11 | --mixed_precision fp16 \ 12 | --guidance_scale 1.5 \ 13 | --checkpoint_path checkpoints/matrix3d_512.pt 14 | -------------------------------------------------------------------------------- /scripts/pose_estimation.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 5 | # 6 | data_path=$1 7 | 8 | CUDA_VISIBLE_DEVICES=0 python pipeline_pose_estimation.py \ 9 | --config configs/config_stage3.yaml \ 10 | --data_path $data_path \ 11 | --mixed_precision fp16 \ 12 | --guidance_scale 1.5 \ 13 | --checkpoint_path checkpoints/matrix3d_512.pt 14 | -------------------------------------------------------------------------------- /scripts/single_view_to_3d.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
5 | # 6 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 7 | REPO_DIR=$(dirname "$SCRIPT_DIR") 8 | export NERFSTUDIO_METHOD_CONFIGS="splatfacto_matrix3d=splatfacto_matrix3d.splatfacto_configs:splatfacto_method" 9 | export PYTHONPATH=$PYTHONPATH:$REPO_DIR 10 | 11 | EXP_NAME=$1 12 | INPUT_PATH=$2 13 | NAME_EXT=$(basename "$INPUT_PATH") 14 | NAME="${NAME_EXT%.*}" 15 | 16 | ### Step 1: Generation: Create novel view observations 17 | CUDA_VISIBLE_DEVICES=0 python pipeline_single_to_3d.py \ 18 | --config configs/config_stage3.yaml \ 19 | --exp_name $EXP_NAME \ 20 | --data_path $INPUT_PATH \ 21 | --default_fov 60 \ 22 | --num_samples 80 \ 23 | --checkpoint_path checkpoints/matrix3d_512.pt \ 24 | --mixed_precision fp16 \ 25 | --random_seed 1 26 | 27 | 28 | ### Step 2: Reconstruction: 3DGS optimization 29 | cd results/$EXP_NAME/$NAME 30 | 31 | # 1. optimization 32 | ITERS=1200 33 | NUM_IMG=10 34 | ns-train splatfacto_matrix3d \ 35 | --data transforms_train.json \ 36 | --mixed-precision False \ 37 | --output-dir outputs \ 38 | --timestamp exps \ 39 | --viewer.quit-on-train-completion True \ 40 | --max-num-iterations $ITERS \ 41 | --steps-per-save 1000 \ 42 | --pipeline.model.num-downscales -1 \ 43 | --pipeline.model.resolution-schedule 1000 \ 44 | --pipeline.datamanager.max-num-iterations $ITERS \ 45 | --pipeline.datamanager.num_image_each_iteration $NUM_IMG \ 46 | --pipeline.model.background-color white \ 47 | --pipeline.model.warmup-length 200 \ 48 | --pipeline.model.densify-grad-thresh 0.0008 \ 49 | --pipeline.model.cull-alpha-thresh 0.05 \ 50 | --pipeline.model.cull-scale-thresh 0.5 \ 51 | --pipeline.model.cull-screen-size 0.5 \ 52 | --pipeline.model.reset-alpha-every 20 \ 53 | --pipeline.model.refine-every 50 \ 54 | --pipeline.model.use_scale_regularization True \ 55 | --pipeline.model.max-gauss-ratio 3 \ 56 | --pipeline.model.stop-screen-size-at 4000 \ 57 | --pipeline.model.stop-split-at 1000 \ 58 | --pipeline.model.sh-degree 2 \ 59 | --pipeline.model.sh-degree-interval 500 \ 60 | --pipeline.model.full-accumulation-lambda 0.0 \ 61 | --pipeline.model.accumulation-lambda 5.0 \ 62 | --pipeline.model.mask_lambda 5.0 \ 63 | --pipeline.model.ssim-lambda 0.2 \ 64 | --pipeline.model.lpips-lambda 10.0 \ 65 | --pipeline.model.l1-lambda-on-captured-views 20.0 \ 66 | --pipeline.model.l1-lambda-on-generation-views 1.0 \ 67 | --pipeline.model.apply-annealing False \ 68 | --pipeline.model.rasterize-mode antialiased \ 69 | --pipeline.model.use-absgrad False \ 70 | --pipeline.model.lpips-downsample 1 \ 71 | --pipeline.model.lpips-min-img-size 128 \ 72 | --pipeline.model.lpips-patch-size 512 \ 73 | --pipeline.model.lpips-no-resize True \ 74 | --pipeline.model.depth-l1-lambda 10.0 \ 75 | --pipeline.model.depth-ranking-lambda 10.0 \ 76 | --pipeline.model.output-depth-during-training True \ 77 | --pipeline.model.use-bilateral-grid False \ 78 | nerfstudio-data --center-method none --orientation-method none --auto-scale-poses False --train-split-fraction 1.0 --load-3D-points True --depth-unit-scale-factor 1.0 79 | # 2. use ns-render to render frames 80 | ns-render dataset --load-config outputs/splatfacto_matrix3d/exps/config.yml --image-format png --split=train --output-path renders 81 | # 3. write frames into videos 82 | python $REPO_DIR/utils/write_videos.py --render_root renders --type object -------------------------------------------------------------------------------- /scripts/unposed_fewshot_to_3d_arkitscenes.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 5 | # 6 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 7 | REPO_DIR=$(dirname "$SCRIPT_DIR") 8 | export NERFSTUDIO_METHOD_CONFIGS="splatfacto_matrix3d=splatfacto_matrix3d.splatfacto_configs:splatfacto_method" 9 | export PYTHONPATH=$PYTHONPATH:$REPO_DIR 10 | 11 | EXP_NAME=$1 12 | INPUT_PATH=$2 13 | NAME_EXT=$(basename "$INPUT_PATH") 14 | NAME="${NAME_EXT%.*}" 15 | 16 | ### Step 1: Generation: Create novel view observations 17 | CUDA_VISIBLE_DEVICES=0 python pipeline_unposed_few_shot_to_3d.py \ 18 | --config configs/config_stage3.yaml \ 19 | --exp_name $EXP_NAME \ 20 | --data_path $INPUT_PATH \ 21 | --spline_scales 3 \ 22 | --num_samples 80 \ 23 | --num_depth_runs_for_init_depth 15 \ 24 | --checkpoint_path checkpoints/matrix3d_512.pt \ 25 | --mixed_precision fp16 \ 26 | --random_seed 1 \ 27 | --use_loop_traj 1 \ 28 | --dataset arkitscenes 29 | 30 | 31 | ### Step 2: Reconstruction: 3DGS optimization 32 | cd results/$EXP_NAME/$NAME 33 | 34 | # 1. optimization 35 | ITERS=3000 36 | NUM_IMG=5 37 | ns-train splatfacto_matrix3d \ 38 | --data transforms_train.json \ 39 | --mixed-precision False \ 40 | --output-dir outputs \ 41 | --timestamp exps \ 42 | --viewer.quit-on-train-completion True \ 43 | --max-num-iterations $ITERS \ 44 | --steps-per-save 1000 \ 45 | --pipeline.model.num-downscales 0 \ 46 | --pipeline.model.resolution-schedule 500 \ 47 | --pipeline.datamanager.max-num-iterations $ITERS \ 48 | --pipeline.datamanager.num_image_each_iteration $NUM_IMG \ 49 | --pipeline.model.warmup-length 500 \ 50 | --pipeline.model.densify-grad-thresh 0.0008 \ 51 | --pipeline.model.cull-alpha-thresh 0.2 \ 52 | --pipeline.model.cull-scale-thresh 0.5 \ 53 | --pipeline.model.cull-screen-size 0.5 \ 54 | --pipeline.model.reset-alpha-every 15 \ 55 | --pipeline.model.refine-every 100 \ 56 | --pipeline.model.use_scale_regularization True \ 57 | --pipeline.model.max-gauss-ratio 6 \ 58 | --pipeline.model.apply-annealing False \ 59 | --pipeline.model.stop-screen-size-at 4000 \ 60 | --pipeline.model.stop-split-at 2000 \ 61 | --pipeline.model.sh-degree 3 \ 62 | --pipeline.model.sh-degree-interval 800 \ 63 | --pipeline.model.accumulation-lambda 0.5 \ 64 | --pipeline.model.full-accumulation-lambda 5.0 \ 65 | --pipeline.model.start-full-accumulation 1500 \ 66 | --pipeline.model.ssim-lambda 0.2 \ 67 | --pipeline.model.lpips-lambda 20.0 \ 68 | --pipeline.model.l1-lambda-on-captured-views 20.0 \ 69 | --pipeline.model.l1-lambda-on-generation-views 1.0 \ 70 | --pipeline.model.rasterize-mode antialiased \ 71 | --pipeline.model.use-absgrad True \ 72 | --pipeline.model.lpips-downsample 4 \ 73 | --pipeline.model.lpips-min-img-size 256 \ 74 | --pipeline.model.lpips-patch-size 512 \ 75 | --pipeline.model.lpips-no-resize True \ 76 | --pipeline.model.depth-l1-lambda 10.0 \ 77 | --pipeline.model.depth-ranking-lambda 20.0 \ 78 | --pipeline.model.output-depth-during-training True \ 79 | --pipeline.model.use-bilateral-grid False \ 80 | nerfstudio-data --center-method none --orientation-method none --auto-scale-poses False --train-split-fraction 1.0 --load-3D-points True --depth-unit-scale-factor 1.0 81 | # 2. use ns-render to render frames 82 | ns-render dataset --load-config outputs/splatfacto_matrix3d/exps/config.yml --image-format png --split=train --output-path renders 83 | # 3. 
write frames into videos 84 | python $REPO_DIR/utils/write_videos.py --render_root renders --type scene --num_splines 3 -------------------------------------------------------------------------------- /scripts/unposed_fewshot_to_3d_co3dv2.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # For licensing see accompanying LICENSE file. 4 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 5 | # 6 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 7 | REPO_DIR=$(dirname "$SCRIPT_DIR") 8 | export NERFSTUDIO_METHOD_CONFIGS="splatfacto_matrix3d=splatfacto_matrix3d.splatfacto_configs:splatfacto_method" 9 | export PYTHONPATH=$PYTHONPATH:$REPO_DIR 10 | 11 | EXP_NAME=$1 12 | INPUT_PATH=$2 13 | NAME_EXT=$(basename "$INPUT_PATH") 14 | NAME="${NAME_EXT%.*}" 15 | 16 | ### Step 1: Generation: Create novel view observations 17 | CUDA_VISIBLE_DEVICES=0 python pipeline_unposed_few_shot_to_3d.py \ 18 | --config configs/config_stage3.yaml \ 19 | --exp_name $EXP_NAME \ 20 | --data_path $INPUT_PATH \ 21 | --spline_scales 3 \ 22 | --num_samples 80 \ 23 | --num_depth_runs_for_init_depth 15 \ 24 | --checkpoint_path checkpoints/matrix3d_512.pt \ 25 | --mixed_precision fp16 \ 26 | --random_seed 1 \ 27 | --use_loop_traj 0 \ 28 | --dataset co3dv2 29 | 30 | 31 | ### Step 2: Reconstruction: 3DGS optimization 32 | cd results/$EXP_NAME/$NAME 33 | 34 | # 1. optimization 35 | ITERS=3000 36 | NUM_IMG=5 37 | ns-train splatfacto_matrix3d \ 38 | --data transforms_train.json \ 39 | --mixed-precision False \ 40 | --output-dir outputs \ 41 | --timestamp exps \ 42 | --viewer.quit-on-train-completion True \ 43 | --max-num-iterations $ITERS \ 44 | --steps-per-save 1000 \ 45 | --pipeline.model.num-downscales 0 \ 46 | --pipeline.model.resolution-schedule 500 \ 47 | --pipeline.datamanager.max-num-iterations $ITERS \ 48 | --pipeline.datamanager.num_image_each_iteration $NUM_IMG \ 49 | --pipeline.model.warmup-length 500 \ 50 | --pipeline.model.densify-grad-thresh 0.0008 \ 51 | --pipeline.model.cull-alpha-thresh 0.2 \ 52 | --pipeline.model.cull-scale-thresh 0.5 \ 53 | --pipeline.model.cull-screen-size 0.5 \ 54 | --pipeline.model.reset-alpha-every 15 \ 55 | --pipeline.model.refine-every 100 \ 56 | --pipeline.model.use_scale_regularization True \ 57 | --pipeline.model.max-gauss-ratio 6 \ 58 | --pipeline.model.apply-annealing False \ 59 | --pipeline.model.stop-screen-size-at 4000 \ 60 | --pipeline.model.stop-split-at 2000 \ 61 | --pipeline.model.sh-degree 3 \ 62 | --pipeline.model.sh-degree-interval 800 \ 63 | --pipeline.model.accumulation-lambda 0.5 \ 64 | --pipeline.model.full-accumulation-lambda 5.0 \ 65 | --pipeline.model.start-full-accumulation 1500 \ 66 | --pipeline.model.ssim-lambda 0.2 \ 67 | --pipeline.model.lpips-lambda 20.0 \ 68 | --pipeline.model.l1-lambda-on-captured-views 20.0 \ 69 | --pipeline.model.l1-lambda-on-generation-views 1.0 \ 70 | --pipeline.model.rasterize-mode antialiased \ 71 | --pipeline.model.use-absgrad True \ 72 | --pipeline.model.lpips-downsample 4 \ 73 | --pipeline.model.lpips-min-img-size 256 \ 74 | --pipeline.model.lpips-patch-size 512 \ 75 | --pipeline.model.lpips-no-resize True \ 76 | --pipeline.model.depth-l1-lambda 10.0 \ 77 | --pipeline.model.depth-ranking-lambda 20.0 \ 78 | --pipeline.model.output-depth-during-training True \ 79 | --pipeline.model.use-bilateral-grid False \ 80 | nerfstudio-data --center-method none --orientation-method none --auto-scale-poses False --train-split-fraction 1.0 --load-3D-points True --depth-unit-scale-factor 1.0 81 
| # 2. use ns-render to render frames 82 | ns-render dataset --load-config outputs/splatfacto_matrix3d/exps/config.yml --image-format png --split=train --output-path renders 83 | # 3. write frames into videos 84 | python $REPO_DIR/utils/write_videos.py --render_root renders --type scene --num_splines 3 -------------------------------------------------------------------------------- /splatfacto_matrix3d/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # -------------------------------------------------------------------------------- /splatfacto_matrix3d/splatfacto_configs.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | from __future__ import annotations 6 | 7 | from nerfstudio.configs.base_config import ViewerConfig 8 | from nerfstudio.data.datamanagers.base_datamanager import VanillaDataManager, VanillaDataManagerConfig 9 | from nerfstudio.data.dataparsers.nerfstudio_dataparser import NerfstudioDataParserConfig 10 | from nerfstudio.data.datasets.depth_dataset import DepthDataset 11 | from nerfstudio.engine.optimizers import AdamOptimizerConfig 12 | from nerfstudio.engine.schedulers import ( 13 | CosineDecaySchedulerConfig, 14 | ExponentialDecaySchedulerConfig, 15 | MultiStepSchedulerConfig, 16 | ) 17 | from nerfstudio.engine.trainer import TrainerConfig 18 | from nerfstudio.pipelines.base_pipeline import VanillaPipelineConfig 19 | from nerfstudio.plugins.types import MethodSpecification 20 | 21 | 22 | from .batch_full_images_datamanager import FullImageDatamanager, BatchFullImageDatamanagerConfig 23 | from .splatfacto import SplatfactoModelConfig 24 | 25 | 26 | splatfacto_method = MethodSpecification( 27 | config=TrainerConfig( 28 | method_name="splatfacto_matrix3d", 29 | steps_per_eval_image=100, 30 | steps_per_eval_batch=0, 31 | steps_per_save=200, 32 | steps_per_eval_all_images=1000, 33 | max_num_iterations=1000, 34 | mixed_precision=False, 35 | pipeline=VanillaPipelineConfig( 36 | datamanager=BatchFullImageDatamanagerConfig( 37 | _target=FullImageDatamanager[DepthDataset], 38 | # dataparser=NerfstudioDataParserConfig(load_3D_points=True), 39 | cache_images_type="uint8", 40 | ), 41 | model=SplatfactoModelConfig(), 42 | ), 43 | optimizers={ 44 | "means": { 45 | "optimizer": AdamOptimizerConfig(lr=1.6e-4, eps=1e-15), 46 | "scheduler": ExponentialDecaySchedulerConfig( 47 | lr_final=1.6e-6, 48 | max_steps=30000, 49 | ), 50 | }, 51 | "features_dc": { 52 | "optimizer": AdamOptimizerConfig(lr=0.0025, eps=1e-15), 53 | "scheduler": None, 54 | }, 55 | "features_rest": { 56 | "optimizer": AdamOptimizerConfig(lr=0.0025 / 20, eps=1e-15), 57 | "scheduler": None, 58 | }, 59 | "opacities": { 60 | "optimizer": AdamOptimizerConfig(lr=0.05, eps=1e-15), 61 | "scheduler": None, 62 | }, 63 | "scales": { 64 | "optimizer": AdamOptimizerConfig(lr=0.005, eps=1e-15), 65 | "scheduler": None, 66 | }, 67 | "quats": {"optimizer": AdamOptimizerConfig(lr=0.001, eps=1e-15), "scheduler": None}, 68 | "camera_opt": { 69 | "optimizer": AdamOptimizerConfig(lr=1e-4, eps=1e-15), 70 | "scheduler": ExponentialDecaySchedulerConfig( 71 | lr_final=5e-7, max_steps=30000, warmup_steps=1000, lr_pre_warmup=0 72 | ), 73 | }, 74 | "bilateral_grid": { 75 | "optimizer": AdamOptimizerConfig(lr=2e-3, eps=1e-15), 76 | 
"scheduler": ExponentialDecaySchedulerConfig( 77 | lr_final=1e-4, max_steps=30000, warmup_steps=1000, lr_pre_warmup=0 78 | ), 79 | }, 80 | }, 81 | viewer=ViewerConfig(num_rays_per_chunk=1 << 15), 82 | vis="viewer", 83 | ), 84 | description="Matrix3D modified Gaussian-Splatting model for 3D reconstruction" 85 | ) 86 | -------------------------------------------------------------------------------- /utils/camera_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import os 6 | import numpy as np 7 | import torch 8 | import json 9 | import splines 10 | import splines.quaternion 11 | from pytorch3d.renderer import PerspectiveCameras 12 | from pytorch3d.renderer.cameras import look_at_view_transform 13 | from pytorch3d.transforms import quaternion_to_matrix, matrix_to_quaternion 14 | from pytorch3d.utils import opencv_from_cameras_projection 15 | 16 | 17 | def fov_to_focal(fov, size): 18 | # convert fov angle in degree to focal 19 | return size / np.tan(fov * np.pi / 180.0 / 2.0) / 2.0 20 | 21 | 22 | def focal_to_fov(focal, size): 23 | # convert focal to fov angle in degree 24 | return 2.0 * np.arctan(size / (2.0 * focal)) * 180.0 / np.pi 25 | 26 | 27 | def set_pytorch3d_cameras_eye_at_up(azimuths, elevations, distance=1.0): 28 | nv = azimuths.shape[0] 29 | azimuths, elevations = np.deg2rad(azimuths), np.deg2rad(elevations) 30 | x = distance * np.sin(azimuths) * np.cos(elevations) 31 | y = distance * np.sin(elevations) 32 | z = distance * np.cos(azimuths) * np.cos(elevations) * -1 33 | 34 | at = torch.tensor([[0., 0., 0.]]).repeat(nv, 1).float() 35 | up = torch.tensor([[0., 1., 0.]]).repeat(nv, 1).float() 36 | eye = torch.tensor([x, y, z]).T.float() 37 | 38 | R, T = look_at_view_transform(eye=eye, at=at, up=up) 39 | 40 | return R, T 41 | 42 | 43 | 44 | def fit_spline_given_pyt3d_cameras(pyt3d_camera, n_frames=80, scales=8, tension=0.5, 45 | continuity=0.0, bias=0.0, is_loop=True): 46 | num_keyframes = len(pyt3d_camera) 47 | end_frame = num_keyframes if is_loop else num_keyframes - 1 48 | timestamps = np.linspace(0, end_frame, n_frames, endpoint=False, ) 49 | quaternions_wxyz = matrix_to_quaternion(pyt3d_camera.R).numpy() 50 | positions = pyt3d_camera.get_camera_center().numpy() 51 | focals = pyt3d_camera.focal_length.numpy() 52 | orientation_spline = splines.quaternion.KochanekBartels( 53 | [ 54 | splines.quaternion.UnitQuaternion.from_unit_xyzw(np.roll(wxyz, shift=-1)) 55 | for wxyz in quaternions_wxyz 56 | ], 57 | tcb=(tension, continuity, bias), 58 | endconditions="closed" if is_loop else "natural", 59 | ) 60 | position_spline = splines.KochanekBartels( 61 | [position for position in positions], 62 | tcb=(tension, continuity, bias), 63 | endconditions="closed" if is_loop else "natural", 64 | ) 65 | focal_spline = splines.KochanekBartels( 66 | [foc for foc in focals], 67 | tcb=(tension, continuity, bias), 68 | endconditions="closed" if is_loop else "natural", 69 | ) 70 | quats = orientation_spline.evaluate(timestamps) 71 | quat_array = np.array([[quat.scalar, *quat.vector] for quat in quats], dtype=np.float32) 72 | points_array = position_spline.evaluate(timestamps).astype(np.float32) 73 | focal_array = focal_spline.evaluate(timestamps).astype(np.float32) 74 | 75 | # convert back to pyt3d 76 | R = quaternion_to_matrix(torch.from_numpy(quat_array)) 77 | points = torch.from_numpy(points_array).float() 78 | T = torch.bmm(-R.permute(0, 
2, 1), points[..., None])[..., 0] 79 | spline_focal = torch.from_numpy(focal_array) 80 | spline_p0 = pyt3d_camera.principal_point[0].unsqueeze(0).repeat(n_frames, 1) 81 | image_size = pyt3d_camera.image_size[0].unsqueeze(0).repeat(n_frames, 1) 82 | 83 | # scale the cameras based on the scales 84 | if scales == 1: 85 | scales_values = torch.Tensor([1.0 + 0.0 * s for s in range(scales)]) 86 | elif scales == 2: 87 | scales_values = torch.Tensor([1.0 + 0.05 * s for s in range(scales)]) 88 | elif scales == 3: 89 | scales_values = torch.Tensor([0.8 + 0.2 * s for s in range(scales)]) 90 | elif scales == 8: 91 | scales_values = torch.Tensor([0.9 + 0.05 * s for s in range(scales)]) 92 | else: 93 | raise NotImplementedError("Unsupported number of scales for spline fitting. Please configure it manually.") 94 | R_matrices = R[None].repeat(scales, 1, 1, 1) 95 | T_matrices = T[None].repeat(scales, 1, 1) * scales_values.unsqueeze(-1).unsqueeze(-1).repeat(1, n_frames, 1) 96 | 97 | 98 | new_R_matrices = [] 99 | new_T_matrices = [] 100 | from scipy.spatial.transform import Rotation 101 | for i in range(scales): 102 | new_T = T_matrices[i] 103 | # quat = Rotation.from_matrix(R_matrices[i].cpu().numpy()).as_quat() 104 | # rotation_matrix = Rotation.from_quat(quat).as_matrix() 105 | new_R_matrices.append(R_matrices[i]) 106 | new_T_matrices.append(new_T) 107 | new_R_matrices = torch.stack(new_R_matrices).flatten(0, 1) 108 | new_T_matrices = torch.stack(new_T_matrices).flatten(0, 1) 109 | 110 | 111 | spline_focal = spline_focal.repeat(scales, 1) 112 | spline_p0 = spline_p0.repeat(scales, 1) 113 | image_size = image_size.repeat(scales, 1) 114 | 115 | spline_cam = PerspectiveCameras( 116 | R=new_R_matrices, 117 | T=new_T_matrices, 118 | focal_length=spline_focal, 119 | principal_point=spline_p0, 120 | image_size=image_size, 121 | device=R.device, 122 | ) 123 | return spline_cam 124 | 125 | 126 | def write_pyt3d_camera_to_nerfstudio_json(folder, ref_camera, gen_camera, eval_camera=None, has_ply=False, has_mask=False, has_depth=False): 127 | # train jsons 128 | transform = {} 129 | frames_list = [] 130 | # reference_cameras 131 | num_ref_frames = len(ref_camera) 132 | camera_centers = ref_camera.get_camera_center() 133 | R_cv_w2c, tvec_cv, Ks = opencv_from_cameras_projection(ref_camera, image_size=ref_camera.image_size) 134 | for i in range(num_ref_frames): 135 | frame = {} 136 | R_c2w = ref_camera.R[i] 137 | R_c2w_blender = R_c2w.clone() 138 | # convert pytorch3d camera to blender/opengl camera 139 | R_c2w_blender[:, [0, 2]] *= -1 140 | # R_c2w = R_cv_w2c[i]#.T 141 | T_c2w = camera_centers[i].unsqueeze(-1) 142 | c2w = torch.cat([R_c2w_blender, T_c2w], dim=-1) 143 | c2w_homo = torch.cat([c2w, torch.Tensor([[0, 0, 0, 1]])]).float() 144 | frame["file_path"] = f"images/ref_frame_{i:04d}.png" 145 | frame["transform_matrix"] = c2w_homo.tolist() 146 | frame["fl_x"] = Ks[i][0, 0].item() 147 | frame["fl_y"] = Ks[i][1, 1].item() 148 | frame["cx"] = Ks[i][0, 2].item() 149 | frame["cy"] = Ks[i][1, 2].item() 150 | frame["w"] = ref_camera.image_size[0, 1].item() 151 | frame["h"] = ref_camera.image_size[0, 0].item() 152 | if has_mask: 153 | frame["mask_path"] = f"masks/ref_frame_{i:04d}.png" 154 | if has_depth: 155 | frame["depth_file_path"] = f"depths/ref_frame_{i:04d}.npy" 156 | frames_list.append(frame) 157 | # generation cameras 158 | num_gen_frames = len(gen_camera) 159 | camera_centers = gen_camera.get_camera_center() 160 | R_cv_w2c, tvec_cv, Ks = opencv_from_cameras_projection(gen_camera, image_size=gen_camera.image_size) 
161 | for i in range(num_gen_frames): 162 | frame = {} 163 | R_c2w = gen_camera.R[i] 164 | R_c2w_blender = R_c2w.clone() 165 | # convert pytorch3d camera to blender/opengl camera 166 | R_c2w_blender[:, [0, 2]] *= -1 167 | # R_c2w = R_cv_w2c[i]#.T 168 | T_c2w = camera_centers[i].unsqueeze(-1) 169 | c2w = torch.cat([R_c2w_blender, T_c2w], dim=-1) 170 | c2w_homo = torch.cat([c2w, torch.Tensor([[0, 0, 0, 1]])]).float() 171 | frame["file_path"] = f"images/frame_{i:04d}.png" 172 | frame["transform_matrix"] = c2w_homo.tolist() 173 | frame["fl_x"] = Ks[i][0, 0].item() 174 | frame["fl_y"] = Ks[i][1, 1].item() 175 | frame["cx"] = Ks[i][0, 2].item() 176 | frame["cy"] = Ks[i][1, 2].item() 177 | frame["w"] = gen_camera.image_size[0, 1].item() 178 | frame["h"] = gen_camera.image_size[0, 0].item() 179 | if has_mask: 180 | frame["mask_path"] = f"masks/frame_{i:04d}.png" 181 | if has_depth: 182 | frame["depth_file_path"] = f"depths/frame_{i:04d}.npy" 183 | frames_list.append(frame) 184 | transform["frames"] = frames_list 185 | if has_ply: 186 | transform["ply_file_path"] = "ref_pred_pointcloud.ply" 187 | with open(os.path.join(folder, 'transforms_train.json'), 'w') as json_file: 188 | json.dump(transform, json_file, indent=4) 189 | 190 | # test jsons 191 | if eval_camera is not None: 192 | transform = {} 193 | frames_list = [] 194 | # evaluation_cameras 195 | num_eval_frames = len(eval_camera) 196 | camera_centers = eval_camera.get_camera_center() 197 | R_cv_w2c, tvec_cv, Ks = opencv_from_cameras_projection(eval_camera, image_size=eval_camera.image_size) 198 | for i in range(num_eval_frames): 199 | frame = {} 200 | R_c2w = eval_camera.R[i] 201 | R_c2w_blender = R_c2w.clone() 202 | # convert pytorch3d camera to blender/opengl camera 203 | R_c2w_blender[:, [0, 2]] *= -1 204 | # R_c2w = R_cv_w2c[i]#.T 205 | T_c2w = camera_centers[i].unsqueeze(-1) 206 | c2w = torch.cat([R_c2w_blender, T_c2w], dim=-1) 207 | c2w_homo = torch.cat([c2w, torch.Tensor([[0, 0, 0, 1]])]).float() 208 | frame["file_path"] = f"images/eval_frame_{i:04d}.png" 209 | frame["transform_matrix"] = c2w_homo.tolist() 210 | frame["fl_x"] = Ks[i][0, 0].item() 211 | frame["fl_y"] = Ks[i][1, 1].item() 212 | frame["cx"] = Ks[i][0, 2].item() 213 | frame["cy"] = Ks[i][1, 2].item() 214 | frame["w"] = eval_camera.image_size[0, 1].item() 215 | frame["h"] = eval_camera.image_size[0, 0].item() 216 | if has_mask: 217 | frame["mask_path"] = f"masks/eval_frame_{i:04d}.png" 218 | if has_depth: 219 | frame["depth_file_path"] = f"depths/eval_frame_{i:04d}.npy" 220 | frames_list.append(frame) 221 | transform["frames"] = frames_list 222 | with open(os.path.join(folder, 'transforms_test.json'), 'w') as json_file: 223 | json.dump(transform, json_file, indent=4) 224 | -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | import torch 6 | import numpy as np 7 | import os 8 | import cv2 9 | from pytorch3d.renderer import PerspectiveCameras 10 | from pytorch3d.implicitron.tools.point_cloud_utils import ( 11 | render_point_cloud_pytorch3d, 12 | get_rgbd_point_cloud, 13 | ) 14 | 15 | 16 | MOD_FLAG_TABLE = { 17 | 'c': 0, 18 | 'g': 1, 19 | 'x': -1, 20 | } 21 | 22 | class DataHandler(): 23 | '''DataHandler for multi-view multi-modal data''' 24 | def __init__(self, data: dict, pad_length=None, except_keys=None): 25 | if not isinstance(data, dict): 26 | raise ValueError("Input data must be a dictionary.") 27 | self.data = data 28 | self.batch_size, self.num_view_raw = data['view_id'].shape 29 | if pad_length: 30 | self.pad_batch_data_using_first_value(pad_length, except_keys) 31 | 32 | 33 | def pad_batch_data_using_first_value(self, 34 | target_length, 35 | except_keys=['scene_id', 'global_caption', 'num_views', 'train_ids', 'test_ids', 'scene_scale']): 36 | # pad every value to target length 37 | for key in self.data.keys(): 38 | if key in except_keys: continue 39 | elif type(self.data[key]) == dict: 40 | if key == 'mods_flags': 41 | # use -1 (not used flag) for all mod flags 42 | for sub_key in self.data[key].keys(): 43 | current_length = self.data[key][sub_key].size(1) 44 | padding_size = target_length - current_length 45 | padding = torch.ones([1])[None].repeat(self.batch_size, padding_size) * -1 46 | self.data[key][sub_key] = torch.cat([self.data[key][sub_key], padding], dim=1) 47 | else: 48 | raise NotImplementedError() 49 | elif isinstance(self.data[key], torch.Tensor): 50 | current_length = self.data[key].size(1) 51 | if current_length < target_length: 52 | padding_size = target_length - current_length 53 | first_value = self.data[key][:, :1, ...] 54 | padding = first_value.repeat(1, padding_size, *[1] * (self.data[key].dim() - 2)) 55 | self.data[key] = torch.cat([self.data[key], padding], dim=1) 56 | elif isinstance(self.data[key], list): 57 | for i in range(len(self.data[key])): 58 | if isinstance(self.data[key][i], list): 59 | current_length = len(self.data[key][i]) 60 | self.data[key][i].extend([self.data[key][i][0] for _ in range(target_length - current_length)]) 61 | elif isinstance(self.data[key][i], PerspectiveCameras): 62 | current_length = len(self.data[key][i]) 63 | padding_size = target_length - current_length 64 | indices = [k for k in range(current_length)] + [0 for j in range(padding_size)] 65 | self.data[key][i] = self.data[key][i][indices] 66 | # hard code pass pytorch3d camera 67 | elif isinstance(self.data[key][i], str): continue 68 | # hard code pass global caption 69 | else: raise NotImplementedError(f'meet type {type(self.data[key])} not implemented! 
key={key}') 70 | 71 | 72 | def select_via_indices(self, 73 | indices=np.array([0, 1]), 74 | except_keys=['scene_id', 'global_caption', 'num_views', 'train_ids', 'test_ids', 'scene_scale'], 75 | reset_viewid=True): 76 | new_data = {} 77 | for key, value in self.data.items(): 78 | if key in except_keys: 79 | new_data[key] = value 80 | elif isinstance(value, dict): 81 | if key == 'mods_flags': 82 | new_data[key] = {} 83 | for sub_key in value.keys(): 84 | new_data[key][sub_key] = value[sub_key][:, indices].clone() 85 | else: 86 | raise NotImplementedError() 87 | elif isinstance(value, torch.Tensor): 88 | new_data[key] = value[:, indices].clone() 89 | elif isinstance(value, list): 90 | new_list = [] 91 | for item in value: 92 | if isinstance(item, list): 93 | new_list.append([item[idx] for idx in indices]) 94 | elif isinstance(item, PerspectiveCameras): 95 | new_list.append(item[indices.tolist()]) 96 | elif isinstance(item, str): 97 | new_list.append(item) 98 | else: 99 | raise NotImplementedError(f'meet type {type(item)} not implemented! key={key}') 100 | new_data[key] = new_list 101 | elif isinstance(value, bool): 102 | new_data[key] = value 103 | else: 104 | raise NotImplementedError(f'meet type {type(value)} not implemented! key={key}') 105 | 106 | if reset_viewid and 'view_id' in new_data: 107 | bs, num_view = new_data['view_id'].shape 108 | new_data['view_id'] = torch.arange(num_view)[None].repeat(bs, 1) 109 | return new_data 110 | 111 | @staticmethod 112 | def mod_flags_update(batch, mod_flags): 113 | num_view = batch['view_id'].shape[1] 114 | for mod_name, mod_flags in zip(['rgb', 'ray', 'depth'], mod_flags.split(',')): 115 | for view_i, mod_flag in enumerate(mod_flags): 116 | if view_i < int(num_view): 117 | batch['mods_flags'][mod_name][:, view_i] = MOD_FLAG_TABLE[mod_flag] 118 | # force set first-view pose flag as condition 119 | if mod_name == 'ray' and view_i == 0: 120 | batch['mods_flags'][mod_name][:, view_i] = MOD_FLAG_TABLE['c'] 121 | return batch 122 | 123 | def update(self, key, indices, values): 124 | if key in self.data: 125 | self.data[key][:, indices] = values 126 | 127 | def __call__(self, key): 128 | return self.data[key] if key in self.data else None 129 | 130 | 131 | 132 | 133 | def tensor_recursive_to(d: dict, func): 134 | if isinstance(d, (list)): 135 | iterator = range(len(d)) 136 | elif isinstance(d, dict): 137 | iterator = d.keys() 138 | for it in iterator: 139 | if isinstance(d[it], (list, dict, tuple)): 140 | if isinstance(d[it], tuple): 141 | d[it] = list(d[it]) 142 | tensor_recursive_to(d[it], func) 143 | elif isinstance(d[it], (int, float, str, np.ndarray, PerspectiveCameras)): 144 | pass 145 | elif d[it] == None: 146 | pass 147 | else: 148 | d[it] = func(d[it]) 149 | 150 | 151 | def save_compare_image(np_image, path): 152 | N, H, W, C = np_image.shape 153 | np_image = np_image.transpose(1, 0, 2, 3).reshape(H, N * W, C) 154 | os.makedirs(os.path.dirname(path), exist_ok=True) 155 | cv2.imwrite(path, np_image[..., ::-1]) 156 | 157 | 158 | def get_rgbd_point_cloud_numpy(cam, images, depths, depth_masks=None, mask_thr=None): 159 | point_cloud = get_rgbd_point_cloud(cam, images, depths, mask=depth_masks, mask_thr=mask_thr) 160 | points, colors = point_cloud.points_list()[0].detach().numpy(), point_cloud.features_list()[0].detach().numpy() 161 | # remove invalid points 162 | valid_mask = np.isfinite(points).all(axis=1) 163 | points, colors = points[valid_mask], colors[valid_mask] 164 | 165 | return points, colors 
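As a quick orientation to the DataHandler API above, here is a hedged usage sketch with a toy batch; the tensor shapes and the 'cond_image' key are placeholders chosen for illustration and are not taken from the repository's data format.

import numpy as np
import torch
from utils.data_utils import DataHandler

# Toy batch: one scene (B=1) with three views; 'view_id' and 'mods_flags' are the
# keys DataHandler relies on, the image tensor is only a dummy placeholder.
batch = {
    'view_id': torch.arange(3)[None],
    'cond_image': torch.zeros(1, 3, 3, 64, 64),
    'mods_flags': {m: torch.zeros(1, 3) for m in ['rgb', 'ray', 'depth']},
}
handler = DataHandler(batch)

# Mark RGB and pose as conditions and depth as generation (extra flag letters are ignored).
DataHandler.mod_flags_update(handler.data, 'cccccccc,cccccccc,gggggggg')

# Keep only views 0 and 2; view ids are re-indexed to 0..N-1 by default.
subset = handler.select_via_indices(indices=np.array([0, 2]))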
-------------------------------------------------------------------------------- /utils/vis.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 4 | # 5 | import io 6 | import os 7 | import os.path as osp 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import torch 11 | from PIL import Image 12 | 13 | 14 | def unnormalize_image(image): 15 | if isinstance(image, torch.Tensor): 16 | image = image.cpu().numpy() 17 | if image.shape[0] == 3: 18 | image = image.transpose(1, 2, 0) 19 | mean = np.array([0.5, 0.5, 0.5]) 20 | std = np.array([0.5, 0.5, 0.5]) 21 | image = image * std + mean 22 | return (image * 255.0).astype(np.uint8) 23 | 24 | 25 | def plot_to_image(figure, dpi=100): 26 | """Converts matplotlib fig to a png for logging with tf.summary.image.""" 27 | buffer = io.BytesIO() 28 | figure.savefig(buffer, format="raw", dpi=dpi) 29 | plt.close(figure) 30 | buffer.seek(0) 31 | image = np.reshape( 32 | np.frombuffer(buffer.getvalue(), dtype=np.uint8), 33 | newshape=(int(figure.bbox.bounds[3]), int(figure.bbox.bounds[2]), -1), 34 | ) 35 | return image[..., :3] 36 | 37 | 38 | def view_color_coded_images_from_path(image_dir): 39 | cmap = plt.get_cmap("hsv") 40 | num_rows = 2 41 | num_cols = 4 42 | figsize = (num_cols * 2, num_rows * 2) 43 | fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize) 44 | axs = axs.flatten() 45 | 46 | def hidden(x): 47 | return not x.startswith(".") 48 | 49 | image_paths = sorted(os.listdir(image_dir)) 50 | image_paths = list(filter(hidden, image_paths)) 51 | image_paths = image_paths[0 : (min(len(image_paths), 8))] 52 | num_frames = len(image_paths) 53 | 54 | for i in range(num_rows * num_cols): 55 | if i < num_frames: 56 | img = np.asarray(Image.open(osp.join(image_dir, image_paths[i]))) 57 | print(img.shape) 58 | axs[i].imshow(img) 59 | for s in ["bottom", "top", "left", "right"]: 60 | axs[i].spines[s].set_color(cmap(i / (num_frames))) 61 | axs[i].spines[s].set_linewidth(5) 62 | axs[i].set_xticks([]) 63 | axs[i].set_yticks([]) 64 | else: 65 | axs[i].axis("off") 66 | plt.tight_layout() 67 | return fig, num_frames 68 | 69 | 70 | def view_color_coded_images_from_tensor(images): 71 | num_frames = images.shape[0] 72 | cmap = plt.get_cmap("hsv") 73 | num_rows = 2 74 | num_cols = 4 75 | figsize = (num_cols * 2, num_rows * 2) 76 | fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize) 77 | axs = axs.flatten() 78 | for i in range(num_rows * num_cols): 79 | if i < num_frames: 80 | axs[i].imshow(unnormalize_image(images[i])) 81 | for s in ["bottom", "top", "left", "right"]: 82 | axs[i].spines[s].set_color(cmap(i / (num_frames))) 83 | axs[i].spines[s].set_linewidth(5) 84 | axs[i].set_xticks([]) 85 | axs[i].set_yticks([]) 86 | else: 87 | axs[i].axis("off") 88 | plt.tight_layout() 89 | -------------------------------------------------------------------------------- /utils/write_videos.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2025 Apple Inc. All Rights Reserved. 
4 | # 5 | import argparse 6 | import os 7 | import cv2 8 | import imageio 9 | 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--render_root', type=str, default='logs/exp-xxxxx/renders/xxxxx-train-set') 14 | parser.add_argument('--num_samples', type=int, default=80) 15 | parser.add_argument('--num_splines', type=int, default=3) 16 | parser.add_argument('--type', type=str, default='scene') 17 | 18 | args = parser.parse_args() 19 | render_folder = os.path.join(args.render_root, 'train') 20 | pred_root = os.path.join(render_folder, 'rgb') 21 | num_frames = args.num_samples 22 | output_folder = os.path.dirname(os.path.dirname(args.render_root)) 23 | # scene_id = args.render_root.split('/')[-1] 24 | 25 | if args.type == 'scene': 26 | all_frames = sorted(os.listdir(pred_root)) 27 | for i in range(args.num_splines): 28 | st_id, ed_id = i * num_frames, (i + 1) * num_frames 29 | img_list = [] 30 | for j in range(st_id, ed_id): 31 | file = os.path.join(pred_root, f'frame_{j:04d}.png') 32 | img_list.append(cv2.imread(file)[..., ::-1]) 33 | video_file = os.path.join(output_folder, f'3DGS-render-traj{i}.mp4') 34 | imageio.mimsave(video_file, img_list, fps=30) 35 | elif args.type == 'object': 36 | all_frames = sorted(os.listdir(pred_root)) 37 | for i in range(1): 38 | st_id, ed_id = i * num_frames, (i + 1) * num_frames 39 | img_list = [] 40 | first_view_file = os.path.join(pred_root, 'ref_frame_0000.png') 41 | img_list.append(cv2.imread(first_view_file)[..., ::-1]) 42 | for j in range(st_id, ed_id - 1): 43 | file = os.path.join(pred_root, f'frame_{j:04d}.png') 44 | img_list.append(cv2.imread(file)[..., ::-1]) 45 | video_file = os.path.join(output_folder, f'3DGS-render-traj.mp4') 46 | imageio.mimsave(video_file, img_list, fps=30) --------------------------------------------------------------------------------
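For reference, a small sketch of the frame layout write_videos.py above consumes, assuming the defaults used by the scripts (ns-render writing to renders/train/rgb, --num_samples 80); it only lists file names and is not part of the repository.

import os

render_root = 'renders'            # value passed as --render_root in the scripts above
num_samples, num_splines = 80, 3   # defaults of write_videos.py
rgb_dir = os.path.join(render_root, 'train', 'rgb')

# 'scene' mode: num_splines trajectories of num_samples frames each
for i in range(num_splines):
    frames = [f'frame_{j:04d}.png' for j in range(i * num_samples, (i + 1) * num_samples)]
    print(f'trajectory {i}: {os.path.join(rgb_dir, frames[0])} .. {frames[-1]} -> 3DGS-render-traj{i}.mp4')

# 'object' mode: ref_frame_0000.png followed by frame_0000.png .. frame_0078.png -> 3DGS-render-traj.mp4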