├── .gitignore
├── LICENSE
├── README.md
├── arguments.py
├── captioning.py
├── carving.py
├── docs
│   ├── example_vid.gif
│   └── overview.png
├── evaluation.py
├── feature_fusion.py
├── gpt_inference.py
├── material_proposal.py
├── ns_reconstruction.py
├── predict_property.py
├── requirements.txt
├── utils.py
└── visualization.py
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | preds/
3 | viz/
4 | *.pt
5 | my_api_key.py
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 | cover/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 | db.sqlite3-journal
69 |
70 | # Flask stuff:
71 | instance/
72 | .webassets-cache
73 |
74 | # Scrapy stuff:
75 | .scrapy
76 |
77 | # Sphinx documentation
78 | docs/_build/
79 |
80 | # PyBuilder
81 | .pybuilder/
82 | target/
83 |
84 | # Jupyter Notebook
85 | .ipynb_checkpoints
86 |
87 | # IPython
88 | profile_default/
89 | ipython_config.py
90 |
91 | # pyenv
92 | # For a library or package, you might want to ignore these files since the code is
93 | # intended to run in multiple environments; otherwise, check them in:
94 | # .python-version
95 |
96 | # pipenv
97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
100 | # install all needed dependencies.
101 | #Pipfile.lock
102 |
103 | # poetry
104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more
106 | # commonly ignored for libraries.
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 |
110 | # pdm
111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | # in version control.
115 | # https://pdm.fming.dev/#use-with-ide
116 | .pdm.toml
117 |
118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
119 | __pypackages__/
120 |
121 | # Celery stuff
122 | celerybeat-schedule
123 | celerybeat.pid
124 |
125 | # SageMath parsed files
126 | *.sage.py
127 |
128 | # Environments
129 | .env
130 | .venv
131 | env/
132 | venv/
133 | ENV/
134 | env.bak/
135 | venv.bak/
136 |
137 | # Spyder project settings
138 | .spyderproject
139 | .spyproject
140 |
141 | # Rope project settings
142 | .ropeproject
143 |
144 | # mkdocs documentation
145 | /site
146 |
147 | # mypy
148 | .mypy_cache/
149 | .dmypy.json
150 | dmypy.json
151 |
152 | # Pyre type checker
153 | .pyre/
154 |
155 | # pytype static type analyzer
156 | .pytype/
157 |
158 | # Cython debug symbols
159 | cython_debug/
160 |
161 | # PyCharm
162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164 | # and can be added to the global gitignore or merged into this file. For a more nuclear
165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166 | #.idea/
167 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 ajzhai
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NeRF2Physics: Physical Property Understanding from Language-Embedded Feature Fields
2 |
3 | Albert J. Zhai, Yuan Shen, Emily Y. Chen, Gloria X. Wang, Xinlei Wang, Sheng Wang, Kaiyu Guan, Shenlong Wang
4 | University of Illinois at Urbana-Champaign
5 |
6 | CVPR 2024
7 |
8 | [Paper](https://arxiv.org/abs/2404.04242) │ [Project Page](https://ajzhai.github.io/NeRF2Physics/)
9 |
10 |
11 | **Infer physical properties densely in 3D for any object!**
12 | 
13 |
14 | ## Requirements
15 | ### Installing Dependencies
16 | Our method involves 3D reconstruction using [Nerfstudio](https://docs.nerf.studio/). If you want to modify the reconstruction or reproduce it on your own data, you will need to follow the [official instructions to install Nerfstudio](https://docs.nerf.studio/quickstart/installation.html).
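   |
   | If you already have a working PyTorch + CUDA environment, installing the Nerfstudio CLI is typically just the following (see the official instructions above for the full setup, including tiny-cuda-nn):
   | ```
   | pip install nerfstudio
   | ```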
17 |
18 | Besides the initial reconstruction, the rest of our method operates on the extracted depth maps/point cloud (which we provide in our dataset) and thus does not require Nerfstudio. So if you don't wish to run Nerfstudio, the only things you need to do are 1) [install PyTorch](https://pytorch.org/get-started/locally/), and 2) run
19 | ```
20 | pip install -r requirements.txt
21 | ```
22 | to install the remaining dependencies.
23 |
24 | ### BLIP-2 Model
25 | Our method uses [BLIP-2-Flan-T5-XL](https://huggingface.co/Salesforce/blip2-flan-t5-xl) for image captioning. To download the model weights, clone the model repository into the root directory of this repository (you will need Git LFS). You can also download it elsewhere and specify the location via the `--blip2_model_dir` argument.
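   |
   | For example, with Git LFS installed, the weights can be cloned into the repository root like this:
   | ```
   | git lfs install
   | git clone https://huggingface.co/Salesforce/blip2-flan-t5-xl
   | ```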
26 |
27 | ### OpenAI API Key
28 | Our method involves calling GPT via the [OpenAI API](https://platform.openai.com/). This requires having an account with some credits on it (usage will be fairly minimal). Once you have an account, find your API key [here](https://platform.openai.com/api-keys) and set a variable named `OPENAI_API_KEY` to your key in a Python file named `my_api_key.py`. Example (put your API key inside the quotes):
29 | ```
30 | echo "OPENAI_API_KEY = ''" >> ./my_api_key.py
31 | ```
32 |
33 | ## ABO-500 Dataset
34 | We provide **ABO-500**, a dataset of multi-view images of objects from [Amazon Berkeley Objects (ABO)](https://amazon-berkeley-objects.s3.amazonaws.com/index.html) with camera parameters and ground-truth object weights. We also provide the intermediate outputs from our method so that you can run any part of our method without needing to run the previous parts. Please download the dataset via [this Box link](https://uofi.box.com/shared/static/743ydh4n1xi0dj05lcyyg4evqk2n4dko.zip)
35 | and unzip it into a directory `data/` (you can also put it elsewhere and specify `--data_dir` later).
36 |
37 | Example with curl:
38 | ```
39 | curl -L https://uofi.box.com/shared/static/743ydh4n1xi0dj05lcyyg4evqk2n4dko.zip \
40 | --output ./abo_500.zip
41 | unzip ./abo_500.zip -d ./data/
42 | ```
43 |
44 | ## Usage
45 | Overview of our method:
46 | 
47 | We provide separate Python scripts for running each component in our method. Command-line arguments for all of the scripts can be found in `arguments.py`. Intermediate outputs get stored in the scene data directories. If you are using our provided dataset, you can start from anywhere along the pipeline without running the previous components.
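   |
   | All of the scripts share the scene-selection arguments from `arguments.py`, so any stage can be restricted to a subset of scenes. Example (indices and paths are illustrative):
   | ```
   | python feature_fusion.py --data_dir ./data/abo_500/ --start_idx 0 --end_idx 10
   | ```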
48 |
49 | ### 3D Reconstruction
50 | We use Nerfstudio to train NeRFs and extract depth maps and point clouds. We have wrapped all of the Nerfstudio commands into `ns_reconstruction.py`. Example (only processes one scene):
51 | ```
52 | python ns_reconstruction.py --end_idx 1
53 | ```
54 | This is the only step that requires Nerfstudio.
55 |
56 |
57 | ### CLIP Feature Fusion
58 | Feature fusion is done in `feature_fusion.py`. Example (only processes one scene):
59 | ```
60 | python feature_fusion.py --end_idx 1
61 | ```
62 |
63 | ### Captioning and View Selection
64 | Captioning and view selection are both done in `captioning.py`. Example (only processes one scene):
65 | ```
66 | python captioning.py --end_idx 1
67 | ```
68 | This requires downloading the BLIP-2 model (see Requirements above).
69 |
70 |
71 | ### LLM Material Proposal
72 | Material proposal is done in `material_proposal.py`. You can specify the physical property of interest using the `--property_name` argument. Currently, only mass density, friction, and Shore hardness are supported, but feel free to make prompts for other properties (see `gpt_inference.py`). Example (only processes one scene):
73 | ```
74 | python material_proposal.py --property_name density --end_idx 1
75 | ```
76 | This requires setting your OpenAI API Key (see Requirements above).
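   |
   | For reference, after this step the scene's saved info JSON (default name `info_new.json`) contains the raw GPT outputs alongside the caption, roughly like the following (values are hypothetical):
   | ```
   | {
   |     "idx_to_caption": "12",
   |     "caption": "a wooden chair with a fabric seat",
   |     "candidate_materials_density": "(wood: 600-800 kg/m^3);(fabric: 200-400 kg/m^3);...",
   |     "thickness": "(wood: 1.0-2.0 cm);(fabric: 0.2-0.5 cm);..."
   | }
   | ```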
77 |
78 | ### CLIP-based Kernel Regression (Final Prediction)
79 | Physical properties are predicted using CLIP-based kernel regression in `predict_property.py`. Example (only processes one scene):
80 | ```
81 | python predict_property.py --property_name density --end_idx 1
82 | ```
83 | By default, the script will predict a volume integral of the physical property (e.g. predicting mass by integrating density). You can instead get dense results for a 3D grid of points by setting `--prediction_mode` to `grid`. You can also write your own code using the `predict_physical_property_query` function to query points however you want.
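   |
   | For example, to get dense per-point predictions on a grid instead of the volume integral:
   | ```
   | python predict_property.py --property_name density --prediction_mode grid --end_idx 1
   | ```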
84 |
85 | ### Evaluation
86 | We provide a script for quantitative evaluation of mass predictions in `evaluation.py`. The results will be printed in your terminal. Example:
87 | ```
88 | python evaluation.py
89 | ```
90 | Explanations of each metric can be found in our paper.
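   |
   | For reference, `evaluation.py` expects the predictions JSON to map scene names to a `[low, high]` mass estimate and the ground-truth JSON to map the product ID (the part of the scene name before the underscore) to the true weight. Hypothetical entries:
   | ```
   | preds_mass.json:               {"B075YQXRBS_ATVPDKIKX0DER": [3.1, 4.6], ...}
   | filtered_product_weights.json: {"B075YQXRBS": 3.8, ...}
   | ```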
91 |
92 | ### Visualization
93 | We provide a script for interactively viewing and saving 3D visualizations in `visualization.py`. You should specify the scene name using the `--scene_name` argument. Example:
94 | ```
95 | python visualization.py --scene_name B075YQXRBS_ATVPDKIKX0DER
96 | ```
97 |
98 | ## Using Custom Data
99 | To run our method on your own data, you can use [Nerfstudio's data processing tool](https://docs.nerf.studio/quickstart/custom_dataset.html) to convert your data into the right format. You can then run the components of our method in order.
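   |
   | Example with Nerfstudio installed (paths are placeholders; the scripts expect each scene under `<data_dir>/scenes/<scene_name>`):
   | ```
   | ns-process-data images --data /path/to/your/images --output-dir ./data/custom/scenes/my_scene
   | python ns_reconstruction.py --data_dir ./data/custom/
   | # then run feature_fusion.py, captioning.py, material_proposal.py, and predict_property.py as above
   | ```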
100 |
101 |
102 | ## Citation
103 | Please cite our paper if you find this repo useful!
104 | ```bibtex
105 | @inproceedings{zhai2024physical,
106 | title={Physical Property Understanding from Language-Embedded Feature Fields},
107 | author={Zhai, Albert J and Shen, Yuan and Chen, Emily Y and Wang, Gloria X and Wang, Xinlei and Wang, Sheng and Guan, Kaiyu and Wang, Shenlong},
108 | booktitle={CVPR},
109 | year={2024}
110 | }
111 | ```
--------------------------------------------------------------------------------
/arguments.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | def get_args():
5 | parser = argparse.ArgumentParser(description='NeRF2Physics')
6 |
7 | # General arguments
8 | parser.add_argument('--data_dir', type=str, default="./data/abo_500/",
9 | help='path to data (default: ./data/abo_500/)')
10 | parser.add_argument('--split', type=str, default="all",
11 | help='dataset split, either train, val, train+val, test, or all (default: all)')
12 | parser.add_argument('--start_idx', type=int, default=0,
13 | help='starting scene index, useful for evaluating only a few scenes (default: 0)')
14 | parser.add_argument('--end_idx', type=int, default=-1,
15 | help='ending scene index, useful for evaluating only a few scenes (default: -1)')
16 | parser.add_argument('--different_Ks', type=int, default=0,
17 | help='whether data has cameras with different intrinsic matrices (default: 0)')
18 | parser.add_argument('--device', type=str, default="cuda",
19 | help='device for torch (default: cuda)')
20 |
21 | # NeRF training
22 | parser.add_argument('--training_iters', type=int, default=20000,
23 | help='number of training iterations (default: 20000)')
24 | parser.add_argument('--near_plane', type=float, default=0.4,
25 | help='near plane for ray sampling (default: 0.4)')
26 | parser.add_argument('--far_plane', type=float, default=6.0,
27 | help='far plane for ray sampling (default: 6.0)')
28 | parser.add_argument('--vis_mode', type=str, default='wandb',
29 | help='nerfstudio visualization mode (default: wandb)')
30 | parser.add_argument('--project_name', type=str, default='NeRF2Physics',
31 | help='project name used by wandb (default: NeRF2Physics)')
32 |
33 | # NeRF point cloud
34 | parser.add_argument('--num_points', type=int, default=100000,
35 | help='number of points for point cloud (default: 100000)')
36 | parser.add_argument('--bbox_size', type=float, default=1.0,
37 | help='bounding box (cube) size, relative to scaled scene (default: 1.0)')
38 |
39 | # CLIP feature fusion
40 | parser.add_argument('--patch_size', type=int, default=56,
41 | help='patch size (default: 56)')
42 | parser.add_argument('--batch_size', type=int, default=16,
43 | help='batch size (default: 16)')
44 | parser.add_argument('--feature_voxel_size', type=float, default=0.01,
45 | help='voxel downsampling size for features, relative to scaled scene (default: 0.01)')
46 | parser.add_argument('--feature_save_name', type=str, default="ps56",
47 | help='feature save name (default: ps56)')
48 | parser.add_argument('--occ_thr', type=float, default=0.01,
49 | help='occlusion threshold, relative to scaled scene (default: 0.01)')
50 |
51 | # Captioning and view selection
52 | parser.add_argument('--blip2_model_dir', type=str, default="./blip2-flan-t5-xl",
53 | help='path to BLIP2 model directory (default: ./blip2-flan-t5-xl)')
54 | parser.add_argument('--mask_area_percentile', type=float, default=0.75,
55 | help='mask area percentile for canonical view (default: 0.75)')
56 | parser.add_argument('--caption_save_name', type=str, default="info_new",
57 | help='caption save name (default: info_new)')
58 |
59 | # Material proposal
60 | parser.add_argument('--caption_load_name', type=str, default="info_new",
61 | help='name of saved caption to load (default: info_new)')
62 | parser.add_argument('--property_name', type=str, default="density",
63 | help='property to predict (default: density)')
64 | parser.add_argument('--include_thickness', type=int, default=1,
65 | help='whether to also predict thickness (default: 1)')
66 | parser.add_argument('--gpt_model_name', type=str, default="gpt-3.5-turbo",
67 | help='GPT model name (default: gpt-3.5-turbo)')
68 | parser.add_argument('--mats_save_name', type=str, default="info_new",
69 | help='candidate materials save name (default: info_new)')
70 |
71 | # Physical property prediction (uses property_name argument from above)
72 | parser.add_argument('--mats_load_name', type=str, default="info",
73 | help='candidate materials load name (default: info)')
74 | parser.add_argument('--feature_load_name', type=str, default="ps56",
75 | help='feature load name (default: ps56)')
76 | parser.add_argument('--prediction_mode', type=str, default="integral",
77 | help="can be either 'integral' or 'grid' (default: integral)")
78 | parser.add_argument('--temperature', type=float, default=0.1,
79 | help='softmax temperature for kernel regression (default: 0.1)')
80 | parser.add_argument('--sample_voxel_size', type=float, default=0.005,
81 | help='voxel downsampling size for sampled points, relative to scaled scene (default: 0.005)')
82 | parser.add_argument('--volume_method', type=str, default="thickness",
83 | help="method for volume estimation, either 'thickness' or 'carving' (default: thickness)")
84 | parser.add_argument('--correction_factor', type=float, default=0.6,
85 | help='correction factor for integral prediction (default: 0.6)')
86 | parser.add_argument('--show_mat_seg', type=int, default=0,
87 | help="whether to show visualization of material segmentation (default: 0)")
88 | parser.add_argument('--save_preds', type=int, default=1,
89 | help='whether to save predictions (default: 1)')
90 | parser.add_argument('--preds_save_name', type=str, default="mass",
91 | help='predictions save name (default: mass)')
92 |
93 | # Evaluation
94 | parser.add_argument('--preds_json_path', type=str, default="./preds/preds_mass.json",
95 | help='path to predictions JSON file (default: ./preds/preds_mass.json)')
96 | parser.add_argument('--gts_json_path', type=str, default="./data/abo_500/filtered_product_weights.json",
97 | help='path to ground truth JSON file (default: ./data/abo_500/filtered_product_weights.json)')
98 | parser.add_argument('--clamp_min', type=float, default=0.01,
99 | help='minimum value to clamp predictions (default: 0.01)')
100 | parser.add_argument('--clamp_max', type=float, default=100.,
101 | help='maximum value to clamp predictions (default: 100.)')
102 |
103 | # Visualization
104 | parser.add_argument('--scene_name', type=str,
105 | help='scene name for visualization (must be provided)')
106 | parser.add_argument('--show', type=int, default=1,
107 | help='whether to show interactive viewer (default: 1)')
108 | parser.add_argument('--compositing_alpha', type=float, default=0.2,
109 | help='alpha for compositing with RGB image (default: 0.2)')
110 | parser.add_argument('--cmap_min', type=float, default=500,
111 | help='minimum physical property value for colormap (default: 500)')
112 | parser.add_argument('--cmap_max', type=float, default=3500,
113 | help='maximum physical property value for colormap (default: 3500)')
114 | parser.add_argument('--viz_save_name', type=str, default="tmp",
115 | help='visualization save name (default: tmp)')
116 |
117 | args = parser.parse_args()
118 |
119 | return args
--------------------------------------------------------------------------------
/captioning.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import json
4 | import matplotlib.pyplot as plt
5 | import torch
6 | from transformers import AutoProcessor, Blip2ForConditionalGeneration
7 |
8 | from utils import load_images, get_scenes_list
9 | from arguments import get_args
10 |
11 |
12 | CAPTIONING_PROMPT = "Question: Give a detailed description of the object. Answer:"
13 |
14 |
15 | def load_blip2(model_name, device='cuda'):
16 | processor = AutoProcessor.from_pretrained(model_name)
17 | model = Blip2ForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float16)
18 | model = model.to(device)
19 | model.eval()
20 | return model, processor
21 |
22 |
23 | def display_model_size(model):
24 | param_size = 0
25 | for param in model.parameters():
26 | param_size += param.nelement() * param.element_size()
27 | buffer_size = 0
28 | for buffer in model.buffers():
29 | buffer_size += buffer.nelement() * buffer.element_size()
30 |
31 | size_all_mb = (param_size + buffer_size) / 1024**2
32 | print('model size: {:.3f}MB'.format(size_all_mb))
33 |
34 |
35 | def generate_text(img, model, processor, prompt=CAPTIONING_PROMPT, device='cuda'):
36 | if prompt is not None:
37 | inputs = processor(img, text=prompt, return_tensors="pt").to(device, torch.float16)
38 | else:
39 | inputs = processor(img, return_tensors="pt").to(device, torch.float16)
40 |
41 | generated_ids = model.generate(**inputs, max_new_tokens=30)
42 | generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
43 | return generated_text
44 |
45 |
46 | def predict_caption(args, scene_dir, vqa_model, vqa_processor, show=False):
47 | img_dir = os.path.join(scene_dir, 'images')
48 | imgs, masks = load_images(img_dir, return_masks=True)
49 | mask_areas = [np.mean(mask) for mask in masks]
50 |
51 | idx_to_caption = np.argsort(mask_areas)[int(len(mask_areas) * args.mask_area_percentile)]
52 | img_to_caption = imgs[idx_to_caption]
53 |
54 | with torch.no_grad():
55 | caption = generate_text(img_to_caption, vqa_model, vqa_processor, device=args.device)
56 |
57 | info = {'idx_to_caption': str(idx_to_caption), 'caption': caption}
58 |
59 | print('scene: %s, info:' % os.path.basename(scene_dir), info)
60 | if show:
61 | plt.imshow(img_to_caption)
62 | plt.show()
63 |
64 | # save info to json
65 | with open(os.path.join(scene_dir, '%s.json' % args.caption_save_name), 'w') as f:
66 | json.dump(info, f, indent=4)
67 |
68 | return info
69 |
70 |
71 | if __name__ == '__main__':
72 |
73 | args = get_args()
74 |
75 | scenes_dir = os.path.join(args.data_dir, 'scenes')
76 | scenes = get_scenes_list(args)
77 |
78 | model, processor = load_blip2(args.blip2_model_dir, device=args.device)
79 |
80 | for j, scene in enumerate(scenes):
81 | caption_info = predict_caption(args, os.path.join(scenes_dir, scene), model, processor)
82 |
83 |
--------------------------------------------------------------------------------
/carving.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import open3d as o3d
4 | import time
5 | import torch
6 | from PIL import Image
7 | from utils import *
8 |
9 |
10 | def get_bounding_box(pts, percentile=1.0, buffer=0.1):
11 | """Get the bounding box of a point cloud."""
12 | xyz1 = np.percentile(pts, percentile, axis=0)
13 | xyz2 = np.percentile(pts, 100 - percentile, axis=0)
14 | lwh = xyz2 - xyz1
15 | xyz1 -= buffer * lwh
16 | xyz2 += buffer * lwh
17 | return xyz1, xyz2
18 |
19 |
20 | def get_grid_points(xyz1, xyz2, grid_cell_size):
21 | """Get grid points."""
22 | x1, y1, z1 = xyz1
23 | x2, y2, z2 = xyz2
24 | x = np.arange(x1, x2, grid_cell_size)
25 | y = np.arange(y1, y2, grid_cell_size)
26 | z = np.arange(z1, z2, grid_cell_size)
27 | xx, yy, zz = np.meshgrid(x, y, z)
28 | grid_pts = np.stack([xx, yy, zz], axis=-1).reshape(-1, 3)
29 | return grid_pts
30 |
31 |
32 |
33 | def project_3d_to_2d(pts, w2c, K, return_dists=False):
34 | """Project 3D points to 2D (nerfstudio format)."""
35 | pts = np.array(pts)
36 | K = np.hstack([K, np.zeros((3, 1))])
37 | pts = np.concatenate([pts, np.ones((pts.shape[0], 1))], axis=1)
38 | pts = np.dot(pts, w2c.T)
39 | pts[:, [1, 2]] *= -1
40 | if return_dists:
41 | dists = np.linalg.norm(pts[:, :3], axis=-1)
42 | pts = np.dot(pts, K.T)
43 | pts_2d = pts[:, :2] / pts[:, 2:]
44 | if return_dists:
45 | return pts_2d, dists
46 | return pts_2d
47 |
48 | def project_3d_to_2d_torch(pts, w2c, K, return_dists=False):
49 | """Project 3D points to 2D (nerfstudio format)."""
50 | device = pts.device
51 | K = torch.cat([K, torch.zeros((3, 1), device=device)], 1)
52 | pts = torch.cat([pts, torch.ones((pts.shape[0], 1), device=device)], 1)
53 | pts = torch.matmul(pts, w2c.t())
54 | pts[:, [1, 2]] *= -1
55 | if return_dists:
56 | dists = torch.norm(pts[:, :3], dim=-1)
57 | pts = torch.matmul(pts, K.t())
58 | pts_2d = pts[:, :2] / pts[:, 2:]
59 | if return_dists:
60 | return pts_2d, dists
61 | return pts_2d
62 |
63 |
64 | def depth_to_distance(depth, K):
65 | """Convert depth map to distance from camera."""
66 | h, w = depth.shape
67 | x, y = np.meshgrid(np.arange(w), np.arange(h))
68 | x = x.flatten()
69 | y = y.flatten()
70 | depth = depth.flatten()
71 | pts = np.stack([x, y, np.ones_like(x)], axis=1)
72 | pts = np.dot(pts, np.linalg.inv(K).T)
73 | pts *= depth[:, None]
74 | dists = np.linalg.norm(pts, axis=1)
75 | dists = dists.reshape(h, w)
76 | return dists
77 |
78 |
79 | def depth_to_distance_torch(depth, K):
80 | """Convert depth map to distance from camera."""
81 | h, w = depth.shape
82 | x, y = torch.meshgrid(torch.arange(w), torch.arange(h), indexing='xy')  # match the 'xy' indexing of np.meshgrid used in depth_to_distance
83 | x = x.flatten()
84 | y = y.flatten()
85 | depth = depth.flatten()
86 | pts = torch.stack([x, y, torch.ones_like(x)], dim=1).float().to(depth.device)
87 | pts = torch.matmul(pts, torch.inverse(K).t())
88 | pts *= depth[:, None]
89 | dists = torch.norm(pts, dim=1)
90 | dists = dists.reshape(h, w)
91 | return dists
92 |
93 |
94 | def carve_numpy(pts, masks, depths, w2cs, K, dist_thr):
95 | n_imgs = len(masks)
96 |
97 | for i in range(n_imgs):
98 | h, w = masks[i].shape
99 | pts_2d, dists = project_3d_to_2d(pts, w2cs[i], K, return_dists=True)
100 | pts_2d = np.round(pts_2d).astype(np.int32)
101 | pts_2d = np.clip(pts_2d, 0, [w - 1, h - 1])
102 |
103 | observed_dists = depths[i]
104 |
105 | is_in_mask = masks[i][pts_2d[:, 1], pts_2d[:, 0]]
106 | is_behind_depth = dists > observed_dists[pts_2d[:, 1], pts_2d[:, 0]] - dist_thr
107 | pts = pts[is_in_mask & is_behind_depth]
108 |
109 | return pts
110 |
111 | def carve_torch(pts, masks, depths, w2cs, K, dist_thr, mask_only=False):
112 | n_imgs = len(masks)
113 |
114 | with torch.no_grad():
115 | mask_votes = torch.zeros(len(pts), device=pts.device, dtype=torch.int32)
116 | depth_votes = torch.zeros(len(pts), device=pts.device, dtype=torch.int32)
117 | for i in range(n_imgs):
118 | h, w = masks[i].shape
119 | pts_2d, dists = project_3d_to_2d_torch(pts, w2cs[i], K, return_dists=True)
120 | pts_2d = torch.round(pts_2d).long().to(pts.device)
121 | pts_2d[:, 0] = torch.clamp(pts_2d[:, 0], 0, w - 1)
122 | pts_2d[:, 1] = torch.clamp(pts_2d[:, 1], 0, h - 1)
123 |
124 | observed_dists = depths[i]
125 |
126 | is_in_mask = masks[i][pts_2d[:, 1], pts_2d[:, 0]]
127 | is_behind_depth = dists > observed_dists[pts_2d[:, 1], pts_2d[:, 0]] - dist_thr
128 | mask_votes[is_in_mask] += 1
129 | depth_votes[is_behind_depth] += 1
130 | if mask_only:
131 | pts = pts[mask_votes == n_imgs]
132 | else:
133 | pts = pts[(mask_votes == n_imgs) & (depth_votes == n_imgs)]
134 |
135 | return pts
136 |
137 |
138 | def get_carved_pts(scene_dir, grid_cell_size_ns=1/512, dist_thr_ns=0.01, verbose=False, device='cuda'):
139 | scene_name = os.path.basename(scene_dir)
140 | pcd_file = os.path.join(scene_dir, 'ns', 'point_cloud.ply')
141 | dt_file = os.path.join(scene_dir, 'ns', 'dataparser_transforms.json')
142 | t_file = os.path.join(scene_dir, 'transforms.json')
143 | img_dir = os.path.join(scene_dir, 'images')
144 | depth_dir = os.path.join(scene_dir, 'ns', 'renders', 'depth')
145 |
146 | pts = load_ns_point_cloud(pcd_file, dt_file)
147 | w2cs, K = parse_transforms_json(t_file, return_w2c=True)
148 | ns_transform, scale = parse_dataparser_transforms_json(dt_file)
149 | imgs, masks = load_images(img_dir, return_masks=True)
150 | depths = load_depths(depth_dir, Ks=None)
151 |
152 | xyz1, xyz2 = get_bounding_box(pts)
153 | grid_cell_size = grid_cell_size_ns / scale
154 | grid_pts = get_grid_points(xyz1, xyz2, grid_cell_size)
155 | dist_thr = dist_thr_ns / scale
156 |
157 | grid_pts = torch.from_numpy(grid_pts).float().to(device)
158 | masks = [torch.from_numpy(mask).to(device) for mask in masks]
159 | depths = [torch.from_numpy(depth).to(device) for depth in depths]
160 | w2cs = [torch.from_numpy(w2c).float().to(device) for w2c in w2cs]
161 | K = torch.from_numpy(K).float().to(device)
162 |
163 | carved = carve_torch(grid_pts, masks, depths, w2cs, K, dist_thr)
164 | if verbose:
165 | print('scene: %s, num. surface points: %d, num. carved points: %d, scale: %.4f' %
166 | (scene_name, len(pts), len(carved), scale))
167 |
168 | return carved, grid_cell_size
169 |
170 |
171 | if __name__ == '__main__':
172 | scene_dir = '/home/azhai/n2p/data/debug/B075X4J15G_ATVPDKIKX0DER'
173 |
174 | carved, grid_cell_size = get_carved_pts(scene_dir)
175 | carved = carved.cpu().numpy()
176 | pcd = o3d.geometry.PointCloud()
177 | pcd.points = o3d.utility.Vector3dVector(carved)
178 | o3d.visualization.draw_geometries([pcd])
--------------------------------------------------------------------------------
/docs/example_vid.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajzhai/NeRF2Physics/6d81c093ed05434f31c0c735fdad1e51355bde86/docs/example_vid.gif
--------------------------------------------------------------------------------
/docs/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajzhai/NeRF2Physics/6d81c093ed05434f31c0c735fdad1e51355bde86/docs/overview.png
--------------------------------------------------------------------------------
/evaluation.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 |
4 | from arguments import get_args
5 |
6 |
7 | # preds should be Nx2, gts should be N
8 | def ADE(preds, gts):
9 | point_preds = np.mean(preds, axis=1)
10 | ade = np.abs(point_preds - gts)
11 | return np.mean(ade)
12 |
13 | def ALDE(preds, gts):
14 | point_preds = np.mean(preds, axis=1)
15 | alde = np.abs(np.log(point_preds) - np.log(gts))
16 | return np.mean(alde)
17 |
18 | def APE(preds, gts):
19 | point_preds = np.mean(preds, axis=1)
20 | ape = np.abs(point_preds - gts) / gts
21 | return np.mean(ape)
22 |
23 | def MnRE(preds, gts):  # mean of min(pred/gt, gt/pred); 1.0 means a perfect prediction
24 | point_preds = np.mean(preds, axis=1)
25 | p_over_t = point_preds / gts
26 | t_over_p = gts / point_preds
27 | ratios = np.vstack([p_over_t, t_over_p])
28 | mnre = np.min(ratios, axis=0)
29 | return np.mean(mnre)
30 |
31 |
32 | def show_metrics(preds, gts):
33 | print('ADE %.3f' % ADE(preds, gts))
34 | print('ALDE %.3f' % ALDE(preds, gts))
35 | print('APE %.3f' % APE(preds, gts))
36 | print('MnRE %.3f' % MnRE(preds, gts))
37 |
38 |
39 | if __name__ == '__main__':
40 |
41 | args = get_args()
42 |
43 | with open(args.preds_json_path, 'r') as f:
44 | preds_dict = json.load(f)
45 | with open(args.gts_json_path, 'r') as f:
46 | gts_dict = json.load(f)
47 |
48 | preds = np.zeros((len(preds_dict), 2))
49 | gts = np.zeros(len(preds_dict))
50 | for i, (k, v) in enumerate(preds_dict.items()):
51 | preds[i] = v
52 | gts[i] = gts_dict[k.split('_')[0]]
53 | print(preds, gts)
54 |
55 | show_metrics(preds, gts)
56 |
--------------------------------------------------------------------------------
/feature_fusion.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import json
4 | import torch
5 | import open_clip
6 | from PIL import Image
7 |
8 | from utils import *
9 | from arguments import get_args
10 |
11 |
12 | CLIP_BACKBONE = 'ViT-B-16'
13 | CLIP_CHECKPOINT = 'datacomp_xl_s13b_b90k'
14 | CLIP_INPUT_SIZE = 224
15 | CLIP_OUTPUT_SIZE = 512
16 |
17 |
18 | def get_patch_features(pts, imgs, depths, w2cs, K, model, preprocess_fn, occ_thr,
19 | patch_size=56, batch_size=8, device='cuda'):
20 | n_imgs = len(imgs)
21 | n_pts = len(pts)
22 |
23 | patch_features = torch.zeros(n_imgs, n_pts, CLIP_OUTPUT_SIZE, device=device, requires_grad=False)
24 | is_visible = torch.zeros(n_imgs, n_pts, device=device, dtype=torch.bool, requires_grad=False)
25 | half_patch_size = patch_size // 2
26 |
27 | K = np.array(K)
28 | with torch.no_grad(), torch.cuda.amp.autocast():
29 | model.to(device)
30 |
31 | for i in range(n_imgs):
32 | h, w, c = imgs[i].shape
33 | if len(K.shape) == 3:
34 | curr_K = K[i]
35 | else:
36 | curr_K = K
37 | pts_2d, dists = project_3d_to_2d(pts, w2cs[i], curr_K, return_dists=True)
38 | pts_2d = np.round(pts_2d).astype(np.int32)
39 |
40 | observed_dists = depths[i]
41 |
42 | # loop through pts in batches
43 | for batch_start in range(0, n_pts, batch_size):
44 | curr_batch_size = min(batch_size, n_pts - batch_start)
45 | batch_patches = torch.zeros(curr_batch_size, 3, CLIP_INPUT_SIZE, CLIP_INPUT_SIZE, device=device)
46 |
47 | for j in range(curr_batch_size):
48 | x, y = pts_2d[batch_start + j]
49 |
50 | if x >= half_patch_size and x < w - half_patch_size and \
51 | y >= half_patch_size and y < h - half_patch_size:
52 | is_occluded = dists[batch_start + j] > observed_dists[y, x] + occ_thr
53 | if not is_occluded:
54 | patch = imgs[i][y - half_patch_size:y + half_patch_size, x - half_patch_size:x + half_patch_size]
55 | patch = Image.fromarray(patch)
56 |
57 | patch = preprocess_fn(patch).unsqueeze(0).to(device)
58 | batch_patches[j] = patch
59 | is_visible[i, batch_start + j] = True
60 |
61 | if is_visible[i, batch_start:batch_start + curr_batch_size].any():
62 | patch_features[i, batch_start:batch_start + curr_batch_size] = model.encode_image(batch_patches)
63 |
64 | return patch_features, is_visible
65 |
66 |
67 | def process_scene(args, scene_dir, model, preprocess_fn):
68 |
69 | scene_name = os.path.basename(scene_dir)
70 | pcd_file = os.path.join(scene_dir, 'ns', 'point_cloud.ply')
71 | dt_file = os.path.join(scene_dir, 'ns', 'dataparser_transforms.json')
72 | t_file = os.path.join(scene_dir, 'transforms.json')
73 | img_dir = os.path.join(scene_dir, 'images')
74 | depth_dir = os.path.join(scene_dir, 'ns', 'renders', 'depth')
75 |
76 | pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=args.feature_voxel_size)
77 | w2cs, K = parse_transforms_json(t_file, return_w2c=True, different_Ks=args.different_Ks)
78 | ns_transform, scale = parse_dataparser_transforms_json(dt_file)
79 | imgs = load_images(img_dir)
80 | depths = load_depths(depth_dir, Ks=None)
81 |
82 | print('scene: %s, points: %d, scale: %.4f' % (scene_name, len(pts), scale))
83 |
84 | with torch.no_grad():
85 | occ_thr = args.occ_thr * scale
86 | patch_features, is_visible = get_patch_features(pts, imgs, depths, w2cs, K,
87 | model, preprocess_fn,
88 | occ_thr, patch_size=args.patch_size, batch_size=args.batch_size,
89 | device=args.device)
90 |
91 | out_dir = os.path.join(scene_dir, 'features')
92 | os.makedirs(out_dir, exist_ok=True)
93 | torch.save(patch_features, os.path.join(out_dir, 'patch_features_%s.pt' % args.feature_save_name))
94 | torch.save(is_visible, os.path.join(out_dir, 'is_visible_%s.pt' % args.feature_save_name))
95 | with open(os.path.join(out_dir, 'voxel_size_%s.json' % args.feature_save_name), 'w') as f:
96 | json.dump({'voxel_size': args.feature_voxel_size}, f, indent=4)
97 |
98 | return pts, patch_features, is_visible
99 |
100 |
101 | if __name__ == '__main__':
102 |
103 | args = get_args()
104 |
105 | scenes_dir = os.path.join(args.data_dir, 'scenes')
106 | scenes = get_scenes_list(args)
107 |
108 | model, _, preprocess = open_clip.create_model_and_transforms(CLIP_BACKBONE, pretrained=CLIP_CHECKPOINT)
109 | model.to(args.device)
110 |
111 | for j, scene in enumerate(scenes):
112 | pts, patch_features, is_visible = process_scene(args, os.path.join(scenes_dir, scene), model, preprocess)
113 |
114 |
--------------------------------------------------------------------------------
/gpt_inference.py:
--------------------------------------------------------------------------------
1 | import openai
2 | import base64
3 | import json
4 |
5 |
6 | PRED_CAND_MATS_DENSITY_SYS_MSG = """You will be provided with captions that each describe an image of an object. The captions will be delimited with quotes ("). Based on the caption, give me 5 materials that the object might be made of, along with the mass densities (in kg/m^3) of each of those materials. You may provide a range of values for the mass density instead of a single value. Try to consider all the possible parts of the object. Do not include coatings like "paint" in your answer.
7 |
8 | Format Requirement:
9 | You must provide your answer as a list of 5 (material: mass density) pairs, each separated by a semi-colon (;). Do not include any other text in your answer, as it will be parsed by a code script later. Your answer must look like:
10 | (material 1: low-high kg/m^3);(material 2: low-high kg/m^3);(material 3: low-high kg/m^3);(material 4: low-high kg/m^3);(material 5: low-high kg/m^3)
11 | """
12 |
13 | PRED_CAND_MATS_DENSITY_SYS_MSG_4V = """You will be given an image of an object. Based on the image, give me a short (5-10 words) description of what the object is, and also 5 materials (e.g. wood, plastic, foam) that the object might be made of, along with the mass densities (in kg/m^3) of each of those materials. You may provide a range of values for the mass density instead of a single value. Try to consider all the possible parts of the object. Do not include coatings like "paint" in your answer.
14 |
15 | Format Requirement:
16 | You must provide your answer in the following JSON format, as it will be parsed by a code script later. Your answer must look like:
17 | {
18 | "description": description,
19 | "materials": [
20 | {"name": material1, "mass density (kg/m^3)": low-high},
21 | {"name": material2, "mass density (kg/m^3)": low-high},
22 | {"name": material3, "mass density (kg/m^3)": low-high},
23 | {"name": material4, "mass density (kg/m^3)": low-high},
24 | {"name": material5, "mass density (kg/m^3)": low-high}
25 | ]
26 | }
27 | Do not include any other text in your answer. Do not include unnecessary words besides the material in the material name.
28 | """
29 |
30 |
31 | PRED_CAND_MATS_HARDNESS_SYS_MSG = """You will be provided with captions that each describe an image of an object. The captions will be delimited with quotes ("). Based on the caption, give me 3 materials that the object might be made of, along with the hardness of each of those materials. Choose whether to use Shore A hardness or Shore D hardness depending on the material. You may provide a range of values for hardness instead of a single value. Try to consider all the possible parts of the object.
32 |
33 | Format Requirement:
34 | You must provide your answer as a list of 3 (material: hardness, Shore A/D) tuples, each separated by a semi-colon (;). Do not include any other text in your answer, as it will be parsed by a code script later. Your answer must look like:
35 | (material 1: low-high, Shore A/D);(material 2: low-high, Shore A/D);(material 3: low-high, Shore A/D)
36 | Make sure to use Shore A or Shore D hardness, not Mohs hardness.
37 | """
38 |
39 | PRED_CAND_MATS_FRICTION_SYS_MSG = """You will be provided with captions that each describe an image. The captions will be delimited with quotes ("). Based on the caption, give me 3 materials that the surfaces in the image might be made of, along with the kinetic friction coefficient of each material when sliding against a fabric surface. You may provide a range of values for the friction coefficient instead of a single value. Try to consider all the possible surfaces.
40 |
41 | Format Requirement:
42 | You must provide your answer as a list of 3 (material: friction coefficient) pairs, each separated by a semi-colon (;). Do not include any other text in your answer, as it will be parsed by a code script later. Your answer must look like:
43 | (material 1: low-high);(material 2: low-high);(material 3: low-high)
44 | Try to provide as narrow of a range as possible for the friction coefficient.
45 | """
46 |
47 | PRED_THICKNESS_SYS_MSG = """You will be provided with captions that each describe an image of an object, along with a set of possible materials used to make the object. For each material, estimate the thickness (in cm) of that material in the object. You may provide a range of values for the thickness instead of a single value.
48 |
49 | Format Requirement:
50 | You must provide your answer as a list of 5 (material: thickness) pairs, each separated by a semi-colon (;). Do not include any other text in your answer, as it will be parsed by a code script later. Your answer must look like:
51 | (material 1: low-high cm);(material 2: low-high cm);(material 3: low-high cm);(material 4: low-high cm);(material 5: low-high cm)
52 | """
53 |
54 | PRED_THICKNESS_EXAMPLE_INPUT_1 = 'Caption: "a lamp with a white shade" Materials: "fabric, plastic, metal, ceramic, glass"'
55 | PRED_THICKNESS_EXAMPLE_OUTPUT_1 = "(fabric: 0.1-0.2 cm);(plastic: 0.3-1.0 cm);(metal: 0.1-0.2 cm);(ceramic: 0.2-0.5 cm);(glass: 0.3-0.8 cm)"
56 | PRED_THICKNESS_EXAMPLE_INPUT_2 = 'Caption: "a grey ottoman" Materials: "wood, fabric, foam, metal, plastic"'
57 | PRED_THICKNESS_EXAMPLE_OUTPUT_2 = "(wood: 2.0-4.0 cm);(fabric: 0.2-0.5 cm);(foam: 5.0-15.0 cm);(metal: 0.1-0.2 cm);(plastic: 0.5-1.0 cm)"
58 | PRED_THICKNESS_EXAMPLE_INPUT_3 = 'Caption: "a white frame" Materials: "plastic, wood, aluminum, steel, glass"'
59 | PRED_THICKNESS_EXAMPLE_OUTPUT_3 = "(plastic: 0.1-0.3 cm);(wood: 1.0-1.5 cm);(aluminum: 0.1-0.3 cm);(steel: 0.1-0.2 cm);(glass: 0.2-0.5 cm)"
60 | PRED_THICKNESS_EXAMPLE_INPUT_4 = 'Caption: "a metal rack with three shelves" Materials: "steel, aluminum, wood, plastic, iron"'
61 | PRED_THICKNESS_EXAMPLE_OUTPUT_4 = "(steel: 0.1-0.2 cm);(aluminum: 0.1-0.3 cm);(wood: 1.0-2.0 cm);(plastic: 0.5-1.0 cm);(iron: 0.5-1.0 cm)"
62 |
63 |
64 | def gpt_candidate_materials(caption, property_name='density', model_name='gpt-3.5-turbo', seed=100):
65 |
66 | if property_name == 'density':
67 | sys_msg = PRED_CAND_MATS_DENSITY_SYS_MSG
68 | elif property_name == 'hardness':
69 | sys_msg = PRED_CAND_MATS_HARDNESS_SYS_MSG
70 | elif property_name == 'friction':
71 | sys_msg = PRED_CAND_MATS_FRICTION_SYS_MSG
72 | else:
73 | raise NotImplementedError
74 | response = openai.ChatCompletion.create(
75 | model=model_name,
76 | messages=[
77 | {"role": "system", "content": sys_msg},
78 | {"role": "user", "content": '"%s"' % caption},
79 | ],
80 | request_timeout=20,
81 | seed=seed,
82 | )
83 | return response['choices'][0]['message']['content']
84 |
85 |
86 | def gpt_thickness(caption, candidate_materials, mode='list', model_name='gpt-3.5-turbo', seed=100):
87 |
88 | if mode == 'list':
89 | mat_names, mat_vals = parse_material_list(candidate_materials)
90 | elif mode == 'json':
91 | caption, mat_names, mat_vals = parse_material_json(candidate_materials)
92 | else:
93 | raise NotImplementedError
94 | mat_names_str = ', '.join(mat_names)
95 | user_msg = 'Caption: "%s" Materials: "%s"' % (caption, mat_names_str)
96 |
97 | response = openai.ChatCompletion.create(
98 | model=model_name,
99 | messages=[
100 | {"role": "system", "content": PRED_THICKNESS_SYS_MSG},
101 | {"role": "user", "content": PRED_THICKNESS_EXAMPLE_INPUT_1},
102 | {"role": "assistant", "content": PRED_THICKNESS_EXAMPLE_OUTPUT_1},
103 | {"role": "user", "content": PRED_THICKNESS_EXAMPLE_INPUT_2},
104 | {"role": "assistant", "content": PRED_THICKNESS_EXAMPLE_OUTPUT_2},
105 | {"role": "user", "content": PRED_THICKNESS_EXAMPLE_INPUT_3},
106 | {"role": "assistant", "content": PRED_THICKNESS_EXAMPLE_OUTPUT_3},
107 | {"role": "user", "content": PRED_THICKNESS_EXAMPLE_INPUT_4},
108 | {"role": "assistant", "content": PRED_THICKNESS_EXAMPLE_OUTPUT_4},
109 | {"role": "user", "content": user_msg},
110 | ],
111 | request_timeout=20,
112 | seed=seed,
113 | )
114 | return response['choices'][0]['message']['content']
115 |
116 |
117 | def parse_material_list(matlist, max_n=5):
118 | elems = matlist.split(';')
119 | if len(elems) > max_n:
120 | print('too many materials %s' % matlist)
121 | return None
122 |
123 | mat_names = []
124 | mat_vals = []
125 |
126 | for elem in elems:
127 | elem_parts = elem.strip().split(':')
128 | if len(elem_parts) != 2:
129 | print('bad format %s' % matlist)
130 | return None
131 | mat_name, values = elem_parts
132 | if not mat_name.startswith('(') or mat_name[1].isnumeric() or mat_name.startswith('(material 1'):
133 | print('bad format %s' % matlist)
134 | return None
135 |
136 | mat_name = mat_name[1:]
137 | mat_names.append(mat_name.lower()) # force lowercase
138 |
139 | values = values.strip().split(' ')[0]
140 | values = values.replace(",", "")
141 | if values[-1] == ')':
142 | values = values[:-1]
143 |
144 | # Value may or may not be a range
145 | splitted = values.split('-')
146 | try:
147 | float(splitted[0])
148 | except ValueError:
149 | print('value cannot be converted to float %s' % matlist)
150 | return None
151 | if len(splitted) == 2:
152 | mat_vals.append([float(splitted[0]), float(splitted[1])])
153 | elif len(splitted) == 1:
154 | mat_vals.append([float(splitted[0]), float(splitted[0])])
155 | else:
156 | print('bad format %s' % matlist)
157 | return None
158 |
159 | return mat_names, mat_vals
160 |
161 |
162 | def parse_material_hardness(matlist, max_n=5):
163 | elems = matlist.split(';')
164 | if len(elems) > max_n:
165 | print('too many materials %s' % matlist)
166 | return None
167 |
168 | mat_names = []
169 | mat_vals = []
170 |
171 | for elem in elems:
172 | elem_parts = elem.strip().split(':')
173 | if len(elem_parts) != 2:
174 | print('bad format %s' % matlist)
175 | return None
176 | mat_name, values = elem_parts
177 | if not mat_name.startswith('(') or mat_name[1].isnumeric() or mat_name.startswith('(material 1'):
178 | print('bad name %s' % matlist)
179 | return None
180 |
181 | mat_name = mat_name[1:]
182 | mat_names.append(mat_name.lower()) # force lowercase
183 |
184 | values = values.strip().split(',')
185 | units = values[-1].split(' ')[-1][:-1]
186 | if units not in ['A', 'D']:
187 | print('bad units %s' % matlist)
188 | return None
189 | values = values[0]
190 | values = values.replace(",", "")
191 |
192 | # Value may or may not be a range
193 | splitted = values.split('-')
194 | try:
195 | float(splitted[0])
196 | except ValueError:
197 | print('value cannot be converted to float %s' % matlist)
198 | return None
199 | if len(splitted) == 2:
200 | mat_vals.append([float(splitted[0]), float(splitted[1])])
201 | elif len(splitted) == 1:
202 | mat_vals.append([float(splitted[0]), float(splitted[0])])
203 | else:
204 | print('bad format %s' % matlist)
205 | return None
206 |
207 | if units == 'D':  # shift Shore D values above Shore A so both lie on one combined scale
208 | mat_vals[-1][0] += 100
209 | mat_vals[-1][1] += 100
210 |
211 | return mat_names, mat_vals
212 |
213 |
214 | def encode_image(image_path):
215 | with open(image_path, "rb") as image_file:
216 | return base64.b64encode(image_file.read()).decode('utf-8')
217 |
218 |
219 | def gpt4v_candidate_materials(image_path, property_name='density', seed=100):
220 |
221 | if property_name == 'density':
222 | sys_msg = PRED_CAND_MATS_DENSITY_SYS_MSG_4V
223 | else:
224 | raise NotImplementedError
225 |
226 | base64_image = encode_image(image_path)
227 |
228 | response = openai.ChatCompletion.create(
229 | model="gpt-4-vision-preview",
230 | messages=[
231 | {
232 | "role": "system",
233 | "content": sys_msg
234 | },
235 | {
236 | "role": "user",
237 | "content": [
238 | {
239 | "type": "image_url",
240 | "image_url": {
241 | "url": f"data:image/png;base64,{base64_image}"
242 | }
243 | },
244 | ]
245 | }
246 | ],
247 | request_timeout=30,
248 | max_tokens=300,
249 | seed=seed,
250 | # response_format={"type": "json_object"},
251 | )
252 | return response['choices'][0]['message']['content']
253 |
254 |
255 | def parse_material_json(matjson, max_n=5, field_name='mass density (kg/m^3)'):
256 | desc_and_mats = json.loads(matjson)
257 | if 'description' not in desc_and_mats or 'materials' not in desc_and_mats:
258 | print('bad format %s' % matjson)
259 | return None
260 | mat_names = []
261 | mat_vals = []
262 | for mat in desc_and_mats['materials']:
263 | if 'name' not in mat or field_name not in mat:
264 | print('bad format %s' % matjson)
265 | return None
266 | mat_name = mat['name']
267 | mat_names.append(mat_name.lower()) # force lowercase
268 | values = mat[field_name]
269 | # Value may or may not be a range
270 | splitted = values.split('-')
271 | try:
272 | float(splitted[0])
273 | except ValueError:
274 | print('value cannot be converted to float %s' % matjson)
275 | return None
276 | if len(splitted) == 2:
277 | mat_vals.append([float(splitted[0]), float(splitted[1])])
278 | elif len(splitted) == 1:
279 | mat_vals.append([float(splitted[0]), float(splitted[0])])
280 | else:
281 | print('bad format %s' % matjson)
282 | return None
283 | return desc_and_mats['description'], mat_names, mat_vals
--------------------------------------------------------------------------------
/material_proposal.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import json
4 | import openai
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 | from PIL import Image
8 |
9 | from gpt_inference import gpt_candidate_materials, gpt_thickness, parse_material_list, \
10 | parse_material_hardness, gpt4v_candidate_materials, parse_material_json
11 | from utils import load_images, get_scenes_list
12 | from arguments import get_args
13 | from my_api_key import OPENAI_API_KEY
14 |
15 |
16 | BASE_SEED = 100
17 |
18 |
19 | def gpt_wrapper(gpt_fn, parse_fn, max_tries=10, sleep_time=3):
20 | """Wrap gpt_fn with error handling and retrying."""
21 | tries, gpt_response = 0, None  # keep the last raw response defined even if every attempt raises
22 | # sleep to avoid overloading openai api
23 | time.sleep(sleep_time)
24 | try:
25 | gpt_response = gpt_fn(BASE_SEED + tries)
26 | result = parse_fn(gpt_response)
27 | except Exception as error:
28 | print('error:', error)
29 | result = None
30 | while result is None and tries < max_tries:
31 | tries += 1
32 | time.sleep(sleep_time)
33 | print('retrying...')
34 | try:
35 | gpt_response = gpt_fn(BASE_SEED + tries)
36 | result = parse_fn(gpt_response)
37 | except:
38 | result = None
39 | return gpt_response
40 |
41 |
42 | def show_img_to_caption(scene_dir, idx_to_caption):
43 | img_dir = os.path.join(scene_dir, 'images')
44 | imgs = load_images(img_dir, bg_change=None, return_masks=False)
45 | img_to_caption = imgs[idx_to_caption]
46 | plt.imshow(img_to_caption)
47 | plt.show()
48 | plt.close()
49 | return
50 |
51 |
52 | def predict_candidate_materials(args, scene_dir, show=False):
53 | # load caption info
54 | with open(os.path.join(scene_dir, '%s.json' % args.caption_load_name), 'r') as f:
55 | info = json.load(f)
56 |
57 | caption = info['caption']
58 |
59 | gpt_fn = lambda seed: gpt_candidate_materials(caption, property_name=args.property_name,
60 | model_name=args.gpt_model_name, seed=seed)
61 | parse_fn = parse_material_hardness if args.property_name == 'hardness' else parse_material_list
62 | candidate_materials = gpt_wrapper(gpt_fn, parse_fn)
63 |
64 | info['candidate_materials_%s' % args.property_name] = candidate_materials
65 |
66 | print('-' * 50)
67 | print('scene: %s, info:' % os.path.basename(scene_dir), info)
68 | print('candidate materials (%s):' % args.property_name)
69 | mat_names, mat_vals = parse_fn(candidate_materials)
70 | for mat_i, mat_name in enumerate(mat_names):
71 | print('%16s: %8.1f -%8.1f' % (mat_name, mat_vals[mat_i][0], mat_vals[mat_i][1]))
72 | if show:
73 | show_img_to_caption(scene_dir, int(info['idx_to_caption']))
74 |
75 | # save info to json
76 | with open(os.path.join(scene_dir, '%s.json' % args.mats_save_name), 'w') as f:
77 | json.dump(info, f, indent=4)
78 |
79 | return info
80 |
81 |
82 | def predict_object_info_gpt4v(args, scene_dir, show=False):
83 | """(EXPERIMENTAL) Predict materials directly from image with GPT-4V."""
84 | img_dir = os.path.join(scene_dir, 'images')
85 | imgs, masks = load_images(img_dir, return_masks=True)
86 | mask_areas = [np.mean(mask) for mask in masks]
87 |
88 | idx_to_caption = np.argsort(mask_areas)[int(len(mask_areas) * args.mask_area_percentile)]
89 | img_to_caption = imgs[idx_to_caption]
90 |
91 | # save img_to_caption in img_dir
92 | img_to_caption = Image.fromarray(img_to_caption)
93 | img_path = os.path.join(scene_dir, 'img_to_caption.png')
94 | img_to_caption.save(img_path)
95 |
96 | gpt_fn = lambda seed: gpt4v_candidate_materials(img_path, property_name=args.property_name, seed=seed)
97 | candidate_materials = gpt_wrapper(gpt_fn, parse_material_json)
98 |
99 | info = {'idx_to_caption': str(idx_to_caption),
100 | 'candidate_materials_%s' % args.property_name: candidate_materials}
101 |
102 | print('-' * 50)
103 | print('scene: %s, info:' % os.path.basename(scene_dir), info)
104 | print('candidate materials (%s):' % args.property_name)
105 | _, mat_names, mat_vals = parse_material_json(candidate_materials)
106 | for mat_i, mat_name in enumerate(mat_names):
107 | print('%16s: %8.1f -%8.1f' % (mat_name, mat_vals[mat_i][0], mat_vals[mat_i][1]))
108 | if show:
109 | show_img_to_caption(scene_dir, int(info['idx_to_caption']))
110 |
111 | # save info to json
112 | with open(os.path.join(scene_dir, '%s.json' % args.mats_save_name), 'w') as f:
113 | json.dump(info, f, indent=4)
114 |
115 | return info
116 |
117 |
118 | def predict_thickness(args, scene_dir, mode='list', show=False):
119 | # load info
120 | with open(os.path.join(scene_dir, '%s.json' % args.mats_save_name), 'r') as f:
121 | info = json.load(f)
122 |
123 | if mode == 'list':
124 | caption = info['caption']
125 | elif mode == 'json': # json contains caption inside
126 | caption = None
127 | else:
128 | raise NotImplementedError
129 | candidate_materials = info['candidate_materials_density']
130 |
131 | gpt_fn = lambda seed: gpt_thickness(caption, candidate_materials,
132 | model_name=args.gpt_model_name, mode=mode, seed=seed)
133 | thickness = gpt_wrapper(gpt_fn, parse_material_list)
134 |
135 | info['thickness'] = thickness
136 |
137 | print('thickness (cm):')
138 | mat_names, mat_vals = parse_material_list(thickness)
139 | for mat_i, mat_name in enumerate(mat_names):
140 | print('%16s: %8.1f -%8.1f' % (mat_name, mat_vals[mat_i][0], mat_vals[mat_i][1]))
141 | if show:
142 | show_img_to_caption(scene_dir, int(info['idx_to_caption']))
143 |
144 | # save info to json
145 | with open(os.path.join(scene_dir, '%s.json' % args.mats_save_name), 'w') as f:
146 | json.dump(info, f, indent=4)
147 |
148 | return info
149 |
150 |
151 | if __name__ == '__main__':
152 |
153 | args = get_args()
154 |
155 | scenes_dir = os.path.join(args.data_dir, 'scenes')
156 | scenes = get_scenes_list(args)
157 |
158 | openai.api_key = OPENAI_API_KEY
159 |
160 | for j, scene in enumerate(scenes):
161 | mats_info = predict_candidate_materials(args, os.path.join(scenes_dir, scene))
162 | if args.include_thickness:
163 | mats_info = predict_thickness(args, os.path.join(scenes_dir, scene))
164 |
--------------------------------------------------------------------------------
/ns_reconstruction.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import shutil
4 | from utils import get_last_file_in_folder, get_scenes_list
5 | from arguments import get_args
6 |
7 |
8 | def move_files_to_folder(source_dir, target_dir):
9 | for file in os.listdir(source_dir):
10 | shutil.move(os.path.join(source_dir, file), os.path.join(target_dir, file))
11 |
12 |
13 | if __name__ == '__main__':
14 |
15 | args = get_args()
16 |
17 | scenes_dir = os.path.join(args.data_dir, 'scenes')
18 | scenes = get_scenes_list(args)
19 |
20 | for scene in scenes:
21 | base_dir = os.path.join(scenes_dir, scene, 'ns')
22 |
23 | # Calling ns-train
24 | result = subprocess.run([
25 | 'ns-train', 'nerfacto',
26 | '--data', os.path.join(scenes_dir, scene),
27 | '--output_dir', base_dir,
28 | '--vis', args.vis_mode,
29 | '--project_name', args.project_name,
30 | '--experiment_name', scene,
31 | '--max_num_iterations', str(args.training_iters),
32 | '--pipeline.model.background-color', 'random',
33 | '--pipeline.datamanager.camera-optimizer.mode', 'off',
34 | '--pipeline.model.proposal-initial-sampler', 'uniform',
35 | '--pipeline.model.near-plane', str(args.near_plane),
36 | '--pipeline.model.far-plane', str(args.far_plane),
37 | '--steps-per-eval-image', '10000',
38 | ])
39 |
40 | ns_dir = get_last_file_in_folder(os.path.join(base_dir, '%s/nerfacto' % scene))
41 |
42 |     # Copying dataparser_transforms.json (contains scale) next to the scene's ns outputs;
43 |     # this is a plain local copy, so shutil suffices (no need to shell out to scp)
44 |     shutil.copyfile(
45 |         os.path.join(ns_dir, 'dataparser_transforms.json'),
46 |         os.path.join(base_dir, 'dataparser_transforms.json')
47 |     )
48 |
49 | half_bbox_size = args.bbox_size / 2
50 |
51 | # Calling ns-export pcd
52 | result = subprocess.run([
53 | 'ns-export', 'pointcloud',
54 | '--load-config', os.path.join(ns_dir, 'config.yml'),
55 | '--output-dir', base_dir,
56 | '--num-points', str(args.num_points),
57 | '--remove-outliers', 'True',
58 | '--normal-method', 'open3d',
59 | '--use-bounding-box', 'True',
60 | '--bounding-box-min', str(-half_bbox_size), str(-half_bbox_size), str(-half_bbox_size),
61 | '--bounding-box-max', str(half_bbox_size), str(half_bbox_size), str(half_bbox_size),
62 | ])
63 |
64 | # Calling ns-render
65 | result = subprocess.run([
66 | 'ns-render', 'dataset',
67 | '--load-config', os.path.join(ns_dir, 'config.yml'),
68 | '--output-path', os.path.join(base_dir, 'renders'),
69 | '--rendered-output-names', 'raw-depth',
70 | '--split', 'train+test',
71 | ])
72 |
73 | # Collect all depths in one folder
74 | os.makedirs(os.path.join(base_dir, 'renders', 'depth'), exist_ok=True)
75 | move_files_to_folder(os.path.join(base_dir, 'renders', 'test', 'raw-depth'), os.path.join(base_dir, 'renders', 'depth'))
76 | move_files_to_folder(os.path.join(base_dir, 'renders', 'train', 'raw-depth'), os.path.join(base_dir, 'renders', 'depth'))
77 |
78 |
--------------------------------------------------------------------------------
/predict_property.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import torch
4 | import open_clip
5 | import numpy as np
6 | import open3d as o3d
7 | import matplotlib as mpl
8 |
9 | from feature_fusion import CLIP_BACKBONE, CLIP_CHECKPOINT
10 | from gpt_inference import parse_material_list, parse_material_hardness
11 | from carving import get_carved_pts
12 | from utils import load_ns_point_cloud, parse_dataparser_transforms_json, get_last_file_in_folder, get_scenes_list
13 | from arguments import get_args
14 |
15 |
16 | @torch.no_grad()
17 | def get_text_features(texts, clip_model, clip_tokenizer, prefix='', suffix='', device='cuda'):
18 | """Get CLIP text features, optionally with a fixed prefix and suffix."""
19 | extended_texts = [prefix + text + suffix for text in texts]
20 | tokenized = clip_tokenizer(extended_texts).to(device)
21 |
22 | with torch.no_grad(), torch.cuda.amp.autocast():
23 | text_features = clip_model.encode_text(tokenized)
24 | text_features = text_features / text_features.norm(dim=1, keepdim=True)
25 |
26 | return text_features
27 |
28 |
29 | @torch.no_grad()
30 | def get_agg_patch_features(patch_features, is_visible):
31 | """Get aggregated patch features by averaging over visible patches."""
32 | n_visible = is_visible.sum(0)
33 | is_valid = n_visible > 0
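  |     # points visible in no view (n_visible == 0) would divide by zero below; is_valid filters them out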
34 |
35 | visible_patch_features = patch_features * is_visible.unsqueeze(-1)
36 | avg_visible_patch_features = visible_patch_features.sum(0) / n_visible.unsqueeze(-1)
37 | avg_visible_patch_features = avg_visible_patch_features / avg_visible_patch_features.norm(dim=1, keepdim=True)
38 | return avg_visible_patch_features[is_valid], is_valid
39 |
40 |
41 | @torch.no_grad()
42 | def get_interpolated_values(source_pts, source_vals, inner_pts, batch_size=2048, k=1):
43 | """Interpolate values by k nearest neighbor."""
44 | n_inner = len(inner_pts)
45 | inner_vals = torch.zeros(n_inner, source_vals.shape[1], device=inner_pts.device)
46 | for batch_start in range(0, n_inner, batch_size):
47 | curr_batch_size = min(batch_size, n_inner - batch_start)
48 | curr_inner_pts = inner_pts[batch_start:batch_start + curr_batch_size]
49 |
50 | dists = torch.cdist(curr_inner_pts, source_pts)
51 | _, idxs = torch.topk(dists, k=k, dim=1, largest=False)
52 | curr_inner_vals = source_vals[idxs].mean(1)
53 |
54 | inner_vals[batch_start:batch_start + curr_batch_size] = curr_inner_vals
55 | return inner_vals
56 |
57 |
58 | @torch.no_grad()
59 | def predict_physical_property_integral(args, scene_dir, clip_model, clip_tokenizer):
60 | """Predict the volume integral of a physical property (e.g. for mass). Returns a [low, high] range."""
61 |
62 | scene_name = os.path.basename(scene_dir)
63 | pcd_file = os.path.join(scene_dir, 'ns', 'point_cloud.ply')
64 | dt_file = os.path.join(scene_dir, 'ns', 'dataparser_transforms.json')
65 | info_file = os.path.join(scene_dir, '%s.json' % args.mats_load_name)
66 |
67 | with open(info_file, 'r') as f:
68 | info = json.load(f)
69 |
70 | # loading source point info
71 | with open(os.path.join(scene_dir, 'features', 'voxel_size_%s.json' % args.feature_load_name), 'r') as f:
72 | feature_voxel_size = json.load(f)['voxel_size']
73 | pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=feature_voxel_size)
74 | source_pts = torch.Tensor(pts).to(args.device)
75 | patch_features = torch.load(os.path.join(scene_dir, 'features', 'patch_features_%s.pt' % args.feature_load_name))
76 | is_visible = torch.load(os.path.join(scene_dir, 'features', 'is_visible_%s.pt' % args.feature_load_name))
77 |
78 | # preparing material info
79 | mat_val_list = info['candidate_materials_%s' % args.property_name]
80 | mat_names, mat_vals = parse_material_list(mat_val_list)
81 | mat_vals = torch.Tensor(mat_vals).to(args.device)
82 | mat_tn_list = info['thickness']
83 | mat_names, mat_tns = parse_material_list(mat_tn_list)
84 | mat_tns = torch.Tensor(mat_tns).to(args.device) / 100 # cm to m
85 |
86 | # predictions on source points
87 | text_features = get_text_features(mat_names, clip_model, clip_tokenizer, device=args.device)
88 | agg_patch_features, is_valid = get_agg_patch_features(patch_features, is_visible)
89 | source_pts = source_pts[is_valid]
90 |
91 | similarities = agg_patch_features @ text_features.T
92 |
93 | source_pred_probs = torch.softmax(similarities / args.temperature, dim=1)
94 | source_pred_mat_idxs = similarities.argmax(1)
95 | source_pred_vals = source_pred_probs @ mat_vals
96 |
97 | # volume integration
98 | ns_transform, scale = parse_dataparser_transforms_json(dt_file)
99 | surface_cell_size = args.sample_voxel_size / scale
100 | mat_cell_volumes = surface_cell_size**2 * mat_tns
101 | mat_cell_products = mat_vals * mat_cell_volumes
102 |
103 | if args.volume_method == 'thickness':
104 | dense_pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=args.sample_voxel_size)
105 | dense_pts = torch.Tensor(dense_pts).to(args.device)
106 |
107 | dense_pred_probs = get_interpolated_values(source_pts, source_pred_probs, dense_pts, batch_size=2048, k=1)
108 | dense_pred_products = dense_pred_probs @ mat_cell_products
109 | total_pred_val = (dense_pred_products).sum(0)
110 |
111 | carved, grid_cell_size = get_carved_pts(scene_dir, dist_thr_ns=0.05)
112 | bound_volume = grid_cell_size ** 3 * len(carved)
113 | total_volume = (dense_pred_probs @ mat_cell_volumes).max(1)[0].sum(0)
114 | if total_volume > bound_volume:
115 | total_pred_val *= bound_volume / total_volume
116 | total_pred_val *= args.correction_factor
117 |
118 | elif args.volume_method == 'carving':
119 | carved, grid_cell_size = get_carved_pts(scene_dir)
120 | carved_pred_probs = get_interpolated_values(source_pts, source_pred_probs, carved, batch_size=2048, k=1)
121 | carved_pred_vals = carved_pred_probs @ mat_vals
122 | grid_cell_volume = grid_cell_size ** 3
123 | total_pred_val = carved_pred_vals.sum(0) * grid_cell_volume * args.correction_factor
124 |
125 | dense_pts = carved
126 | dense_pred_probs = carved_pred_probs
127 |
128 |
129 | else:
130 | raise NotImplementedError
131 |
132 | print('-' * 50)
133 | print('scene:', scene_name)
134 | print('-' * 50)
135 | print('num. dense points:', len(dense_pts))
136 | print('caption:', info['caption'])
137 | print('candidate materials:')
138 | for mat_i, mat_name in enumerate(mat_names):
139 | print('%16s: %8.1f -%8.1f kg/m^3, %5.1f -%5.1f cm' % (mat_name, mat_vals[mat_i][0], mat_vals[mat_i][1],
140 | mat_tns[mat_i][0] * 100, mat_tns[mat_i][1] * 100))
141 |
142 | print('surface cell size: %.4f cm' % (surface_cell_size * 100))
143 | print('predicted total mass: [%.4f - %.4f kg]' % (total_pred_val[0], total_pred_val[1]))
144 |
145 | if args.show_mat_seg:
146 | # Visualize material segmentation in open3d
147 | cmap = mpl.colormaps['tab10']
148 | mat_colors = [cmap(i)[:3] for i in range(len(mat_names))]
149 | dense_pred_colors = np.array([mat_colors[i] for i in dense_pred_probs.argmax(1)])
150 |
151 | pcd = o3d.geometry.PointCloud()
152 | pcd.points = o3d.utility.Vector3dVector(dense_pts.cpu().numpy())
153 | pcd.colors = o3d.utility.Vector3dVector(dense_pred_colors)
154 | o3d.visualization.draw_geometries([pcd])
155 |
156 | return total_pred_val.tolist()
157 |
158 |
159 | @torch.no_grad()
160 | def predict_physical_property_query(args, query_pts, scene_dir, clip_model, clip_tokenizer, return_all=False):
161 | """
162 | Predict a physical property at given array of 3D query points. query_pts can be set to 'grid'
163 | instead to automatically generate a grid of query points from source points. If return_all=True,
164 | returns various intermediate results. Otherwise, returns [low, high] range for each query point.
165 | """
166 |
167 | scene_name = os.path.basename(scene_dir)
168 | pcd_file = os.path.join(scene_dir, 'ns', 'point_cloud.ply')
169 | dt_file = os.path.join(scene_dir, 'ns', 'dataparser_transforms.json')
170 | info_file = os.path.join(scene_dir, '%s.json' % args.mats_load_name)
171 |
172 | with open(info_file, 'r') as f:
173 | info = json.load(f)
174 |
175 | # loading source point info
176 | with open(os.path.join(scene_dir, 'features', 'voxel_size_%s.json' % args.feature_load_name), 'r') as f:
177 | feature_voxel_size = json.load(f)['voxel_size']
178 | pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=feature_voxel_size)
179 | source_pts = torch.Tensor(pts).to(args.device)
180 | patch_features = torch.load(os.path.join(scene_dir, 'features', 'patch_features_%s.pt' % args.feature_load_name))
181 | is_visible = torch.load(os.path.join(scene_dir, 'features', 'is_visible_%s.pt' % args.feature_load_name))
182 |
183 | # preparing material info
184 | mat_val_list = info['candidate_materials_%s' % args.property_name]
185 | if args.property_name == 'hardness':
186 | mat_names, mat_vals = parse_material_hardness(mat_val_list)
187 | else:
188 | mat_names, mat_vals = parse_material_list(mat_val_list)
189 | mat_vals = torch.Tensor(mat_vals).to(args.device)
190 |
191 | # predictions on source points
192 | text_features = get_text_features(mat_names, clip_model, clip_tokenizer, device=args.device)
193 | agg_patch_features, is_valid = get_agg_patch_features(patch_features, is_visible)
194 | source_pts = source_pts[is_valid]
195 |
196 | similarities = agg_patch_features @ text_features.T
197 |
198 | source_pred_probs = torch.softmax(similarities / args.temperature, dim=1)
199 | source_pred_mat_idxs = similarities.argmax(1)
200 | source_pred_vals = source_pred_probs @ mat_vals
201 |
202 |     if isinstance(query_pts, str) and query_pts == 'grid':
203 | query_pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=args.sample_voxel_size)
204 | query_pts = torch.Tensor(query_pts).to(args.device)
205 | query_pred_probs = get_interpolated_values(source_pts, source_pred_probs, query_pts, batch_size=2048, k=1)
206 | query_pred_vals = query_pred_probs @ mat_vals
207 |
208 | print('-' * 50)
209 | print('scene:', scene_name)
210 | print('-' * 50)
211 | print('num. query points:', len(query_pts))
212 | print('caption:', info['caption'])
213 | print('candidate materials (%s):' % args.property_name)
214 | for mat_i, mat_name in enumerate(mat_names):
215 | print('%16s: %8.1f -%8.1f' % (mat_name, mat_vals[mat_i][0], mat_vals[mat_i][1]))
216 |
217 | if args.show_mat_seg:
218 | # Visualize material segmentation in open3d
219 | cmap = mpl.colormaps['tab10']
220 | mat_colors = [cmap(i)[:3] for i in range(len(mat_names))]
221 | query_pred_colors = np.array([mat_colors[i] for i in query_pred_probs.argmax(1)])
222 |
223 | pcd = o3d.geometry.PointCloud()
224 | pcd.points = o3d.utility.Vector3dVector(query_pts.cpu().numpy())
225 | pcd.colors = o3d.utility.Vector3dVector(query_pred_colors)
226 | o3d.visualization.draw_geometries([pcd])
227 |
228 | if return_all:
229 | query_features = get_interpolated_values(source_pts, agg_patch_features, query_pts, batch_size=2048, k=1)
230 | query_similarities = get_interpolated_values(source_pts, similarities, query_pts, batch_size=2048, k=1)
231 | return {
232 | 'query_pred_probs': query_pred_probs.cpu().numpy(),
233 | 'query_pred_vals': query_pred_vals.cpu().numpy(),
234 | 'query_features': query_features.cpu().numpy(),
235 | 'query_similarities': query_similarities.cpu().numpy(),
236 | 'source_pts': source_pts.cpu().numpy(),
237 | 'mat_names': mat_names,
238 | }
239 | return query_pred_vals.cpu().numpy()
240 |
241 |
242 | if __name__ == '__main__':
243 |
244 | args = get_args()
245 |
246 | scenes_dir = os.path.join(args.data_dir, 'scenes')
247 | scenes = get_scenes_list(args)
248 |
249 | clip_model, _, preprocess = open_clip.create_model_and_transforms(CLIP_BACKBONE, pretrained=CLIP_CHECKPOINT)
250 | clip_model.to(args.device)
251 | clip_tokenizer = open_clip.get_tokenizer(CLIP_BACKBONE)
252 |
253 | preds = {}
254 | for j, scene in enumerate(scenes):
255 | scene_dir = os.path.join(scenes_dir, scene)
256 | if args.prediction_mode == 'integral':
257 | pred = predict_physical_property_integral(args, scene_dir, clip_model, clip_tokenizer)
258 | elif args.prediction_mode == 'grid':
259 | pred = predict_physical_property_query(args, 'grid', scene_dir, clip_model, clip_tokenizer)
260 | else: # use predict_physical_property_query() to query points however you want!
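  |             # A hedged sketch of the intended custom usage (not an implemented mode): pass your own
  |             # world-space query points, as visualization.py does; 'my_pts' below is purely hypothetical.
  |             #   my_pts = torch.rand(1000, 3, device=args.device) - 0.5
  |             #   pred = predict_physical_property_query(args, my_pts, scene_dir, clip_model, clip_tokenizer)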
261 | raise NotImplementedError
262 | preds[scene] = pred
263 |
264 | if args.prediction_mode == 'integral' and args.save_preds:
265 | os.makedirs('preds', exist_ok=True)
266 | with open(os.path.join('preds', 'preds_%s.json' % args.preds_save_name), 'w') as f:
267 | json.dump(preds, f, indent=4)
268 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pillow
3 | matplotlib
4 | transformers
5 | open_clip_torch
6 | open3d
7 | openai==0.28
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import numpy as np
4 | import json
5 | import open3d as o3d
6 | import gzip
7 | from PIL import Image
8 |
9 |
10 | def project_3d_to_2d(pts, w2c, K, return_dists=False):
11 | """Project 3D points to 2D (nerfstudio format)."""
12 | pts = np.array(pts)
13 | K = np.hstack([K, np.zeros((3, 1))])
14 | pts = np.concatenate([pts, np.ones((pts.shape[0], 1))], axis=1)
15 | pts = np.dot(pts, w2c.T)
16 |     pts[:, [1, 2]] *= -1  # flip y/z: nerfstudio (OpenGL) camera convention -> OpenCV-style projection
17 | if return_dists:
18 | dists = np.linalg.norm(pts[:, :3], axis=-1)
19 | pts = np.dot(pts, K.T)
20 | pts_2d = pts[:, :2] / pts[:, 2:]
21 | if return_dists:
22 | return pts_2d, dists
23 | return pts_2d
24 |
25 |
26 | def parse_transforms_json(t_file, return_w2c=False, different_Ks=False):
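  |     """Parse camera intrinsics (K) and per-frame poses from a nerfstudio transforms.json."""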
27 | with open(t_file, 'rb') as f:
28 | transforms = json.load(f)
29 |
30 | if different_Ks:
31 | Ks = []
32 | for i in range(len(transforms['frames'])):
33 | K = np.array([
34 | [transforms['frames'][i]['fl_x'], 0, transforms['frames'][i]['cx']],
35 | [0, transforms['frames'][i]['fl_y'], transforms['frames'][i]['cy']],
36 | [0, 0, 1],
37 | ])
38 | Ks.append(K)
39 | K = Ks
40 | else:
41 | K = np.array([
42 | [transforms['fl_x'], 0, transforms['cx']],
43 | [0, transforms['fl_y'], transforms['cy']],
44 | [0, 0, 1],
45 | ])
46 |
47 | n_frames = len(transforms['frames'])
48 | c2ws = [np.array(transforms['frames'][i]['transform_matrix']) for i in range(n_frames)]
49 | if return_w2c:
50 | w2cs = [np.linalg.inv(c2w) for c2w in c2ws]
51 | return w2cs, K
52 | return c2ws, K
53 |
54 |
55 | def parse_dataparser_transforms_json(dt_file):
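  |     """Read the nerfstudio dataparser transform matrix and scale factor from dataparser_transforms.json."""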
56 | with open(dt_file, "r") as fr:
57 | dataparser_transforms = json.load(fr)
58 |
59 | ns_transform = np.asarray(dataparser_transforms["transform"])
60 | scale = dataparser_transforms["scale"]
61 | return ns_transform, scale
62 |
63 |
64 | def load_ns_point_cloud(pcd_file, dt_file, ds_size=0.01, viz=False):
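  |     """Load an exported point cloud, optionally voxel-downsample it, and undo the nerfstudio dataparser transform/scale."""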
65 | pcd = o3d.io.read_point_cloud(pcd_file)
66 | if ds_size is not None:
67 | pcd = pcd.voxel_down_sample(ds_size)
68 |
69 | ns_transform, scale = parse_dataparser_transforms_json(dt_file)
70 | ns_transform = np.concatenate([ns_transform, np.array([[0, 0, 0, 1/scale]])], 0)
71 | inv_ns_transform = np.linalg.inv(ns_transform)
72 |
73 | # use open3d to scale and transform
74 | pcd.transform(inv_ns_transform)
75 |
76 | pts = np.asarray(pcd.points)
77 |
78 | if viz:
79 | cf = o3d.geometry.TriangleMesh.create_coordinate_frame(size=1.0, origin=[0, 0, 0])
80 | o3d.visualization.draw_geometries([cf, pcd])
81 | return pts
82 |
83 |
84 | def load_images(img_dir, bg_change=255, return_masks=False):
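  |     """Load RGBA images (sorted by filename), optionally recoloring the background; returns RGB images and optionally masks."""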
85 | img_files = os.listdir(img_dir)
86 | img_files.sort()
87 | imgs = []
88 | masks = []
89 | for img_file in img_files:
90 | # load RGBA image
91 | img = np.array(Image.open(os.path.join(img_dir, img_file)))
92 | if return_masks or bg_change is not None:
93 | mask = img[:, :, 3] > 0
94 | if bg_change is not None:
95 | img[~mask] = bg_change
96 | masks.append(mask)
97 | imgs.append(img[:, :, :3])
98 |
99 | if return_masks:
100 | return imgs, masks
101 | return imgs
102 |
103 |
104 | def load_depths(depth_dir, Ks):
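  |     """Load gzipped .npy distance renders and convert them to depth maps when per-view intrinsics Ks are given."""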
105 | depth_files = os.listdir(depth_dir)
106 | depth_files.sort()
107 | depths = []
108 | for i, depth_file in enumerate(depth_files):
109 | # load npy.gz depth file
110 | with gzip.open(os.path.join(depth_dir, depth_file), 'rb') as f:
111 | dist = np.load(f)[:, :, 0]
112 | if Ks is not None:
113 | depth = distance_to_depth(dist, Ks[i])
114 | else:
115 | depth = dist
116 | depths.append(depth)
117 | return depths
118 |
119 |
120 | def depth_to_distance(depth, K):
121 | """Convert depth map to distance from camera."""
122 | h, w = depth.shape
123 | x, y = np.meshgrid(np.arange(w), np.arange(h))
124 | x = x.flatten()
125 | y = y.flatten()
126 | depth = depth.flatten()
127 | pts = np.stack([x, y, np.ones_like(x)], axis=1)
128 | pts = np.dot(pts, np.linalg.inv(K).T)
129 | pts *= depth[:, None]
130 | dists = np.linalg.norm(pts, axis=1)
131 | dists = dists.reshape(h, w)
132 | return dists
133 |
134 |
135 | def distance_to_depth(dists, K):
136 | """Convert distance map to depth map."""
137 | h, w = dists.shape
138 | x, y = np.meshgrid(np.arange(w), np.arange(h))
139 | x = x.flatten()
140 | y = y.flatten()
141 | pts = np.stack([x, y, np.ones_like(x)], axis=1)
142 | pts = np.dot(pts, np.linalg.inv(K).T)
143 | divisor = np.linalg.norm(pts, axis=1)
144 | divisor = divisor.reshape(h, w)
145 | depth = dists / divisor
146 | return depth
147 |
148 |
149 | def get_last_file_in_folder(folder):
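  |     """Return the path of the lexicographically last entry in folder (e.g. the latest timestamped run)."""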
150 | files = os.listdir(folder)
151 | return os.path.join(folder, sorted(files, reverse=True)[0])
152 |
153 |
154 | def get_scenes_list(args):
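  |     """Return the scene names for args.split, sliced by args.start_idx / args.end_idx."""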
155 | if args.split != 'all':
156 | with open(os.path.join(args.data_dir, 'splits.json'), 'r') as f:
157 | splits = json.load(f)
158 | if args.split == 'train+val':
159 | scenes = splits['train'] + splits['val']
160 | else:
161 | scenes = splits[args.split]
162 | else:
163 | scenes = sorted(os.listdir(os.path.join(args.data_dir, 'scenes')))
164 |
165 | if args.end_idx != -1:
166 | scenes = scenes[args.start_idx:args.end_idx]
167 | else:
168 | scenes = scenes[args.start_idx:]
169 | return scenes
170 |
171 |
172 | def unproject_point(pt_2d, depth, c2w, K):
173 | """Unproject a single point from 2D to 3D (nerfstudio format)."""
174 | cx = K[0, 2]
175 | cy = K[1, 2]
176 | fx = K[0, 0]
177 | fy = K[1, 1]
178 | x = (pt_2d[0] - cx) / fx
179 | y = (pt_2d[1] - cy) / fy
180 | pt_3d = np.array([x, -y, -1])
181 | pt_3d *= depth[pt_2d[1], pt_2d[0]]
182 | pt_3d = np.concatenate([pt_3d, np.ones((1,))], axis=0)
183 | pt_3d = np.dot(c2w, pt_3d)
184 | pt_3d = pt_3d[:3]
185 | return pt_3d
186 |
--------------------------------------------------------------------------------
/visualization.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import open3d as o3d
3 | import os
4 | import torch
5 | import open_clip
6 | import matplotlib as mpl
7 | import matplotlib.pyplot as plt
8 | import matplotlib.patches as mpatches
9 | from PIL import Image
10 | from sklearn.decomposition import PCA
11 |
12 | from feature_fusion import CLIP_BACKBONE, CLIP_CHECKPOINT
13 | from predict_property import predict_physical_property_query
14 | from utils import parse_transforms_json, load_ns_point_cloud, load_images
15 | from arguments import get_args
16 |
17 |
18 | def features_to_colors(features):
19 | """Convert feature vectors to RGB colors using PCA."""
20 | pca = PCA(n_components=3)
21 | pca.fit(features)
22 | transformed = pca.transform(features)
23 | q1, q99 = np.percentile(transformed, [1, 99])
24 | feature_pca_postprocess_sub = q1
25 | feature_pca_postprocess_div = (q99 - q1)
26 | transformed = (transformed - feature_pca_postprocess_sub) / feature_pca_postprocess_div
27 | colors = np.clip(transformed, 0, 1)
28 | return colors
29 |
30 |
31 | def similarities_to_colors(similarities, temperature=None):
32 | """Convert CLIP similarity values to RGB colors."""
33 | cmap = mpl.colormaps['tab10']
34 | mat_colors = [cmap(i)[:3] for i in range(similarities.shape[1])]
35 | if temperature is None:
36 | argmax_similarities = np.argmax(similarities, axis=1)
37 | colors = np.array([mat_colors[i] for i in argmax_similarities])
38 | else:
39 | softmax_probs = torch.softmax(torch.tensor(similarities) / temperature, dim=1)
40 | colors = softmax_probs @ torch.tensor(mat_colors).float()
41 | colors = colors.numpy()
42 | return colors
43 |
44 |
45 | def values_to_colors(values, low, high):
46 | """Convert scalar values to RGB colors."""
47 | cmap = mpl.colormaps['inferno']
48 | colors = cmap((values - low) / (high - low))
49 | return colors[:, :3]
50 |
51 |
52 | def render_pcd(pcd, w2c, K, hw=(1024, 1024), pt_size=8, savefile=None, show=False):
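  |     """Render a point cloud from the given w2c/K camera via Open3D; returns a PIL image if savefile is set, else a float RGB array."""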
53 | h, w = hw
54 |
55 | # set pinhole camera parameters from K
56 | render_camera = o3d.camera.PinholeCameraParameters()
57 | render_camera.extrinsic = w2c
58 |
59 | intrinsic = o3d.camera.PinholeCameraIntrinsic()
60 |     intrinsic.set_intrinsics(w, h, K[0, 0], K[1, 1], K[0, 2], K[1, 2])  # set_intrinsics expects (width, height, fx, fy, cx, cy)
61 | render_camera.intrinsic = intrinsic
62 |
63 | # visualize pcd from camera view with intrinsics set to K
64 | vis = o3d.visualization.Visualizer()
65 | vis.create_window(width=w, height=h, visible=show)
66 |
67 | vis.add_geometry(pcd)
68 | ctr = vis.get_view_control()
69 | ctr.convert_from_pinhole_camera_parameters(render_camera, allow_arbitrary=True)
70 |
71 | # rendering options
72 | render_option = vis.get_render_option()
73 | render_option.point_size = pt_size
74 | render_option.point_show_normal = False
75 | render_option.light_on = False
76 | vis.update_renderer()
77 |
78 | if show:
79 | vis.run()
80 |
81 | if savefile is not None:
82 | vis.capture_screen_image(savefile, do_render=True)
83 | vis.destroy_window()
84 | return Image.open(savefile)
85 | else:
86 | render = vis.capture_screen_float_buffer(do_render=True)
87 | vis.destroy_window()
88 | return np.array(render)
89 |
90 |
91 | def composite_and_save(img1, img2, alpha, savefile):
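  |     """Alpha-blend img1 over img2 (float images in [0, 1]), save the result as an 8-bit image, and return it."""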
92 | img1 = img1.astype(np.float32)
93 | img2 = img2.astype(np.float32)
94 | img = img1 * alpha + img2 * (1 - alpha)
95 | img = (img * 255).astype(np.uint8)
96 | Image.fromarray(img).save(savefile)
97 | return img
98 |
99 |
100 | def make_legend(colors, names, ncol=1, figsize=(2.0, 2.5), savefile=None, show=False):
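  |     """Create a standalone legend of colored patches for the given names; optionally save and/or show it."""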
101 | plt.style.use('fast')
102 | plt.rcParams["font.family"] = "Times New Roman"
103 | fig = plt.figure(figsize=figsize)
104 | fig.patch.set_facecolor('white')
105 | plt.axis('off')
106 |
107 | # creating legend with color boxes
108 | ptchs = []
109 | for color, name in zip(colors, names):
110 | if len(name) > 10: # wrap long names
111 | name = name.replace(' ', '\n')
112 | ptchs.append(mpatches.Patch(color=color[:3], label=name))
113 | leg = plt.legend(handles=ptchs, ncol=ncol, loc='center left', prop={'size': 18},
114 | handlelength=1, handleheight=1, facecolor='white', framealpha=0)
115 | plt.tight_layout()
116 |
117 | if show:
118 | plt.show()
119 | if savefile is not None:
120 | plt.savefig(savefile, dpi=400)
121 | plt.close()
122 |
123 |
124 | if __name__ == '__main__':
125 |
126 | args = get_args()
127 |
128 | scenes_dir = os.path.join(args.data_dir, 'scenes')
129 |
130 | clip_model, _, preprocess = open_clip.create_model_and_transforms(CLIP_BACKBONE, pretrained=CLIP_CHECKPOINT)
131 | clip_model.to(args.device)
132 | clip_tokenizer = open_clip.get_tokenizer(CLIP_BACKBONE)
133 |
134 | scene_dir = os.path.join(scenes_dir, args.scene_name)
135 | t_file = os.path.join(scene_dir, 'transforms.json')
136 | pcd_file = os.path.join(scene_dir, 'ns', 'point_cloud.ply')
137 | dt_file = os.path.join(scene_dir, 'ns', 'dataparser_transforms.json')
138 |
139 | query_pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=None)
140 | query_pts = torch.Tensor(query_pts).to(args.device)
141 |
142 | result = predict_physical_property_query(args, query_pts, scene_dir, clip_model, clip_tokenizer,
143 | return_all=True)
144 |
145 | out_dir = os.path.join('viz', args.viz_save_name)
146 | os.makedirs(out_dir, exist_ok=True)
147 |
148 | # legend for materials
149 | mat_names = result['mat_names']
150 | cmap_tab10 = mpl.colormaps['tab10']
151 | make_legend([cmap_tab10(i) for i in range(len(mat_names))], mat_names,
152 | savefile=os.path.join(out_dir, '%s_legend.png' % args.viz_save_name), show=args.show)
153 |
154 | # camera for rendering
155 | w2cs, K = parse_transforms_json(t_file, return_w2c=True)
156 | view_idx = 0
157 | w2c = w2cs[view_idx]
158 | w2c[[1, 2]] *= -1 # convert from nerfstudio to open3d format
159 | imgs = load_images(os.path.join(scene_dir, 'images'))
160 | orig_img = imgs[view_idx] / 255.
161 |
162 | # RGB reconstruction
163 | rgb_pcd = o3d.io.read_point_cloud(pcd_file)
164 | rgb_pcd.points = o3d.utility.Vector3dVector(query_pts.cpu().numpy())
165 | render = render_pcd(rgb_pcd, w2c, K, show=args.show)
166 | if not args.show:
167 | Image.fromarray(imgs[view_idx]).save(os.path.join(out_dir, '%s_rgb.png' % args.viz_save_name))
168 |
169 | # features PCA
170 | pca_pcd = o3d.geometry.PointCloud()
171 | pca_pcd.points = o3d.utility.Vector3dVector(query_pts.cpu().numpy())
172 | colors_pca = features_to_colors(result['query_features'])
173 | pca_pcd.colors = o3d.utility.Vector3dVector(colors_pca)
174 | render = render_pcd(pca_pcd, w2c, K, show=args.show)
175 | if not args.show:
176 | combined = composite_and_save(orig_img, render, args.compositing_alpha,
177 | savefile=os.path.join(out_dir, '%s_pca.png' % args.viz_save_name))
178 |
179 | # material segmentation
180 | seg_pcd = o3d.geometry.PointCloud()
181 | seg_pcd.points = o3d.utility.Vector3dVector(query_pts.cpu().numpy())
182 | colors_seg = similarities_to_colors(result['query_similarities'])
183 | seg_pcd.colors = o3d.utility.Vector3dVector(colors_seg)
184 | render = render_pcd(seg_pcd, w2c, K, show=args.show)
185 | if not args.show:
186 | combined = composite_and_save(orig_img, render, args.compositing_alpha,
187 | savefile=os.path.join(out_dir, '%s_seg.png' % args.viz_save_name))
188 |
189 | # physical property values
190 | val_pcd = o3d.geometry.PointCloud()
191 | val_pcd.points = o3d.utility.Vector3dVector(query_pts.cpu().numpy())
192 | colors_val = values_to_colors(np.mean(result['query_pred_vals'], axis=1), args.cmap_min, args.cmap_max)
193 | val_pcd.colors = o3d.utility.Vector3dVector(colors_val)
194 | render = render_pcd(val_pcd, w2c, K, show=args.show)
195 | if not args.show:
196 | combined = composite_and_save(orig_img, render, args.compositing_alpha,
197 | savefile=os.path.join(out_dir, '%s_%s.png' % (args.viz_save_name, args.property_name)))
198 |
199 |
--------------------------------------------------------------------------------