├── .gitignore ├── LICENSE ├── README.md ├── arguments.py ├── captioning.py ├── carving.py ├── docs ├── example_vid.gif └── overview.png ├── evaluation.py ├── feature_fusion.py ├── gpt_inference.py ├── material_proposal.py ├── ns_reconstruction.py ├── predict_property.py ├── requirements.txt ├── utils.py └── visualization.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | preds/ 3 | viz/ 4 | *.pt 5 | my_api_key.py 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/#use-with-ide 116 | .pdm.toml 117 | 118 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 166 | #.idea/ 167 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 ajzhai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NeRF2Physics: Physical Property Understanding from Language-Embedded Feature Fields 2 | 3 | Albert J. Zhai, Yuan Shen, Emily Y. Chen, Gloria X. Wang, Xinlei Wang, Sheng Wang, Kaiyu Guan, Shenlong Wang
4 | University of Illinois at Urbana-Champaign 5 | 6 | CVPR 2024 7 | 8 | [Paper](https://arxiv.org/abs/2404.04242) │ [Project Page](https://ajzhai.github.io/NeRF2Physics/) 9 | 10 | 11 | **Infer physical properties densely in 3D for any object!**
12 | ![Example Video](docs/example_vid.gif) 13 | 14 | ## Requirements 15 | ### Installing Dependencies 16 | Our method involves 3D reconstruction using [Nerfstudio](https://docs.nerf.studio/). If you want to modify the reconstruction or reproduce it on your own data, you will need to follow the [official instructions to install Nerfstudio](https://docs.nerf.studio/quickstart/installation.html). 17 | 18 | Besides the initial reconstruction, the rest of our method operates on the extracted depth maps/point cloud (which we provide in our dataset) and thus does not require Nerfstudio. So if you don't wish to run Nerfstudio, the only things you need to do are 1) [install PyTorch](https://pytorch.org/get-started/locally/), and 2) run 19 | ``` 20 | pip install -r requirements.txt 21 | ``` 22 | to install the remaining dependencies. 23 | 24 | ### BLIP-2 Model 25 | Our method uses [BLIP-2-Flan-T5-XL](https://huggingface.co/Salesforce/blip2-flan-t5-xl) for image captioning. To download the model weights, clone the model repository into the root directory of this repository (you will need Git LFS). You can also download it elsewhere and specify the location via the `--blip2_model_dir` argument. 26 | 27 | ### OpenAI API Key 28 | Our method involves calling GPT via the [OpenAI API](https://platform.openai.com/). This requires having an account with some credits on it (usage will be fairly minimal). Once you have an account, find your API key [here](https://platform.openai.com/api-keys) and set a variable named `OPENAI_API_KEY` to your key in a Python file named `my_api_key.py`. Example (replace `<your_api_key>` with your API key): 29 | ``` 30 | echo "OPENAI_API_KEY = '<your_api_key>'" >> ./my_api_key.py 31 | ``` 32 | 33 | ## ABO-500 Dataset 34 | We provide **ABO-500**, a dataset of multi-view images of objects from [Amazon Berkeley Objects (ABO)](https://amazon-berkeley-objects.s3.amazonaws.com/index.html) with camera parameters and ground-truth object weights. We also provide the intermediate outputs from our method so that you can run any part of our method without needing to run the previous parts. Please download the dataset via [this Box link](https://uofi.box.com/shared/static/743ydh4n1xi0dj05lcyyg4evqk2n4dko.zip) 35 | and unzip into a directory `data/` (you can also put it elsewhere and specify `--data_dir` later). 36 | 37 | Example with curl: 38 | ``` 39 | curl -L https://uofi.box.com/shared/static/743ydh4n1xi0dj05lcyyg4evqk2n4dko.zip \ 40 | --output ./abo_500.zip 41 | unzip ./abo_500.zip -d ./data/ 42 | ``` 43 | 44 | ## Usage 45 | Overview of our method: 46 | ![Overview](docs/overview.png) 47 | We provide separate Python scripts for running each component in our method. Command-line arguments for all of the scripts can be found in `arguments.py`. Intermediate outputs get stored in the scene data directories. If you are using our provided dataset, you can start from anywhere along the pipeline without running the previous components. 48 | 49 | ### 3D Reconstruction 50 | We use Nerfstudio to train NeRFs and extract depth maps and point clouds. We have wrapped all of the Nerfstudio commands into `ns_reconstruction.py`. Example (only processes one scene): 51 | ``` 52 | python ns_reconstruction.py --end_idx 1 53 | ``` 54 | This is the only step that requires Nerfstudio. 55 | 56 | 57 | ### CLIP Feature Fusion 58 | Feature fusion is done in `feature_fusion.py`.
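At a high level, this step back-projects every point of the NeRF point cloud into the training views, crops a `--patch_size` patch around each projection that passes a depth-based occlusion check, encodes the patches with CLIP (ViT-B-16, DataComp-XL weights), and saves the per-view features together with a visibility mask for the later stages. The snippet below is only a simplified sketch of that project-crop-encode-average idea, not the repo's code: `pixel_coords` (the point's projected pixel location in each view where it is visible) is a hypothetical input that would come from the camera poses, and occlusion handling is omitted.
```python
import torch
import open_clip

# Simplified, single-point sketch of CLIP feature fusion (assumed helper inputs;
# the real batched, occlusion-aware version is get_patch_features in feature_fusion.py).
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-16', pretrained='datacomp_xl_s13b_b90k')
model.eval()

def fuse_point_feature(pixel_coords, images, patch_size=56):
    """Average CLIP embeddings of the patches around one 3D point's projections.

    pixel_coords: list of (x, y) pixel locations of the point in each view where it
                  is visible (hypothetical input, obtained by projecting with the poses).
    images:       list of PIL images corresponding to pixel_coords.
    """
    half = patch_size // 2
    feats = []
    for (x, y), img in zip(pixel_coords, images):
        patch = img.crop((x - half, y - half, x + half, y + half))
        with torch.no_grad():
            feats.append(model.encode_image(preprocess(patch).unsqueeze(0)))
    fused = torch.cat(feats).mean(dim=0)
    return fused / fused.norm()  # unit norm, ready for cosine similarity with text features
```
In the actual pipeline, the saved `patch_features_*.pt` and `is_visible_*.pt` tensors produced by `feature_fusion.py` are what `predict_property.py` later averages over visible views and compares against CLIP text embeddings of the candidate material names.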
Example (only processes one scene): 59 | ``` 60 | python feature_fusion.py --end_idx 1 61 | ``` 62 | 63 | ### Captioning and View Selection 64 | Captioning and view selection are both done in `captioning.py`. Example (only processes one scene): 65 | ``` 66 | python captioning.py --end_idx 1 67 | ``` 68 | This requires downloading the BLIP-2 model (see Requirements above). 69 | 70 | 71 | ### LLM Material Proposal 72 | Material proposal is done in `material_proposal.py`. You can specify the physical property of interest using the `--property_name` argument. Currently, only mass density, friction, and Shore hardness are supported, but feel free to make prompts for other properties (see `gpt_inference.py`). Example (only processes one scene): 73 | ``` 74 | python material_proposal.py --property_name density --end_idx 1 75 | ``` 76 | This requires setting your OpenAI API Key (see Requirements above). 77 | 78 | ### CLIP-based Kernel Regression (Final Prediction) 79 | Physical properties are predicted using CLIP-based kernel regression in `predict_property.py`. Example (only processes one scene): 80 | ``` 81 | python predict_property.py --property_name density --end_idx 1 82 | ``` 83 | By default, the script will predict a volume integral of the physical property (e.g. predicting mass by integrating density). You can instead get dense results for a 3D grid of points by setting `--prediction_mode` to `grid`. You can also write your own code using the `predict_physical_property_query` function to query points however you want. 84 | 85 | ### Evaluation 86 | We provide a script for quantitative evaluation of mass predictions in `evaluation.py`. The results will be printed in your terminal. Example: 87 | ``` 88 | python evaluation.py 89 | ``` 90 | Explanations of each metric can be found in our paper. 91 | 92 | ### Visualization 93 | We provide a script for interactively viewing and saving 3D visualizations in `visualization.py`. You should specify the scene name using the `--scene_name` argument. Example: 94 | ``` 95 | python visualization.py --scene_name B075YQXRBS_ATVPDKIKX0DER 96 | ``` 97 | 98 | ## Using Custom Data 99 | To run our method on your own data, you can use [Nerfstudio's data processing tool](https://docs.nerf.studio/quickstart/custom_dataset.html) to convert your data into the right format. You can then run the components of our method in order. 100 | 101 | 102 | ## Citation 103 | Please cite our paper if you find this repo useful!
104 | ```bibtex 105 | @inproceedings{zhai2024physical, 106 | title={Physical Property Understanding from Language-Embedded Feature Fields}, 107 | author={Zhai, Albert J and Shen, Yuan and Chen, Emily Y and Wang, Gloria X and Wang, Xinlei and Wang, Sheng and Guan, Kaiyu and Wang, Shenlong}, 108 | booktitle={CVPR}, 109 | year={2024} 110 | } 111 | ``` -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_args(): 5 | parser = argparse.ArgumentParser(description='NeRF2Physics') 6 | 7 | # General arguments 8 | parser.add_argument('--data_dir', type=str, default="./data/abo_500/", 9 | help='path to data (default: ./data/abo_500/)') 10 | parser.add_argument('--split', type=str, default="all", 11 | help='dataset split, either train, val, train+val, test, or all (default: all)') 12 | parser.add_argument('--start_idx', type=int, default=0, 13 | help='starting scene index, useful for evaluating only a few scenes (default: 0)') 14 | parser.add_argument('--end_idx', type=int, default=-1, 15 | help='ending scene index, useful for evaluating only a few scenes (default: -1)') 16 | parser.add_argument('--different_Ks', type=int, default=0, 17 | help='whether data has cameras with different intrinsic matrices (default: 0)') 18 | parser.add_argument('--device', type=str, default="cuda", 19 | help='device for torch (default: cuda)') 20 | 21 | # NeRF training 22 | parser.add_argument('--training_iters', type=int, default=20000, 23 | help='number of training iterations (default: 20000)') 24 | parser.add_argument('--near_plane', type=float, default=0.4, 25 | help='near plane for ray sampling (default: 0.4)') 26 | parser.add_argument('--far_plane', type=float, default=6.0, 27 | help='far plane for ray sampling (default: 6.0)') 28 | parser.add_argument('--vis_mode', type=str, default='wandb', 29 | help='nerfstudio visualization mode (default: wandb)') 30 | parser.add_argument('--project_name', type=str, default='NeRF2Physics', 31 | help='project name used by wandb (default: NeRF2Physics)') 32 | 33 | # NeRF point cloud 34 | parser.add_argument('--num_points', type=int, default=100000, 35 | help='number of points for point cloud (default: 100000)') 36 | parser.add_argument('--bbox_size', type=float, default=1.0, 37 | help='bounding box (cube) size, relative to scaled scene (default: 1.0)') 38 | 39 | # CLIP feature fusion 40 | parser.add_argument('--patch_size', type=int, default=56, 41 | help='patch size (default: 56)') 42 | parser.add_argument('--batch_size', type=int, default=16, 43 | help='batch size (default: 16)') 44 | parser.add_argument('--feature_voxel_size', type=float, default=0.01, 45 | help='voxel downsampling size for features, relative to scaled scene (default: 0.01)') 46 | parser.add_argument('--feature_save_name', type=str, default="ps56", 47 | help='feature save name (default: ps56)') 48 | parser.add_argument('--occ_thr', type=float, default=0.01, 49 | help='occlusion threshold, relative to scaled scene (default: 0.01)') 50 | 51 | # Captioning and view selection 52 | parser.add_argument('--blip2_model_dir', type=str, default="./blip2-flan-t5-xl", 53 | help='path to BLIP2 model directory (default: ./blip2-flan-t5-xl)') 54 | parser.add_argument('--mask_area_percentile', type=float, default=0.75, 55 | help='mask area percentile for canonical view (default: 0.75)') 56 | parser.add_argument('--caption_save_name', type=str, default="info_new", 57 |
help='caption save name (default: info_new)') 58 | 59 | # Material proposal 60 | parser.add_argument('--caption_load_name', type=str, default="info_new", 61 | help='name of saved caption to load (default: info_new)') 62 | parser.add_argument('--property_name', type=str, default="density", 63 | help='property to predict (default: density)') 64 | parser.add_argument('--include_thickness', type=int, default=1, 65 | help='whether to also predict thickness (default: 1)') 66 | parser.add_argument('--gpt_model_name', type=str, default="gpt-3.5-turbo", 67 | help='GPT model name (default: gpt-3.5-turbo)') 68 | parser.add_argument('--mats_save_name', type=str, default="info_new", 69 | help='candidate materials save name (default: info_new)') 70 | 71 | # Physical property prediction (uses property_name argument from above) 72 | parser.add_argument('--mats_load_name', type=str, default="info", 73 | help='candidate materials load name (default: info)') 74 | parser.add_argument('--feature_load_name', type=str, default="ps56", 75 | help='feature load name (default: ps56)') 76 | parser.add_argument('--prediction_mode', type=str, default="integral", 77 | help="can be either 'integral' or 'grid' (default: integral)") 78 | parser.add_argument('--temperature', type=float, default=0.1, 79 | help='softmax temperature for kernel regression (default: 0.01)') 80 | parser.add_argument('--sample_voxel_size', type=float, default=0.005, 81 | help='voxel downsampling size for sampled points, relative to scaled scene (default: 0.005)') 82 | parser.add_argument('--volume_method', type=str, default="thickness", 83 | help="method for volume estimation, either 'thickness' or 'carving' (default: thickness)") 84 | parser.add_argument('--correction_factor', type=float, default=0.6, 85 | help='correction factor for integral prediction (default: 0.6)') 86 | parser.add_argument('--show_mat_seg', type=int, default=0, 87 | help="whether to show visualization of material segmentation (default: 0)") 88 | parser.add_argument('--save_preds', type=int, default=1, 89 | help='whether to save predictions (default: 1)') 90 | parser.add_argument('--preds_save_name', type=str, default="mass", 91 | help='predictions save name (default: mass)') 92 | 93 | # Evaluation 94 | parser.add_argument('--preds_json_path', type=str, default="./preds/preds_mass.json", 95 | help='path to predictions JSON file (default: ./preds/preds_mass.json)') 96 | parser.add_argument('--gts_json_path', type=str, default="./data/abo_500/filtered_product_weights.json", 97 | help='path to ground truth JSON file (default: ./data/abo_500_50/filtered_product_weights.json)') 98 | parser.add_argument('--clamp_min', type=float, default=0.01, 99 | help='minimum value to clamp predictions (default: 0.01)') 100 | parser.add_argument('--clamp_max', type=float, default=100., 101 | help='maximum value to clamp predictions (default: 100.)') 102 | 103 | # Visualization 104 | parser.add_argument('--scene_name', type=str, 105 | help='scene name for visualization (must be provided)') 106 | parser.add_argument('--show', type=int, default=1, 107 | help='whether to show interactive viewer (default: 1)') 108 | parser.add_argument('--compositing_alpha', type=float, default=0.2, 109 | help='alpha for compositing with RGB image (default: 0.2)') 110 | parser.add_argument('--cmap_min', type=float, default=500, 111 | help='minimum physical property value for colormap (default: 500)') 112 | parser.add_argument('--cmap_max', type=float, default=3500, 113 | help='maximum physical property value for 
colormap (default: 3500)') 114 | parser.add_argument('--viz_save_name', type=str, default="tmp", 115 | help='visualization save name (default: tmp)') 116 | 117 | args = parser.parse_args() 118 | 119 | return args -------------------------------------------------------------------------------- /captioning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | import matplotlib.pyplot as plt 5 | import torch 6 | from transformers import AutoProcessor, Blip2ForConditionalGeneration 7 | 8 | from utils import load_images, get_scenes_list 9 | from arguments import get_args 10 | 11 | 12 | CAPTIONING_PROMPT = "Question: Give a detailed description of the object. Answer:" 13 | 14 | 15 | def load_blip2(model_name, device='cuda'): 16 | processor = AutoProcessor.from_pretrained(model_name) 17 | model = Blip2ForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float16) 18 | model = model.to(device) 19 | model.eval() 20 | return model, processor 21 | 22 | 23 | def display_model_size(model): 24 | param_size = 0 25 | for param in model.parameters(): 26 | param_size += param.nelement() * param.element_size() 27 | buffer_size = 0 28 | for buffer in model.buffers(): 29 | buffer_size += buffer.nelement() * buffer.element_size() 30 | 31 | size_all_mb = (param_size + buffer_size) / 1024**2 32 | print('model size: {:.3f}MB'.format(size_all_mb)) 33 | 34 | 35 | def generate_text(img, model, processor, prompt=CAPTIONING_PROMPT, device='cuda'): 36 | if prompt is not None: 37 | inputs = processor(img, text=prompt, return_tensors="pt").to(device, torch.float16) 38 | else: 39 | inputs = processor(img, return_tensors="pt").to(device, torch.float16) 40 | 41 | generated_ids = model.generate(**inputs, max_new_tokens=30) 42 | generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() 43 | return generated_text 44 | 45 | 46 | def predict_caption(args, scene_dir, vqa_model, vqa_processor, show=False): 47 | img_dir = os.path.join(scene_dir, 'images') 48 | imgs, masks = load_images(img_dir, return_masks=True) 49 | mask_areas = [np.mean(mask) for mask in masks] 50 | 51 | idx_to_caption = np.argsort(mask_areas)[int(len(mask_areas) * args.mask_area_percentile)] 52 | img_to_caption = imgs[idx_to_caption] 53 | 54 | with torch.no_grad(): 55 | caption = generate_text(img_to_caption, vqa_model, vqa_processor, device=args.device) 56 | 57 | info = {'idx_to_caption': str(idx_to_caption), 'caption': caption} 58 | 59 | print('scene: %s, info:' % os.path.basename(scene_dir), info) 60 | if show: 61 | plt.imshow(img_to_caption) 62 | plt.show() 63 | 64 | # save info to json 65 | with open(os.path.join(scene_dir, '%s.json' % args.caption_save_name), 'w') as f: 66 | json.dump(info, f, indent=4) 67 | 68 | return info 69 | 70 | 71 | if __name__ == '__main__': 72 | 73 | args = get_args() 74 | 75 | scenes_dir = os.path.join(args.data_dir, 'scenes') 76 | scenes = get_scenes_list(args) 77 | 78 | model, processor = load_blip2(args.blip2_model_dir, device=args.device) 79 | 80 | for j, scene in enumerate(scenes): 81 | caption_info = predict_caption(args, os.path.join(scenes_dir, scene), model, processor) 82 | 83 | -------------------------------------------------------------------------------- /carving.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import open3d as o3d 4 | import time 5 | import torch 6 | from PIL import Image 7 | from utils 
import * 8 | 9 | 10 | def get_bounding_box(pts, percentile=1.0, buffer=0.1): 11 | """Get the bounding box of a point cloud.""" 12 | xyz1 = np.percentile(pts, percentile, axis=0) 13 | xyz2 = np.percentile(pts, 100 - percentile, axis=0) 14 | lwh = xyz2 - xyz1 15 | xyz1 -= buffer * lwh 16 | xyz2 += buffer * lwh 17 | return xyz1, xyz2 18 | 19 | 20 | def get_grid_points(xyz1, xyz2, grid_cell_size): 21 | """Get grid points.""" 22 | x1, y1, z1 = xyz1 23 | x2, y2, z2 = xyz2 24 | x = np.arange(x1, x2, grid_cell_size) 25 | y = np.arange(y1, y2, grid_cell_size) 26 | z = np.arange(z1, z2, grid_cell_size) 27 | xx, yy, zz = np.meshgrid(x, y, z) 28 | grid_pts = np.stack([xx, yy, zz], axis=-1).reshape(-1, 3) 29 | return grid_pts 30 | 31 | 32 | 33 | def project_3d_to_2d(pts, w2c, K, return_dists=False): 34 | """Project 3D points to 2D (nerfstudio format).""" 35 | pts = np.array(pts) 36 | K = np.hstack([K, np.zeros((3, 1))]) 37 | pts = np.concatenate([pts, np.ones((pts.shape[0], 1))], axis=1) 38 | pts = np.dot(pts, w2c.T) 39 | pts[:, [1, 2]] *= -1 40 | if return_dists: 41 | dists = np.linalg.norm(pts[:, :3], axis=-1) 42 | pts = np.dot(pts, K.T) 43 | pts_2d = pts[:, :2] / pts[:, 2:] 44 | if return_dists: 45 | return pts_2d, dists 46 | return pts_2d 47 | 48 | def project_3d_to_2d_torch(pts, w2c, K, return_dists=False): 49 | """Project 3D points to 2D (nerfstudio format).""" 50 | device = pts.device 51 | K = torch.cat([K, torch.zeros((3, 1), device=device)], 1) 52 | pts = torch.cat([pts, torch.ones((pts.shape[0], 1), device=device)], 1) 53 | pts = torch.matmul(pts, w2c.t()) 54 | pts[:, [1, 2]] *= -1 55 | if return_dists: 56 | dists = torch.norm(pts[:, :3], dim=-1) 57 | pts = torch.matmul(pts, K.t()) 58 | pts_2d = pts[:, :2] / pts[:, 2:] 59 | if return_dists: 60 | return pts_2d, dists 61 | return pts_2d 62 | 63 | 64 | def depth_to_distance(depth, K): 65 | """Convert depth map to distance from camera.""" 66 | h, w = depth.shape 67 | x, y = np.meshgrid(np.arange(w), np.arange(h)) 68 | x = x.flatten() 69 | y = y.flatten() 70 | depth = depth.flatten() 71 | pts = np.stack([x, y, np.ones_like(x)], axis=1) 72 | pts = np.dot(pts, np.linalg.inv(K).T) 73 | pts *= depth[:, None] 74 | dists = np.linalg.norm(pts, axis=1) 75 | dists = dists.reshape(h, w) 76 | return dists 77 | 78 | 79 | def depth_to_distance_torch(depth, K): 80 | """Convert depth map to distance from camera.""" 81 | h, w = depth.shape 82 | x, y = torch.meshgrid(torch.arange(w), torch.arange(h)) 83 | x = x.flatten() 84 | y = y.flatten() 85 | depth = depth.flatten() 86 | pts = torch.stack([x, y, torch.ones_like(x)], dim=1).float().to(depth.device) 87 | pts = torch.matmul(pts, torch.inverse(K).t()) 88 | pts *= depth[:, None] 89 | dists = torch.norm(pts, dim=1) 90 | dists = dists.reshape(h, w) 91 | return dists 92 | 93 | 94 | def carve_numpy(pts, masks, depths, w2cs, K, dist_thr): 95 | n_imgs = len(masks) 96 | 97 | for i in range(n_imgs): 98 | h, w = masks[i].shape 99 | pts_2d, dists = project_3d_to_2d(pts, w2cs[i], K, return_dists=True) 100 | pts_2d = np.round(pts_2d).astype(np.int32) 101 | pts_2d = np.clip(pts_2d, 0, [w - 1, h - 1]) 102 | 103 | observed_dists = depths[i] 104 | 105 | is_in_mask = masks[i][pts_2d[:, 1], pts_2d[:, 0]] 106 | is_behind_depth = dists > observed_dists[pts_2d[:, 1], pts_2d[:, 0]] - dist_thr 107 | pts = pts[is_in_mask & is_behind_depth] 108 | 109 | return pts 110 | 111 | def carve_torch(pts, masks, depths, w2cs, K, dist_thr, mask_only=False): 112 | n_imgs = len(masks) 113 | 114 | with torch.no_grad(): 115 | mask_votes = 
torch.zeros(len(pts), device=pts.device, dtype=torch.int32) 116 | depth_votes = torch.zeros(len(pts), device=pts.device, dtype=torch.int32) 117 | for i in range(n_imgs): 118 | h, w = masks[i].shape 119 | pts_2d, dists = project_3d_to_2d_torch(pts, w2cs[i], K, return_dists=True) 120 | pts_2d = torch.round(pts_2d).long().to(pts.device) 121 | pts_2d[:, 0] = torch.clamp(pts_2d[:, 0], 0, w - 1) 122 | pts_2d[:, 1] = torch.clamp(pts_2d[:, 1], 0, h - 1) 123 | 124 | observed_dists = depths[i] 125 | 126 | is_in_mask = masks[i][pts_2d[:, 1], pts_2d[:, 0]] 127 | is_behind_depth = dists > observed_dists[pts_2d[:, 1], pts_2d[:, 0]] - dist_thr 128 | mask_votes[is_in_mask] += 1 129 | depth_votes[is_behind_depth] += 1 130 | if mask_only: 131 | pts = pts[mask_votes == n_imgs] 132 | else: 133 | pts = pts[(mask_votes == n_imgs) & (depth_votes == n_imgs)] 134 | 135 | return pts 136 | 137 | 138 | def get_carved_pts(scene_dir, grid_cell_size_ns=1/512, dist_thr_ns=0.01, verbose=False, device='cuda'): 139 | scene_name = os.path.basename(scene_dir) 140 | pcd_file = os.path.join(scene_dir, 'ns', 'point_cloud.ply') 141 | dt_file = os.path.join(scene_dir, 'ns', 'dataparser_transforms.json') 142 | t_file = os.path.join(scene_dir, 'transforms.json') 143 | img_dir = os.path.join(scene_dir, 'images') 144 | depth_dir = os.path.join(scene_dir, 'ns', 'renders', 'depth') 145 | 146 | pts = load_ns_point_cloud(pcd_file, dt_file) 147 | w2cs, K = parse_transforms_json(t_file, return_w2c=True) 148 | ns_transform, scale = parse_dataparser_transforms_json(dt_file) 149 | imgs, masks = load_images(img_dir, return_masks=True) 150 | depths = load_depths(depth_dir, Ks=None) 151 | 152 | xyz1, xyz2 = get_bounding_box(pts) 153 | grid_cell_size = grid_cell_size_ns / scale 154 | grid_pts = get_grid_points(xyz1, xyz2, grid_cell_size) 155 | dist_thr = dist_thr_ns / scale 156 | 157 | grid_pts = torch.from_numpy(grid_pts).float().to(device) 158 | masks = [torch.from_numpy(mask).to(device) for mask in masks] 159 | depths = [torch.from_numpy(depth).to(device) for depth in depths] 160 | w2cs = [torch.from_numpy(w2c).float().to(device) for w2c in w2cs] 161 | K = torch.from_numpy(K).float().to(device) 162 | 163 | carved = carve_torch(grid_pts, masks, depths, w2cs, K, dist_thr) 164 | if verbose: 165 | print('scene: %s, num. surface points: %d, num. 
carved points: %d, scale: %.4f' % 166 | (scene_name, len(pts), len(carved), scale)) 167 | 168 | return carved, grid_cell_size 169 | 170 | 171 | if __name__ == '__main__': 172 | scene_dir = '/home/azhai/n2p/data/debug/B075X4J15G_ATVPDKIKX0DER' 173 | 174 | carved, grid_cell_size = get_carved_pts(scene_dir) 175 | carved = carved.cpu().numpy() 176 | pcd = o3d.geometry.PointCloud() 177 | pcd.points = o3d.utility.Vector3dVector(carved) 178 | o3d.visualization.draw_geometries([pcd]) -------------------------------------------------------------------------------- /docs/example_vid.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajzhai/NeRF2Physics/6d81c093ed05434f31c0c735fdad1e51355bde86/docs/example_vid.gif -------------------------------------------------------------------------------- /docs/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajzhai/NeRF2Physics/6d81c093ed05434f31c0c735fdad1e51355bde86/docs/overview.png -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | from arguments import get_args 5 | 6 | 7 | # preds should be Nx2, gts should be N 8 | def ADE(preds, gts): 9 | point_preds = np.mean(preds, axis=1) 10 | ade = np.abs(point_preds - gts) 11 | return np.mean(ade) 12 | 13 | def ALDE(preds, gts): 14 | point_preds = np.mean(preds, axis=1) 15 | alde = np.abs(np.log(point_preds) - np.log(gts)) 16 | return np.mean(alde) 17 | 18 | def APE(preds, gts): 19 | point_preds = np.mean(preds, axis=1) 20 | ape = np.abs(point_preds - gts) / gts 21 | return np.mean(ape) 22 | 23 | def MnRE(preds, gts): 24 | point_preds = np.mean(preds, axis=1) 25 | p_over_t = point_preds / gts 26 | t_over_p = gts / point_preds 27 | ratios = np.vstack([p_over_t, t_over_p]) 28 | mnre = np.min(ratios, axis=0) 29 | return np.mean(mnre) 30 | 31 | 32 | def show_metrics(preds, gts): 33 | print('ADE %.3f' % ADE(preds, gts)) 34 | print('ALDE %.3f' % ALDE(preds, gts)) 35 | print('APE %.3f' % APE(preds, gts)) 36 | print('MnRE %.3f' % MnRE(preds, gts)) 37 | 38 | 39 | if __name__ == '__main__': 40 | 41 | args = get_args() 42 | 43 | with open(args.preds_json_path, 'r') as f: 44 | preds_dict = json.load(f) 45 | with open(args.gts_json_path, 'r') as f: 46 | gts_dict = json.load(f) 47 | 48 | preds = np.zeros((len(preds_dict), 2)) 49 | gts = np.zeros(len(preds_dict)) 50 | for i, (k, v) in enumerate(preds_dict.items()): 51 | preds[i] = v 52 | gts[i] = gts_dict[k.split('_')[0]] 53 | print(preds, gts) 54 | 55 | show_metrics(preds, gts) 56 | -------------------------------------------------------------------------------- /feature_fusion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | import torch 5 | import open_clip 6 | from PIL import Image 7 | 8 | from utils import * 9 | from arguments import get_args 10 | 11 | 12 | CLIP_BACKBONE = 'ViT-B-16' 13 | CLIP_CHECKPOINT = 'datacomp_xl_s13b_b90k' 14 | CLIP_INPUT_SIZE = 224 15 | CLIP_OUTPUT_SIZE = 512 16 | 17 | 18 | def get_patch_features(pts, imgs, depths, w2cs, K, model, preprocess_fn, occ_thr, 19 | patch_size=56, batch_size=8, device='cuda'): 20 | n_imgs = len(imgs) 21 | n_pts = len(pts) 22 | 23 | patch_features = torch.zeros(n_imgs, n_pts, CLIP_OUTPUT_SIZE, device=device, 
requires_grad=False) 24 | is_visible = torch.zeros(n_imgs, n_pts, device=device, dtype=torch.bool, requires_grad=False) 25 | half_patch_size = patch_size // 2 26 | 27 | K = np.array(K) 28 | with torch.no_grad(), torch.cuda.amp.autocast(): 29 | model.to(device) 30 | 31 | for i in range(n_imgs): 32 | h, w, c = imgs[i].shape 33 | if len(K.shape) == 3: 34 | curr_K = K[i] 35 | else: 36 | curr_K = K 37 | pts_2d, dists = project_3d_to_2d(pts, w2cs[i], curr_K, return_dists=True) 38 | pts_2d = np.round(pts_2d).astype(np.int32) 39 | 40 | observed_dists = depths[i] 41 | 42 | # loop through pts in batches 43 | for batch_start in range(0, n_pts, batch_size): 44 | curr_batch_size = min(batch_size, n_pts - batch_start) 45 | batch_patches = torch.zeros(curr_batch_size, 3, CLIP_INPUT_SIZE, CLIP_INPUT_SIZE, device=device) 46 | 47 | for j in range(curr_batch_size): 48 | x, y = pts_2d[batch_start + j] 49 | 50 | if x >= half_patch_size and x < w - half_patch_size and \ 51 | y >= half_patch_size and y < h - half_patch_size: 52 | is_occluded = dists[batch_start + j] > observed_dists[y, x] + occ_thr 53 | if not is_occluded: 54 | patch = imgs[i][y - half_patch_size:y + half_patch_size, x - half_patch_size:x + half_patch_size] 55 | patch = Image.fromarray(patch) 56 | 57 | patch = preprocess_fn(patch).unsqueeze(0).to(device) 58 | batch_patches[j] = patch 59 | is_visible[i, batch_start + j] = True 60 | 61 | if is_visible[i, batch_start:batch_start + batch_size].any(): 62 | patch_features[i, batch_start:batch_start + curr_batch_size] = model.encode_image(batch_patches) 63 | 64 | return patch_features, is_visible 65 | 66 | 67 | def process_scene(args, scene_dir, model, preprocess_fn): 68 | 69 | scene_name = os.path.basename(scene_dir) 70 | pcd_file = os.path.join(scene_dir, 'ns', 'point_cloud.ply') 71 | dt_file = os.path.join(scene_dir, 'ns', 'dataparser_transforms.json') 72 | t_file = os.path.join(scene_dir, 'transforms.json') 73 | img_dir = os.path.join(scene_dir, 'images') 74 | depth_dir = os.path.join(scene_dir, 'ns', 'renders', 'depth') 75 | 76 | pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=args.feature_voxel_size) 77 | w2cs, K = parse_transforms_json(t_file, return_w2c=True, different_Ks=args.different_Ks) 78 | ns_transform, scale = parse_dataparser_transforms_json(dt_file) 79 | imgs = load_images(img_dir) 80 | depths = load_depths(depth_dir, Ks=None) 81 | 82 | print('scene: %s, points: %d, scale: %.4f' % (scene_name, len(pts), scale)) 83 | 84 | with torch.no_grad(): 85 | occ_thr = args.occ_thr * scale 86 | patch_features, is_visible = get_patch_features(pts, imgs, depths, w2cs, K, 87 | model, preprocess_fn, 88 | occ_thr, patch_size=args.patch_size, batch_size=args.batch_size, 89 | device=args.device) 90 | 91 | out_dir = os.path.join(scene_dir, 'features') 92 | os.makedirs(out_dir, exist_ok=True) 93 | torch.save(patch_features, os.path.join(out_dir, 'patch_features_%s.pt' % args.feature_save_name)) 94 | torch.save(is_visible, os.path.join(out_dir, 'is_visible_%s.pt' % args.feature_save_name)) 95 | with open(os.path.join(out_dir, 'voxel_size_%s.json' % args.feature_save_name), 'w') as f: 96 | json.dump({'voxel_size': args.feature_voxel_size}, f, indent=4) 97 | 98 | return pts, patch_features, is_visible 99 | 100 | 101 | if __name__ == '__main__': 102 | 103 | args = get_args() 104 | 105 | scenes_dir = os.path.join(args.data_dir, 'scenes') 106 | scenes = get_scenes_list(args) 107 | 108 | model, _, preprocess = open_clip.create_model_and_transforms(CLIP_BACKBONE, pretrained=CLIP_CHECKPOINT) 109 | 
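# Note: the CLIP model (CLIP_BACKBONE with the CLIP_CHECKPOINT weights defined at the top of
# this file) and its preprocessing transform are created once and reused for every scene in the
# loop below; process_scene() only swaps in the per-scene points, images, and depth maps.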
model.to(args.device) 110 | 111 | for j, scene in enumerate(scenes): 112 | pts, patch_features, is_visible = process_scene(args, os.path.join(scenes_dir, scene), model, preprocess) 113 | 114 | -------------------------------------------------------------------------------- /gpt_inference.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import base64 3 | import json 4 | 5 | 6 | PRED_CAND_MATS_DENSITY_SYS_MSG = """You will be provided with captions that each describe an image of an object. The captions will be delimited with quotes ("). Based on the caption, give me 5 materials that the object might be made of, along with the mass densities (in kg/m^3) of each of those materials. You may provide a range of values for the mass density instead of a single value. Try to consider all the possible parts of the object. Do not include coatings like "paint" in your answer. 7 | 8 | Format Requirement: 9 | You must provide your answer as a list of 5 (material: mass density) pairs, each separated by a semi-colon (;). Do not include any other text in your answer, as it will be parsed by a code script later. Your answer must look like: 10 | (material 1: low-high kg/m^3);(material 2: low-high kg/m^3);(material 3: low-high kg/m^3);(material 4: low-high kg/m^3);(material 5: low-high kg/m^3) 11 | """ 12 | 13 | PRED_CAND_MATS_DENSITY_SYS_MSG_4V = """You will be given an image of an object. Based on the image, give me a short (5-10 words) description of what the object is, and also 5 materials (e.g. wood, plastic, foam) that the object might be made of, along with the mass densities (in kg/m^3) of each of those materials. You may provide a range of values for the mass density instead of a single value. Try to consider all the possible parts of the object. Do not include coatings like "paint" in your answer. 14 | 15 | Format Requirement: 16 | You must provide your answer in the following JSON format, as it will be parsed by a code script later. Your answer must look like: 17 | { 18 | "description": description 19 | "materials": [ 20 | {"name": material1, "mass density (kg/m^3)": low-high}, 21 | {"name": material2, "mass density (kg/m^3)": low-high}, 22 | {"name": material3, "mass density (kg/m^3)": low-high}, 23 | {"name": material4, "mass density (kg/m^3)": low-high}, 24 | {"name": material5, "mass density (kg/m^3)": low-high} 25 | ] 26 | } 27 | Do not include any other text in your answer. Do not include unnecessary words besides the material in the material name. 28 | """ 29 | 30 | 31 | PRED_CAND_MATS_HARDNESS_SYS_MSG = """You will be provided with captions that each describe an image of an object. The captions will be delimited with quotes ("). Based on the caption, give me 3 materials that the object might be made of, along with the hardness of each of those materials. Choose whether to use Shore A hardness or Shore D hardness depending on the material. You may provide a range of values for hardness instead of a single value. Try to consider all the possible parts of the object. 32 | 33 | Format Requirement: 34 | You must provide your answer as a list of 3 (material: hardness, Shore A/D) tuples, each separated by a semi-colon (;). Do not include any other text in your answer, as it will be parsed by a code script later. Your answer must look like: 35 | (material 1: low-high, );(material 2: low-high, );(material 3: low-high, ) 36 | Make sure to use Shore A or Shore D hardness, not Mohs hardness. 
37 | """ 38 | 39 | PRED_CAND_MATS_FRICTION_SYS_MSG = """You will be provided with captions that each describe an image. The captions will be delimited with quotes ("). Based on the caption, give me 3 materials that the surfaces in the image might be made of, along with the kinetic friction coefficient of each material when sliding against a fabric surface. You may provide a range of values for the friction coefficient instead of a single value. Try to consider all the possible surfaces. 40 | 41 | Format Requirement: 42 | You must provide your answer as a list of 3 (material: friction coefficient) pairs, each separated by a semi-colon (;). Do not include any other text in your answer, as it will be parsed by a code script later. Your answer must look like: 43 | (material 1: low-high);(material 2: low-high);(material 3: low-high) 44 | Try to provide as narrow of a range as possible for the friction coefficient. 45 | """ 46 | 47 | PRED_THICKNESS_SYS_MSG = """You will be provided with captions that each describe an image of an object, along with a set of possible materials used to make the object. For each material, estimate the thickness (in cm) of that material in the object. You may provide a range of values for the thickness instead of a single value. 48 | 49 | Format Requirement: 50 | You must provide your answer as a list of 5 (material: thickness) pairs, each separated by a semi-colon (;). Do not include any other text in your answer, as it will be parsed by a code script later. Your answer must look like: 51 | (material 1: low-high cm);(material 2: low-high cm);(material 3: low-high cm);(material 4: low-high cm);(material 5: low-high cm) 52 | """ 53 | 54 | PRED_THICKNESS_EXAMPLE_INPUT_1 = 'Caption: "a lamp with a white shade" Materials: "fabric, plastic, metal, ceramic, glass"' 55 | PRED_THICKNESS_EXAMPLE_OUTPUT_1 = "(fabric: 0.1-0.2 cm);(plastic: 0.3-1.0 cm);(metal: 0.1-0.2 cm);(ceramic: 0.2-0.5 cm);(glass: 0.3-0.8 cm)" 56 | PRED_THICKNESS_EXAMPLE_INPUT_2 = 'Caption: "a grey ottoman" Materials: "wood, fabric, foam, metal, plastic"' 57 | PRED_THICKNESS_EXAMPLE_OUTPUT_2 = "(wood: 2.0-4.0 cm);(fabric: 0.2-0.5 cm);(foam: 5.0-15.0 cm);(metal: 0.1-0.2 cm);(plastic: 0.5-1.0 cm)" 58 | PRED_THICKNESS_EXAMPLE_INPUT_3 = 'Caption: "a white frame" Materials: "plastic, wood, aluminum, steel, glass"' 59 | PRED_THICKNESS_EXAMPLE_OUTPUT_3 = "(plastic: 0.1-0.3 cm);(wood: 1.0-1.5 cm);(aluminum: 0.1-0.3 cm);(steel: 0.1-0.2 cm);(glass: 0.2-0.5 cm)" 60 | PRED_THICKNESS_EXAMPLE_INPUT_4 = 'Caption: "a metal rack with three shelves" Materials: "steel, aluminum, wood, plastic, iron"' 61 | PRED_THICKNESS_EXAMPLE_OUTPUT_4 = "(steel: 0.1-0.2 cm);(aluminum: 0.1-0.3 cm);(wood: 1.0-2.0 cm);(plastic: 0.5-1.0 cm);(iron: 0.5-1.0 cm)" 62 | 63 | 64 | def gpt_candidate_materials(caption, property_name='density', model_name='gpt-3.5-turbo', seed=100): 65 | 66 | if property_name == 'density': 67 | sys_msg = PRED_CAND_MATS_DENSITY_SYS_MSG 68 | elif property_name == 'hardness': 69 | sys_msg = PRED_CAND_MATS_HARDNESS_SYS_MSG 70 | elif property_name == 'friction': 71 | sys_msg = PRED_CAND_MATS_FRICTION_SYS_MSG 72 | else: 73 | raise NotImplementedError 74 | response = openai.ChatCompletion.create( 75 | model=model_name, 76 | messages=[ 77 | {"role": "system", "content": sys_msg}, 78 | {"role": "user", "content": '"%s"' % caption}, 79 | ], 80 | request_timeout=20, 81 | seed=seed, 82 | ) 83 | return response['choices'][0]['message']['content'] 84 | 85 | 86 | def gpt_thickness(caption, candidate_materials, mode='list', 
model_name='gpt-3.5-turbo', seed=100): 87 | 88 | if mode == 'list': 89 | mat_names, mat_vals = parse_material_list(candidate_materials) 90 | elif mode == 'json': 91 | caption, mat_names, mat_vals = parse_material_json(candidate_materials) 92 | else: 93 | raise NotImplementedError 94 | mat_names_str = ', '.join(mat_names) 95 | user_msg = 'Caption: "%s" Materials: "%s"' % (caption, mat_names_str) 96 | 97 | response = openai.ChatCompletion.create( 98 | model=model_name, 99 | messages=[ 100 | {"role": "system", "content": PRED_THICKNESS_SYS_MSG}, 101 | {"role": "user", "content": PRED_THICKNESS_EXAMPLE_INPUT_1}, 102 | {"role": "assistant", "content": PRED_THICKNESS_EXAMPLE_OUTPUT_1}, 103 | {"role": "user", "content": PRED_THICKNESS_EXAMPLE_INPUT_2}, 104 | {"role": "assistant", "content": PRED_THICKNESS_EXAMPLE_OUTPUT_2}, 105 | {"role": "user", "content": PRED_THICKNESS_EXAMPLE_INPUT_3}, 106 | {"role": "assistant", "content": PRED_THICKNESS_EXAMPLE_OUTPUT_3}, 107 | {"role": "user", "content": PRED_THICKNESS_EXAMPLE_INPUT_4}, 108 | {"role": "assistant", "content": PRED_THICKNESS_EXAMPLE_OUTPUT_4}, 109 | {"role": "user", "content": user_msg}, 110 | ], 111 | request_timeout=20, 112 | seed=seed, 113 | ) 114 | return response['choices'][0]['message']['content'] 115 | 116 | 117 | def parse_material_list(matlist, max_n=5): 118 | elems = matlist.split(';') 119 | if len(elems) > max_n: 120 | print('too many materials %s' % matlist) 121 | return None 122 | 123 | mat_names = [] 124 | mat_vals = [] 125 | 126 | for elem in elems: 127 | elem_parts = elem.strip().split(':') 128 | if len(elem_parts) != 2: 129 | print('bad format %s' % matlist) 130 | return None 131 | mat_name, values = elem_parts 132 | if not mat_name.startswith('(') or mat_name[1].isnumeric() or mat_name.startswith('(material 1'): 133 | print('bad format %s' % matlist) 134 | return None 135 | 136 | mat_name = mat_name[1:] 137 | mat_names.append(mat_name.lower()) # force lowercase 138 | 139 | values = values.strip().split(' ')[0] 140 | values = values.replace(",", "") 141 | if values[-1] == ')': 142 | values = values[:-1] 143 | 144 | # Value may or may not be a range 145 | splitted = values.split('-') 146 | try: 147 | float(splitted[0]) 148 | except ValueError: 149 | print('value cannot be converted to float %s' % matlist) 150 | return None 151 | if len(splitted) == 2: 152 | mat_vals.append([float(splitted[0]), float(splitted[1])]) 153 | elif len(splitted) == 1: 154 | mat_vals.append([float(splitted[0]), float(splitted[0])]) 155 | else: 156 | print('bad format %s' % matlist) 157 | return None 158 | 159 | return mat_names, mat_vals 160 | 161 | 162 | def parse_material_hardness(matlist, max_n=5): 163 | elems = matlist.split(';') 164 | if len(elems) > max_n: 165 | print('too many materials %s' % matlist) 166 | return None 167 | 168 | mat_names = [] 169 | mat_vals = [] 170 | 171 | for elem in elems: 172 | elem_parts = elem.strip().split(':') 173 | if len(elem_parts) != 2: 174 | print('bad format %s' % matlist) 175 | return None 176 | mat_name, values = elem_parts 177 | if not mat_name.startswith('(') or mat_name[1].isnumeric() or mat_name.startswith('(material 1'): 178 | print('bad name %s' % matlist) 179 | return None 180 | 181 | mat_name = mat_name[1:] 182 | mat_names.append(mat_name.lower()) # force lowercase 183 | 184 | values = values.strip().split(',') 185 | units = values[-1].split(' ')[-1][:-1] 186 | if units not in ['A', 'D']: 187 | print('bad units %s' % matlist) 188 | return None 189 | values = values[0] 190 | values = 
values.replace(",", "") 191 | 192 | # Value may or may not be a range 193 | splitted = values.split('-') 194 | try: 195 | float(splitted[0]) 196 | except ValueError: 197 | print('value cannot be converted to float %s' % matlist) 198 | return None 199 | if len(splitted) == 2: 200 | mat_vals.append([float(splitted[0]), float(splitted[1])]) 201 | elif len(splitted) == 1: 202 | mat_vals.append([float(splitted[0]), float(splitted[0])]) 203 | else: 204 | print('bad format %s' % matlist) 205 | return None 206 | 207 | if units == 'D': 208 | mat_vals[-1][0] += 100 209 | mat_vals[-1][1] += 100 210 | 211 | return mat_names, mat_vals 212 | 213 | 214 | def encode_image(image_path): 215 | with open(image_path, "rb") as image_file: 216 | return base64.b64encode(image_file.read()).decode('utf-8') 217 | 218 | 219 | def gpt4v_candidate_materials(image_path, property_name='density', seed=100): 220 | 221 | if property_name == 'density': 222 | sys_msg = PRED_CAND_MATS_DENSITY_SYS_MSG_4V 223 | else: 224 | raise NotImplementedError 225 | 226 | base64_image = encode_image(image_path) 227 | 228 | response = openai.ChatCompletion.create( 229 | model="gpt-4-vision-preview", 230 | messages=[ 231 | { 232 | "role": "system", 233 | "content": sys_msg 234 | }, 235 | { 236 | "role": "user", 237 | "content": [ 238 | { 239 | "type": "image_url", 240 | "image_url": { 241 | "url": f"data:image/png;base64,{base64_image}" 242 | } 243 | }, 244 | ] 245 | } 246 | ], 247 | request_timeout=30, 248 | max_tokens=300, 249 | seed=seed, 250 | # response_format={"type": "json_object"}, 251 | ) 252 | return response['choices'][0]['message']['content'] 253 | 254 | 255 | def parse_material_json(matjson, max_n=5, field_name='mass density (kg/m^3)'): 256 | desc_and_mats = json.loads(matjson) 257 | if 'description' not in desc_and_mats or 'materials' not in desc_and_mats: 258 | print('bad format %s' % matjson) 259 | return None 260 | mat_names = [] 261 | mat_vals = [] 262 | for mat in desc_and_mats['materials']: 263 | if 'name' not in mat or field_name not in mat: 264 | print('bad format %s' % matjson) 265 | return None 266 | mat_name = mat['name'] 267 | mat_names.append(mat_name.lower()) # force lowercase 268 | values = mat[field_name] 269 | # Value may or may not be a range 270 | splitted = values.split('-') 271 | try: 272 | float(splitted[0]) 273 | except ValueError: 274 | print('value cannot be converted to float %s' % matjson) 275 | return None 276 | if len(splitted) == 2: 277 | mat_vals.append([float(splitted[0]), float(splitted[1])]) 278 | elif len(splitted) == 1: 279 | mat_vals.append([float(splitted[0]), float(splitted[0])]) 280 | else: 281 | print('bad format %s' % matjson) 282 | return None 283 | return desc_and_mats['description'], mat_names, mat_vals -------------------------------------------------------------------------------- /material_proposal.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import openai 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from PIL import Image 8 | 9 | from gpt_inference import gpt_candidate_materials, gpt_thickness, parse_material_list, \ 10 | parse_material_hardness, gpt4v_candidate_materials, parse_material_json 11 | from utils import load_images, get_scenes_list 12 | from arguments import get_args 13 | from my_api_key import OPENAI_API_KEY 14 | 15 | 16 | BASE_SEED = 100 17 | 18 | 19 | def gpt_wrapper(gpt_fn, parse_fn, max_tries=10, sleep_time=3): 20 | """Wrap gpt_fn with error handling and 
retrying.""" 21 | tries = 0 22 | # sleep to avoid overloading openai api 23 | time.sleep(sleep_time) 24 | try: 25 | gpt_response = gpt_fn(BASE_SEED + tries) 26 | result = parse_fn(gpt_response) 27 | except Exception as error: 28 | print('error:', error) 29 | result = None 30 | while result is None and tries < max_tries: 31 | tries += 1 32 | time.sleep(sleep_time) 33 | print('retrying...') 34 | try: 35 | gpt_response = gpt_fn(BASE_SEED + tries) 36 | result = parse_fn(gpt_response) 37 | except: 38 | result = None 39 | return gpt_response 40 | 41 | 42 | def show_img_to_caption(scene_dir, idx_to_caption): 43 | img_dir = os.path.join(scene_dir, 'images') 44 | imgs = load_images(img_dir, bg_change=None, return_masks=False) 45 | img_to_caption = imgs[idx_to_caption] 46 | plt.imshow(img_to_caption) 47 | plt.show() 48 | plt.close() 49 | return 50 | 51 | 52 | def predict_candidate_materials(args, scene_dir, show=False): 53 | # load caption info 54 | with open(os.path.join(scene_dir, '%s.json' % args.caption_load_name), 'r') as f: 55 | info = json.load(f) 56 | 57 | caption = info['caption'] 58 | 59 | gpt_fn = lambda seed: gpt_candidate_materials(caption, property_name=args.property_name, 60 | model_name=args.gpt_model_name, seed=seed) 61 | parse_fn = parse_material_hardness if args.property_name == 'hardness' else parse_material_list 62 | candidate_materials = gpt_wrapper(gpt_fn, parse_fn) 63 | 64 | info['candidate_materials_%s' % args.property_name] = candidate_materials 65 | 66 | print('-' * 50) 67 | print('scene: %s, info:' % os.path.basename(scene_dir), info) 68 | print('candidate materials (%s):' % args.property_name) 69 | mat_names, mat_vals = parse_fn(candidate_materials) 70 | for mat_i, mat_name in enumerate(mat_names): 71 | print('%16s: %8.1f -%8.1f' % (mat_name, mat_vals[mat_i][0], mat_vals[mat_i][1])) 72 | if show: 73 | show_img_to_caption(scene_dir, int(info['idx_to_caption'])) 74 | 75 | # save info to json 76 | with open(os.path.join(scene_dir, '%s.json' % args.mats_save_name), 'w') as f: 77 | json.dump(info, f, indent=4) 78 | 79 | return info 80 | 81 | 82 | def predict_object_info_gpt4v(args, scene_dir, show=False): 83 | """(EXPERIMENTAL) Predict materials directly from image with GPT-4V.""" 84 | img_dir = os.path.join(scene_dir, 'images') 85 | imgs, masks = load_images(img_dir, return_masks=True) 86 | mask_areas = [np.mean(mask) for mask in masks] 87 | 88 | idx_to_caption = np.argsort(mask_areas)[int(len(mask_areas) * args.mask_area_percentile)] 89 | img_to_caption = imgs[idx_to_caption] 90 | 91 | # save img_to_caption in img_dir 92 | img_to_caption = Image.fromarray(img_to_caption) 93 | img_path = os.path.join(scene_dir, 'img_to_caption.png') 94 | img_to_caption.save(img_path) 95 | 96 | gpt_fn = lambda seed: gpt4v_candidate_materials(img_path, property_name=args.property_name, seed=seed) 97 | candidate_materials = gpt_wrapper(gpt_fn, parse_material_json) 98 | 99 | info = {'idx_to_caption': str(idx_to_caption), 100 | 'candidate_materials_%s' % args.property_name: candidate_materials} 101 | 102 | print('-' * 50) 103 | print('scene: %s, info:' % os.path.basename(scene_dir), info) 104 | print('candidate materials (%s):' % args.property_name) 105 | mat_names, mat_vals = parse_material_list(candidate_materials) 106 | for mat_i, mat_name in enumerate(mat_names): 107 | print('%16s: %8.1f -%8.1f' % (mat_name, mat_vals[mat_i][0], mat_vals[mat_i][1])) 108 | if show: 109 | show_img_to_caption(scene_dir, int(info['idx_to_caption'])) 110 | 111 | # save info to json 112 | with 
open(os.path.join(scene_dir, '%s.json' % args.mats_save_name), 'w') as f: 113 | json.dump(info, f, indent=4) 114 | 115 | return info 116 | 117 | 118 | def predict_thickness(args, scene_dir, mode='list', show=False): 119 | # load info 120 | with open(os.path.join(scene_dir, '%s.json' % args.mats_save_name), 'r') as f: 121 | info = json.load(f) 122 | 123 | if mode == 'list': 124 | caption = info['caption'] 125 | elif mode == 'json': # json contains caption inside 126 | caption = None 127 | else: 128 | raise NotImplementedError 129 | candidate_materials = info['candidate_materials_density'] 130 | 131 | gpt_fn = lambda seed: gpt_thickness(caption, candidate_materials, 132 | model_name=args.gpt_model_name, mode=mode, seed=seed) 133 | thickness = gpt_wrapper(gpt_fn, parse_material_list) 134 | 135 | info['thickness'] = thickness 136 | 137 | print('thickness (cm):') 138 | mat_names, mat_vals = parse_material_list(thickness) 139 | for mat_i, mat_name in enumerate(mat_names): 140 | print('%16s: %8.1f -%8.1f' % (mat_name, mat_vals[mat_i][0], mat_vals[mat_i][1])) 141 | if show: 142 | show_img_to_caption(scene_dir, int(info['idx_to_caption'])) 143 | 144 | # save info to json 145 | with open(os.path.join(scene_dir, '%s.json' % args.mats_save_name), 'w') as f: 146 | json.dump(info, f, indent=4) 147 | 148 | return info 149 | 150 | 151 | if __name__ == '__main__': 152 | 153 | args = get_args() 154 | 155 | scenes_dir = os.path.join(args.data_dir, 'scenes') 156 | scenes = get_scenes_list(args) 157 | 158 | openai.api_key = OPENAI_API_KEY 159 | 160 | for j, scene in enumerate(scenes): 161 | mats_info = predict_candidate_materials(args, os.path.join(scenes_dir, scene)) 162 | if args.include_thickness: 163 | mats_info = predict_thickness(args, os.path.join(scenes_dir, scene)) 164 | -------------------------------------------------------------------------------- /ns_reconstruction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import shutil 4 | from utils import get_last_file_in_folder, get_scenes_list 5 | from arguments import get_args 6 | 7 | 8 | def move_files_to_folder(source_dir, target_dir): 9 | for file in os.listdir(source_dir): 10 | shutil.move(os.path.join(source_dir, file), os.path.join(target_dir, file)) 11 | 12 | 13 | if __name__ == '__main__': 14 | 15 | args = get_args() 16 | 17 | scenes_dir = os.path.join(args.data_dir, 'scenes') 18 | scenes = get_scenes_list(args) 19 | 20 | for scene in scenes: 21 | base_dir = os.path.join(scenes_dir, scene, 'ns') 22 | 23 | # Calling ns-train 24 | result = subprocess.run([ 25 | 'ns-train', 'nerfacto', 26 | '--data', os.path.join(scenes_dir, scene), 27 | '--output_dir', base_dir, 28 | '--vis', args.vis_mode, 29 | '--project_name', args.project_name, 30 | '--experiment_name', scene, 31 | '--max_num_iterations', str(args.training_iters), 32 | '--pipeline.model.background-color', 'random', 33 | '--pipeline.datamanager.camera-optimizer.mode', 'off', 34 | '--pipeline.model.proposal-initial-sampler', 'uniform', 35 | '--pipeline.model.near-plane', str(args.near_plane), 36 | '--pipeline.model.far-plane', str(args.far_plane), 37 | '--steps-per-eval-image', '10000', 38 | ]) 39 | 40 | ns_dir = get_last_file_in_folder(os.path.join(base_dir, '%s/nerfacto' % scene)) 41 | 42 | # Copying dataparser_transforms (contains scale) 43 | result = subprocess.run([ 44 | 'scp', '-r', 45 | os.path.join(ns_dir, 'dataparser_transforms.json'), 46 | os.path.join(base_dir, 'dataparser_transforms.json') 47 | ]) 48 | 49 | 
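# ns-export below keeps only points inside an axis-aligned cube of side --bbox_size centered at
# the origin of the Nerfstudio-normalized scene, which crops the object out of the background;
# half_bbox_size gives the cube's min/max corners passed on the command line.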
half_bbox_size = args.bbox_size / 2 50 | 51 | # Calling ns-export pcd 52 | result = subprocess.run([ 53 | 'ns-export', 'pointcloud', 54 | '--load-config', os.path.join(ns_dir, 'config.yml'), 55 | '--output-dir', base_dir, 56 | '--num-points', str(args.num_points), 57 | '--remove-outliers', 'True', 58 | '--normal-method', 'open3d', 59 | '--use-bounding-box', 'True', 60 | '--bounding-box-min', str(-half_bbox_size), str(-half_bbox_size), str(-half_bbox_size), 61 | '--bounding-box-max', str(half_bbox_size), str(half_bbox_size), str(half_bbox_size), 62 | ]) 63 | 64 | # Calling ns-render 65 | result = subprocess.run([ 66 | 'ns-render', 'dataset', 67 | '--load-config', os.path.join(ns_dir, 'config.yml'), 68 | '--output-path', os.path.join(base_dir, 'renders'), 69 | '--rendered-output-names', 'raw-depth', 70 | '--split', 'train+test', 71 | ]) 72 | 73 | # Collect all depths in one folder 74 | os.makedirs(os.path.join(base_dir, 'renders', 'depth'), exist_ok=True) 75 | move_files_to_folder(os.path.join(base_dir, 'renders', 'test', 'raw-depth'), os.path.join(base_dir, 'renders', 'depth')) 76 | move_files_to_folder(os.path.join(base_dir, 'renders', 'train', 'raw-depth'), os.path.join(base_dir, 'renders', 'depth')) 77 | 78 | -------------------------------------------------------------------------------- /predict_property.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import open_clip 5 | import numpy as np 6 | import open3d as o3d 7 | import matplotlib as mpl 8 | 9 | from feature_fusion import CLIP_BACKBONE, CLIP_CHECKPOINT 10 | from gpt_inference import parse_material_list, parse_material_hardness 11 | from carving import get_carved_pts 12 | from utils import load_ns_point_cloud, parse_dataparser_transforms_json, get_last_file_in_folder, get_scenes_list 13 | from arguments import get_args 14 | 15 | 16 | @torch.no_grad() 17 | def get_text_features(texts, clip_model, clip_tokenizer, prefix='', suffix='', device='cuda'): 18 | """Get CLIP text features, optionally with a fixed prefix and suffix.""" 19 | extended_texts = [prefix + text + suffix for text in texts] 20 | tokenized = clip_tokenizer(extended_texts).to(device) 21 | 22 | with torch.no_grad(), torch.cuda.amp.autocast(): 23 | text_features = clip_model.encode_text(tokenized) 24 | text_features = text_features / text_features.norm(dim=1, keepdim=True) 25 | 26 | return text_features 27 | 28 | 29 | @torch.no_grad() 30 | def get_agg_patch_features(patch_features, is_visible): 31 | """Get aggregated patch features by averaging over visible patches.""" 32 | n_visible = is_visible.sum(0) 33 | is_valid = n_visible > 0 34 | 35 | visible_patch_features = patch_features * is_visible.unsqueeze(-1) 36 | avg_visible_patch_features = visible_patch_features.sum(0) / n_visible.unsqueeze(-1) 37 | avg_visible_patch_features = avg_visible_patch_features / avg_visible_patch_features.norm(dim=1, keepdim=True) 38 | return avg_visible_patch_features[is_valid], is_valid 39 | 40 | 41 | @torch.no_grad() 42 | def get_interpolated_values(source_pts, source_vals, inner_pts, batch_size=2048, k=1): 43 | """Interpolate values by k nearest neighbor.""" 44 | n_inner = len(inner_pts) 45 | inner_vals = torch.zeros(n_inner, source_vals.shape[1], device=inner_pts.device) 46 | for batch_start in range(0, n_inner, batch_size): 47 | curr_batch_size = min(batch_size, n_inner - batch_start) 48 | curr_inner_pts = inner_pts[batch_start:batch_start + curr_batch_size] 49 | 50 | dists = 
torch.cdist(curr_inner_pts, source_pts) 51 | _, idxs = torch.topk(dists, k=k, dim=1, largest=False) 52 | curr_inner_vals = source_vals[idxs].mean(1) 53 | 54 | inner_vals[batch_start:batch_start + curr_batch_size] = curr_inner_vals 55 | return inner_vals 56 | 57 | 58 | @torch.no_grad() 59 | def predict_physical_property_integral(args, scene_dir, clip_model, clip_tokenizer): 60 | """Predict the volume integral of a physical property (e.g. for mass). Returns a [low, high] range.""" 61 | 62 | scene_name = os.path.basename(scene_dir) 63 | pcd_file = os.path.join(scene_dir, 'ns', 'point_cloud.ply') 64 | dt_file = os.path.join(scene_dir, 'ns', 'dataparser_transforms.json') 65 | info_file = os.path.join(scene_dir, '%s.json' % args.mats_load_name) 66 | 67 | with open(info_file, 'r') as f: 68 | info = json.load(f) 69 | 70 | # loading source point info 71 | with open(os.path.join(scene_dir, 'features', 'voxel_size_%s.json' % args.feature_load_name), 'r') as f: 72 | feature_voxel_size = json.load(f)['voxel_size'] 73 | pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=feature_voxel_size) 74 | source_pts = torch.Tensor(pts).to(args.device) 75 | patch_features = torch.load(os.path.join(scene_dir, 'features', 'patch_features_%s.pt' % args.feature_load_name)) 76 | is_visible = torch.load(os.path.join(scene_dir, 'features', 'is_visible_%s.pt' % args.feature_load_name)) 77 | 78 | # preparing material info 79 | mat_val_list = info['candidate_materials_%s' % args.property_name] 80 | mat_names, mat_vals = parse_material_list(mat_val_list) 81 | mat_vals = torch.Tensor(mat_vals).to(args.device) 82 | mat_tn_list = info['thickness'] 83 | mat_names, mat_tns = parse_material_list(mat_tn_list) 84 | mat_tns = torch.Tensor(mat_tns).to(args.device) / 100 # cm to m 85 | 86 | # predictions on source points 87 | text_features = get_text_features(mat_names, clip_model, clip_tokenizer, device=args.device) 88 | agg_patch_features, is_valid = get_agg_patch_features(patch_features, is_visible) 89 | source_pts = source_pts[is_valid] 90 | 91 | similarities = agg_patch_features @ text_features.T 92 | 93 | source_pred_probs = torch.softmax(similarities / args.temperature, dim=1) 94 | source_pred_mat_idxs = similarities.argmax(1) 95 | source_pred_vals = source_pred_probs @ mat_vals 96 | 97 | # volume integration 98 | ns_transform, scale = parse_dataparser_transforms_json(dt_file) 99 | surface_cell_size = args.sample_voxel_size / scale 100 | mat_cell_volumes = surface_cell_size**2 * mat_tns 101 | mat_cell_products = mat_vals * mat_cell_volumes 102 | 103 | if args.volume_method == 'thickness': 104 | dense_pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=args.sample_voxel_size) 105 | dense_pts = torch.Tensor(dense_pts).to(args.device) 106 | 107 | dense_pred_probs = get_interpolated_values(source_pts, source_pred_probs, dense_pts, batch_size=2048, k=1) 108 | dense_pred_products = dense_pred_probs @ mat_cell_products 109 | total_pred_val = (dense_pred_products).sum(0) 110 | 111 | carved, grid_cell_size = get_carved_pts(scene_dir, dist_thr_ns=0.05) 112 | bound_volume = grid_cell_size ** 3 * len(carved) 113 | total_volume = (dense_pred_probs @ mat_cell_volumes).max(1)[0].sum(0) 114 | if total_volume > bound_volume: 115 | total_pred_val *= bound_volume / total_volume 116 | total_pred_val *= args.correction_factor 117 | 118 | elif args.volume_method == 'carving': 119 | carved, grid_cell_size = get_carved_pts(scene_dir) 120 | carved_pred_probs = get_interpolated_values(source_pts, source_pred_probs, carved, batch_size=2048, k=1) 121 | 
carved_pred_vals = carved_pred_probs @ mat_vals 122 | grid_cell_volume = grid_cell_size ** 3 123 | total_pred_val = carved_pred_vals.sum(0) * grid_cell_volume * args.correction_factor 124 | 125 | dense_pts = carved 126 | dense_pred_probs = carved_pred_probs 127 | 128 | 129 | else: 130 | raise NotImplementedError 131 | 132 | print('-' * 50) 133 | print('scene:', scene_name) 134 | print('-' * 50) 135 | print('num. dense points:', len(dense_pts)) 136 | print('caption:', info['caption']) 137 | print('candidate materials:') 138 | for mat_i, mat_name in enumerate(mat_names): 139 | print('%16s: %8.1f -%8.1f kg/m^3, %5.1f -%5.1f cm' % (mat_name, mat_vals[mat_i][0], mat_vals[mat_i][1], 140 | mat_tns[mat_i][0] * 100, mat_tns[mat_i][1] * 100)) 141 | 142 | print('surface cell size: %.4f cm' % (surface_cell_size * 100)) 143 | print('predicted total mass: [%.4f - %.4f kg]' % (total_pred_val[0], total_pred_val[1])) 144 | 145 | if args.show_mat_seg: 146 | # Visualize material segmentation in open3d 147 | cmap = mpl.colormaps['tab10'] 148 | mat_colors = [cmap(i)[:3] for i in range(len(mat_names))] 149 | dense_pred_colors = np.array([mat_colors[i] for i in dense_pred_probs.argmax(1)]) 150 | 151 | pcd = o3d.geometry.PointCloud() 152 | pcd.points = o3d.utility.Vector3dVector(dense_pts.cpu().numpy()) 153 | pcd.colors = o3d.utility.Vector3dVector(dense_pred_colors) 154 | o3d.visualization.draw_geometries([pcd]) 155 | 156 | return total_pred_val.tolist() 157 | 158 | 159 | @torch.no_grad() 160 | def predict_physical_property_query(args, query_pts, scene_dir, clip_model, clip_tokenizer, return_all=False): 161 | """ 162 | Predict a physical property at given array of 3D query points. query_pts can be set to 'grid' 163 | instead to automatically generate a grid of query points from source points. If return_all=True, 164 | returns various intermediate results. Otherwise, returns [low, high] range for each query point. 
165 | """ 166 | 167 | scene_name = os.path.basename(scene_dir) 168 | pcd_file = os.path.join(scene_dir, 'ns', 'point_cloud.ply') 169 | dt_file = os.path.join(scene_dir, 'ns', 'dataparser_transforms.json') 170 | info_file = os.path.join(scene_dir, '%s.json' % args.mats_load_name) 171 | 172 | with open(info_file, 'r') as f: 173 | info = json.load(f) 174 | 175 | # loading source point info 176 | with open(os.path.join(scene_dir, 'features', 'voxel_size_%s.json' % args.feature_load_name), 'r') as f: 177 | feature_voxel_size = json.load(f)['voxel_size'] 178 | pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=feature_voxel_size) 179 | source_pts = torch.Tensor(pts).to(args.device) 180 | patch_features = torch.load(os.path.join(scene_dir, 'features', 'patch_features_%s.pt' % args.feature_load_name)) 181 | is_visible = torch.load(os.path.join(scene_dir, 'features', 'is_visible_%s.pt' % args.feature_load_name)) 182 | 183 | # preparing material info 184 | mat_val_list = info['candidate_materials_%s' % args.property_name] 185 | if args.property_name == 'hardness': 186 | mat_names, mat_vals = parse_material_hardness(mat_val_list) 187 | else: 188 | mat_names, mat_vals = parse_material_list(mat_val_list) 189 | mat_vals = torch.Tensor(mat_vals).to(args.device) 190 | 191 | # predictions on source points 192 | text_features = get_text_features(mat_names, clip_model, clip_tokenizer, device=args.device) 193 | agg_patch_features, is_valid = get_agg_patch_features(patch_features, is_visible) 194 | source_pts = source_pts[is_valid] 195 | 196 | similarities = agg_patch_features @ text_features.T 197 | 198 | source_pred_probs = torch.softmax(similarities / args.temperature, dim=1) 199 | source_pred_mat_idxs = similarities.argmax(1) 200 | source_pred_vals = source_pred_probs @ mat_vals 201 | 202 | if query_pts == 'grid': 203 | query_pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=args.sample_voxel_size) 204 | query_pts = torch.Tensor(query_pts).to(args.device) 205 | query_pred_probs = get_interpolated_values(source_pts, source_pred_probs, query_pts, batch_size=2048, k=1) 206 | query_pred_vals = query_pred_probs @ mat_vals 207 | 208 | print('-' * 50) 209 | print('scene:', scene_name) 210 | print('-' * 50) 211 | print('num. 
query points:', len(query_pts)) 212 | print('caption:', info['caption']) 213 | print('candidate materials (%s):' % args.property_name) 214 | for mat_i, mat_name in enumerate(mat_names): 215 | print('%16s: %8.1f -%8.1f' % (mat_name, mat_vals[mat_i][0], mat_vals[mat_i][1])) 216 | 217 | if args.show_mat_seg: 218 | # Visualize material segmentation in open3d 219 | cmap = mpl.colormaps['tab10'] 220 | mat_colors = [cmap(i)[:3] for i in range(len(mat_names))] 221 | query_pred_colors = np.array([mat_colors[i] for i in query_pred_probs.argmax(1)]) 222 | 223 | pcd = o3d.geometry.PointCloud() 224 | pcd.points = o3d.utility.Vector3dVector(query_pts.cpu().numpy()) 225 | pcd.colors = o3d.utility.Vector3dVector(query_pred_colors) 226 | o3d.visualization.draw_geometries([pcd]) 227 | 228 | if return_all: 229 | query_features = get_interpolated_values(source_pts, agg_patch_features, query_pts, batch_size=2048, k=1) 230 | query_similarities = get_interpolated_values(source_pts, similarities, query_pts, batch_size=2048, k=1) 231 | return { 232 | 'query_pred_probs': query_pred_probs.cpu().numpy(), 233 | 'query_pred_vals': query_pred_vals.cpu().numpy(), 234 | 'query_features': query_features.cpu().numpy(), 235 | 'query_similarities': query_similarities.cpu().numpy(), 236 | 'source_pts': source_pts.cpu().numpy(), 237 | 'mat_names': mat_names, 238 | } 239 | return query_pred_vals.cpu().numpy() 240 | 241 | 242 | if __name__ == '__main__': 243 | 244 | args = get_args() 245 | 246 | scenes_dir = os.path.join(args.data_dir, 'scenes') 247 | scenes = get_scenes_list(args) 248 | 249 | clip_model, _, preprocess = open_clip.create_model_and_transforms(CLIP_BACKBONE, pretrained=CLIP_CHECKPOINT) 250 | clip_model.to(args.device) 251 | clip_tokenizer = open_clip.get_tokenizer(CLIP_BACKBONE) 252 | 253 | preds = {} 254 | for j, scene in enumerate(scenes): 255 | scene_dir = os.path.join(scenes_dir, scene) 256 | if args.prediction_mode == 'integral': 257 | pred = predict_physical_property_integral(args, scene_dir, clip_model, clip_tokenizer) 258 | elif args.prediction_mode == 'grid': 259 | pred = predict_physical_property_query(args, 'grid', scene_dir, clip_model, clip_tokenizer) 260 | else: # use predict_physical_property_query() to query points however you want! 
261 | raise NotImplementedError 262 | preds[scene] = pred 263 | 264 | if args.prediction_mode == 'integral' and args.save_preds: 265 | os.makedirs('preds', exist_ok=True) 266 | with open(os.path.join('preds', 'preds_%s.json' % args.preds_save_name), 'w') as f: 267 | json.dump(preds, f, indent=4) 268 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pillow 3 | matplotlib 4 | transformers 5 | open_clip_torch 6 | open3d 7 | openai==0.28 -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import numpy as np 4 | import json 5 | import open3d as o3d 6 | import gzip 7 | from PIL import Image 8 | 9 | 10 | def project_3d_to_2d(pts, w2c, K, return_dists=False): 11 | """Project 3D points to 2D (nerfstudio format).""" 12 | pts = np.array(pts) 13 | K = np.hstack([K, np.zeros((3, 1))]) 14 | pts = np.concatenate([pts, np.ones((pts.shape[0], 1))], axis=1) 15 | pts = np.dot(pts, w2c.T) 16 | pts[:, [1, 2]] *= -1 17 | if return_dists: 18 | dists = np.linalg.norm(pts[:, :3], axis=-1) 19 | pts = np.dot(pts, K.T) 20 | pts_2d = pts[:, :2] / pts[:, 2:] 21 | if return_dists: 22 | return pts_2d, dists 23 | return pts_2d 24 | 25 | 26 | def parse_transforms_json(t_file, return_w2c=False, different_Ks=False): 27 | with open(t_file, 'rb') as f: 28 | transforms = json.load(f) 29 | 30 | if different_Ks: 31 | Ks = [] 32 | for i in range(len(transforms['frames'])): 33 | K = np.array([ 34 | [transforms['frames'][i]['fl_x'], 0, transforms['frames'][i]['cx']], 35 | [0, transforms['frames'][i]['fl_y'], transforms['frames'][i]['cy']], 36 | [0, 0, 1], 37 | ]) 38 | Ks.append(K) 39 | K = Ks 40 | else: 41 | K = np.array([ 42 | [transforms['fl_x'], 0, transforms['cx']], 43 | [0, transforms['fl_y'], transforms['cy']], 44 | [0, 0, 1], 45 | ]) 46 | 47 | n_frames = len(transforms['frames']) 48 | c2ws = [np.array(transforms['frames'][i]['transform_matrix']) for i in range(n_frames)] 49 | if return_w2c: 50 | w2cs = [np.linalg.inv(c2w) for c2w in c2ws] 51 | return w2cs, K 52 | return c2ws, K 53 | 54 | 55 | def parse_dataparser_transforms_json(dt_file): 56 | with open(dt_file, "r") as fr: 57 | dataparser_transforms = json.load(fr) 58 | 59 | ns_transform = np.asarray(dataparser_transforms["transform"]) 60 | scale = dataparser_transforms["scale"] 61 | return ns_transform, scale 62 | 63 | 64 | def load_ns_point_cloud(pcd_file, dt_file, ds_size=0.01, viz=False): 65 | pcd = o3d.io.read_point_cloud(pcd_file) 66 | if ds_size is not None: 67 | pcd = pcd.voxel_down_sample(ds_size) 68 | 69 | ns_transform, scale = parse_dataparser_transforms_json(dt_file) 70 | ns_transform = np.concatenate([ns_transform, np.array([[0, 0, 0, 1/scale]])], 0) 71 | inv_ns_transform = np.linalg.inv(ns_transform) 72 | 73 | # use open3d to scale and transform 74 | pcd.transform(inv_ns_transform) 75 | 76 | pts = np.asarray(pcd.points) 77 | 78 | if viz: 79 | cf = o3d.geometry.TriangleMesh.create_coordinate_frame(size=1.0, origin=[0, 0, 0]) 80 | o3d.visualization.draw_geometries([cf, pcd]) 81 | return pts 82 | 83 | 84 | def load_images(img_dir, bg_change=255, return_masks=False): 85 | img_files = os.listdir(img_dir) 86 | img_files.sort() 87 | imgs = [] 88 | masks = [] 89 | for img_file in img_files: 90 | # load RGBA image 91 | img = np.array(Image.open(os.path.join(img_dir, img_file))) 92 | if 
return_masks or bg_change is not None: 93 | mask = img[:, :, 3] > 0 94 | if bg_change is not None: 95 | img[~mask] = bg_change 96 | masks.append(mask) 97 | imgs.append(img[:, :, :3]) 98 | 99 | if return_masks: 100 | return imgs, masks 101 | return imgs 102 | 103 | 104 | def load_depths(depth_dir, Ks): 105 | depth_files = os.listdir(depth_dir) 106 | depth_files.sort() 107 | depths = [] 108 | for i, depth_file in enumerate(depth_files): 109 | # load npy.gz depth file 110 | with gzip.open(os.path.join(depth_dir, depth_file), 'rb') as f: 111 | dist = np.load(f)[:, :, 0] 112 | if Ks is not None: 113 | depth = distance_to_depth(dist, Ks[i]) 114 | else: 115 | depth = dist 116 | depths.append(depth) 117 | return depths 118 | 119 | 120 | def depth_to_distance(depth, K): 121 | """Convert depth map to distance from camera.""" 122 | h, w = depth.shape 123 | x, y = np.meshgrid(np.arange(w), np.arange(h)) 124 | x = x.flatten() 125 | y = y.flatten() 126 | depth = depth.flatten() 127 | pts = np.stack([x, y, np.ones_like(x)], axis=1) 128 | pts = np.dot(pts, np.linalg.inv(K).T) 129 | pts *= depth[:, None] 130 | dists = np.linalg.norm(pts, axis=1) 131 | dists = dists.reshape(h, w) 132 | return dists 133 | 134 | 135 | def distance_to_depth(dists, K): 136 | """Convert distance map to depth map.""" 137 | h, w = dists.shape 138 | x, y = np.meshgrid(np.arange(w), np.arange(h)) 139 | x = x.flatten() 140 | y = y.flatten() 141 | pts = np.stack([x, y, np.ones_like(x)], axis=1) 142 | pts = np.dot(pts, np.linalg.inv(K).T) 143 | divisor = np.linalg.norm(pts, axis=1) 144 | divisor = divisor.reshape(h, w) 145 | depth = dists / divisor 146 | return depth 147 | 148 | 149 | def get_last_file_in_folder(folder): 150 | files = os.listdir(folder) 151 | return os.path.join(folder, sorted(files, reverse=True)[0]) 152 | 153 | 154 | def get_scenes_list(args): 155 | if args.split != 'all': 156 | with open(os.path.join(args.data_dir, 'splits.json'), 'r') as f: 157 | splits = json.load(f) 158 | if args.split == 'train+val': 159 | scenes = splits['train'] + splits['val'] 160 | else: 161 | scenes = splits[args.split] 162 | else: 163 | scenes = sorted(os.listdir(os.path.join(args.data_dir, 'scenes'))) 164 | 165 | if args.end_idx != -1: 166 | scenes = scenes[args.start_idx:args.end_idx] 167 | else: 168 | scenes = scenes[args.start_idx:] 169 | return scenes 170 | 171 | 172 | def unproject_point(pt_2d, depth, c2w, K): 173 | """Unproject a single point from 2D to 3D (nerfstudio format).""" 174 | cx = K[0, 2] 175 | cy = K[1, 2] 176 | fx = K[0, 0] 177 | fy = K[1, 1] 178 | x = (pt_2d[0] - cx) / fx 179 | y = (pt_2d[1] - cy) / fy 180 | pt_3d = np.array([x, -y, -1]) 181 | pt_3d *= depth[pt_2d[1], pt_2d[0]] 182 | pt_3d = np.concatenate([pt_3d, np.ones((1,))], axis=0) 183 | pt_3d = np.dot(c2w, pt_3d) 184 | pt_3d = pt_3d[:3] 185 | return pt_3d 186 | -------------------------------------------------------------------------------- /visualization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import open3d as o3d 3 | import os 4 | import torch 5 | import open_clip 6 | import matplotlib as mpl 7 | import matplotlib.pyplot as plt 8 | import matplotlib.patches as mpatches 9 | from PIL import Image 10 | from sklearn.decomposition import PCA 11 | 12 | from feature_fusion import CLIP_BACKBONE, CLIP_CHECKPOINT 13 | from predict_property import predict_physical_property_query 14 | from utils import parse_transforms_json, load_ns_point_cloud, load_images 15 | from arguments import get_args 16 | 17 | 
18 | def features_to_colors(features): 19 | """Convert feature vectors to RGB colors using PCA.""" 20 | pca = PCA(n_components=3) 21 | pca.fit(features) 22 | transformed = pca.transform(features) 23 | q1, q99 = np.percentile(transformed, [1, 99]) 24 | feature_pca_postprocess_sub = q1 25 | feature_pca_postprocess_div = (q99 - q1) 26 | transformed = (transformed - feature_pca_postprocess_sub) / feature_pca_postprocess_div 27 | colors = np.clip(transformed, 0, 1) 28 | return colors 29 | 30 | 31 | def similarities_to_colors(similarities, temperature=None): 32 | """Convert CLIP similarity values to RGB colors.""" 33 | cmap = mpl.colormaps['tab10'] 34 | mat_colors = [cmap(i)[:3] for i in range(similarities.shape[1])] 35 | if temperature is None: 36 | argmax_similarities = np.argmax(similarities, axis=1) 37 | colors = np.array([mat_colors[i] for i in argmax_similarities]) 38 | else: 39 | softmax_probs = torch.softmax(torch.tensor(similarities) / temperature, dim=1) 40 | colors = softmax_probs @ torch.tensor(mat_colors).float() 41 | colors = colors.numpy() 42 | return colors 43 | 44 | 45 | def values_to_colors(values, low, high): 46 | """Convert scalar values to RGB colors.""" 47 | cmap = mpl.colormaps['inferno'] 48 | colors = cmap((values - low) / (high - low)) 49 | return colors[:, :3] 50 | 51 | 52 | def render_pcd(pcd, w2c, K, hw=(1024, 1024), pt_size=8, savefile=None, show=False): 53 | h, w = hw 54 | 55 | # set pinhole camera parameters from K 56 | render_camera = o3d.camera.PinholeCameraParameters() 57 | render_camera.extrinsic = w2c 58 | 59 | intrinsic = o3d.camera.PinholeCameraIntrinsic() 60 | intrinsic.set_intrinsics(h, w, K[0, 0], K[1, 1], K[0, 2], K[1, 2]) 61 | render_camera.intrinsic = intrinsic 62 | 63 | # visualize pcd from camera view with intrinsics set to K 64 | vis = o3d.visualization.Visualizer() 65 | vis.create_window(width=w, height=h, visible=show) 66 | 67 | vis.add_geometry(pcd) 68 | ctr = vis.get_view_control() 69 | ctr.convert_from_pinhole_camera_parameters(render_camera, allow_arbitrary=True) 70 | 71 | # rendering options 72 | render_option = vis.get_render_option() 73 | render_option.point_size = pt_size 74 | render_option.point_show_normal = False 75 | render_option.light_on = False 76 | vis.update_renderer() 77 | 78 | if show: 79 | vis.run() 80 | 81 | if savefile is not None: 82 | vis.capture_screen_image(savefile, do_render=True) 83 | vis.destroy_window() 84 | return Image.open(savefile) 85 | else: 86 | render = vis.capture_screen_float_buffer(do_render=True) 87 | vis.destroy_window() 88 | return np.array(render) 89 | 90 | 91 | def composite_and_save(img1, img2, alpha, savefile): 92 | img1 = img1.astype(np.float32) 93 | img2 = img2.astype(np.float32) 94 | img = img1 * alpha + img2 * (1 - alpha) 95 | img = (img * 255).astype(np.uint8) 96 | Image.fromarray(img).save(savefile) 97 | return img 98 | 99 | 100 | def make_legend(colors, names, ncol=1, figsize=(2.0, 2.5), savefile=None, show=False): 101 | plt.style.use('fast') 102 | plt.rcParams["font.family"] = "Times New Roman" 103 | fig = plt.figure(figsize=figsize) 104 | fig.patch.set_facecolor('white') 105 | plt.axis('off') 106 | 107 | # creating legend with color boxes 108 | ptchs = [] 109 | for color, name in zip(colors, names): 110 | if len(name) > 10: # wrap long names 111 | name = name.replace(' ', '\n') 112 | ptchs.append(mpatches.Patch(color=color[:3], label=name)) 113 | leg = plt.legend(handles=ptchs, ncol=ncol, loc='center left', prop={'size': 18}, 114 | handlelength=1, handleheight=1, facecolor='white', 
framealpha=0) 115 | plt.tight_layout() 116 | 117 | if show: 118 | plt.show() 119 | if savefile is not None: 120 | plt.savefig(savefile, dpi=400) 121 | plt.close() 122 | 123 | 124 | if __name__ == '__main__': 125 | 126 | args = get_args() 127 | 128 | scenes_dir = os.path.join(args.data_dir, 'scenes') 129 | 130 | clip_model, _, preprocess = open_clip.create_model_and_transforms(CLIP_BACKBONE, pretrained=CLIP_CHECKPOINT) 131 | clip_model.to(args.device) 132 | clip_tokenizer = open_clip.get_tokenizer(CLIP_BACKBONE) 133 | 134 | scene_dir = os.path.join(scenes_dir, args.scene_name) 135 | t_file = os.path.join(scene_dir, 'transforms.json') 136 | pcd_file = os.path.join(scene_dir, 'ns', 'point_cloud.ply') 137 | dt_file = os.path.join(scene_dir, 'ns', 'dataparser_transforms.json') 138 | 139 | query_pts = load_ns_point_cloud(pcd_file, dt_file, ds_size=None) 140 | query_pts = torch.Tensor(query_pts).to(args.device) 141 | 142 | result = predict_physical_property_query(args, query_pts, scene_dir, clip_model, clip_tokenizer, 143 | return_all=True) 144 | 145 | out_dir = os.path.join('viz', args.viz_save_name) 146 | os.makedirs(out_dir, exist_ok=True) 147 | 148 | # legend for materials 149 | mat_names = result['mat_names'] 150 | cmap_tab10 = mpl.colormaps['tab10'] 151 | make_legend([cmap_tab10(i) for i in range(len(mat_names))], mat_names, 152 | savefile=os.path.join(out_dir, '%s_legend.png' % args.viz_save_name), show=args.show) 153 | 154 | # camera for rendering 155 | w2cs, K = parse_transforms_json(t_file, return_w2c=True) 156 | view_idx = 0 157 | w2c = w2cs[view_idx] 158 | w2c[[1, 2]] *= -1 # convert from nerfstudio to open3d format 159 | imgs = load_images(os.path.join(scene_dir, 'images')) 160 | orig_img = imgs[view_idx] / 255. 161 | 162 | # RGB reconstruction 163 | rgb_pcd = o3d.io.read_point_cloud(pcd_file) 164 | rgb_pcd.points = o3d.utility.Vector3dVector(query_pts.cpu().numpy()) 165 | render = render_pcd(rgb_pcd, w2c, K, show=args.show) 166 | if not args.show: 167 | Image.fromarray(imgs[view_idx]).save(os.path.join(out_dir, '%s_rgb.png' % args.viz_save_name)) 168 | 169 | # features PCA 170 | pca_pcd = o3d.geometry.PointCloud() 171 | pca_pcd.points = o3d.utility.Vector3dVector(query_pts.cpu().numpy()) 172 | colors_pca = features_to_colors(result['query_features']) 173 | pca_pcd.colors = o3d.utility.Vector3dVector(colors_pca) 174 | render = render_pcd(pca_pcd, w2c, K, show=args.show) 175 | if not args.show: 176 | combined = composite_and_save(orig_img, render, args.compositing_alpha, 177 | savefile=os.path.join(out_dir, '%s_pca.png' % args.viz_save_name)) 178 | 179 | # material segmentation 180 | seg_pcd = o3d.geometry.PointCloud() 181 | seg_pcd.points = o3d.utility.Vector3dVector(query_pts.cpu().numpy()) 182 | colors_seg = similarities_to_colors(result['query_similarities']) 183 | seg_pcd.colors = o3d.utility.Vector3dVector(colors_seg) 184 | render = render_pcd(seg_pcd, w2c, K, show=args.show) 185 | if not args.show: 186 | combined = composite_and_save(orig_img, render, args.compositing_alpha, 187 | savefile=os.path.join(out_dir, '%s_seg.png' % args.viz_save_name)) 188 | 189 | # physical property values 190 | val_pcd = o3d.geometry.PointCloud() 191 | val_pcd.points = o3d.utility.Vector3dVector(query_pts.cpu().numpy()) 192 | colors_val = values_to_colors(np.mean(result['query_pred_vals'], axis=1), args.cmap_min, args.cmap_max) 193 | val_pcd.colors = o3d.utility.Vector3dVector(colors_val) 194 | render = render_pcd(val_pcd, w2c, K, show=args.show) 195 | if not args.show: 196 | combined = 
composite_and_save(orig_img, render, args.compositing_alpha, 197 | savefile=os.path.join(out_dir, '%s_%s.png' % (args.viz_save_name, args.property_name))) 198 | 199 | --------------------------------------------------------------------------------
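The final `else` branch of predict_property.py's `__main__` block notes that predict_physical_property_query() can be called directly with arbitrary query points, the same way visualization.py calls it. Below is a minimal sketch of that usage; `example_scene` and the random query points are placeholders, and the scene is assumed to already contain the nerfstudio point cloud, fused CLIP features, and candidate-materials JSON produced by the earlier pipeline stages (ns_reconstruction.py, feature_fusion.py, material_proposal.py).

    import os
    import torch
    import open_clip

    from arguments import get_args
    from feature_fusion import CLIP_BACKBONE, CLIP_CHECKPOINT
    from predict_property import predict_physical_property_query

    args = get_args()  # same CLI arguments as predict_property.py

    # CLIP model for zero-shot material recognition, set up as in predict_property.py's __main__
    clip_model, _, _ = open_clip.create_model_and_transforms(CLIP_BACKBONE, pretrained=CLIP_CHECKPOINT)
    clip_model.to(args.device)
    clip_tokenizer = open_clip.get_tokenizer(CLIP_BACKBONE)

    # Placeholder scene; it must already hold ns/point_cloud.ply, ns/dataparser_transforms.json,
    # the features/ folder, and the '<mats_load_name>.json' file that the function loads.
    scene_dir = os.path.join(args.data_dir, 'scenes', 'example_scene')

    # Hypothetical query points: any (N, 3) tensor on args.device, in the same coordinate
    # frame as the points returned by load_ns_point_cloud().
    query_pts = torch.rand(1000, 3, device=args.device) - 0.5

    # Returns an (N, 2) array with a [low, high] estimate of the chosen property per point.
    pred = predict_physical_property_query(args, query_pts, scene_dir, clip_model, clip_tokenizer)
    print(pred.shape)

Passing return_all=True instead returns the intermediate per-point material probabilities, interpolated CLIP features, and similarities, which is what visualization.py uses to render its PCA, material segmentation, and property-value maps.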