├── openfungraph ├── __init__.py ├── dataset │ ├── __init__.py │ └── dataconfigs │ │ ├── fungraph3d │ │ └── fungraph3d.yaml │ │ └── scenefun3d │ │ └── scenefun3d.yaml ├── llava │ ├── __init__.py │ └── llava_model_16.py ├── scripts │ ├── __init__.py │ ├── ana_rigid_objs.py │ ├── pyviz3d_interactable_results.py │ ├── generate_part_gsa_results.py │ └── generate_gsa_results.py ├── slam │ ├── __init__.py │ ├── mapping.py │ ├── slam_classes.py │ └── cfslam_pipeline_batch.py ├── utils │ ├── __init__.py │ ├── general_utils.py │ ├── model_utils.py │ ├── vis.py │ ├── ious.py │ └── colmap.py ├── scenegraph │ ├── detection_fungraph3d.sh │ ├── detection_scenefun3d.sh │ └── GPTPrompt.py ├── configs │ └── slam_pipeline │ │ └── base.yaml └── eval │ ├── eval_node.py │ └── eval_triplet.py ├── assets ├── teaser.png └── teaser_top.jpg ├── setup.py ├── env_vars.bash.template ├── .gitignore └── README.md /openfungraph/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /openfungraph/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /openfungraph/llava/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /openfungraph/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /openfungraph/slam/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /openfungraph/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhangCYG/OpenFunGraph/HEAD/assets/teaser.png -------------------------------------------------------------------------------- /assets/teaser_top.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhangCYG/OpenFunGraph/HEAD/assets/teaser_top.jpg -------------------------------------------------------------------------------- /openfungraph/dataset/dataconfigs/fungraph3d/fungraph3d.yaml: -------------------------------------------------------------------------------- 1 | dataset_name: 'fungraph3d' 2 | camera_params: 3 | image_height: 1440 4 | image_width: 1920 5 | fx: 1580 6 | fy: 1580 7 | cx: 950 8 | cy: 722 9 | png_depth_scale: 1000 #for depth image in png format 10 | crop_edge: 0 -------------------------------------------------------------------------------- /openfungraph/dataset/dataconfigs/scenefun3d/scenefun3d.yaml: -------------------------------------------------------------------------------- 1 | dataset_name: 'scenefun3d' 2 | camera_params: 3 | image_height: 1440 4 | image_width: 1920 5 | fx: 1592 6 | fy: 1592 7 | cx: 952 8 | cy: 742 9 | png_depth_scale: 1000 #for depth image in png format 10 | crop_edge: 0 -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='openfungraph', 5 | version='1.0.0', 6 | description='Open-Vocabulary Functional 3D Scene Graphs for Real-World Indoor Spaces', 7 | author='See https://openfungraph.github.io/', 8 | packages=find_packages(), 9 | ) -------------------------------------------------------------------------------- /env_vars.bash.template: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Example script setting up the env variables needed for running OpenFunGraph 3 | # Please adapt it to your own paths! 4 | 5 | cd openfungraph 6 | 7 | conda activate openfungraph 8 | 9 | export FG_FOLDER=../ 10 | 11 | export GSA_PATH=../Grounded-Segment-Anything 12 | 13 | export FUNGRAPH3D_ROOT= 14 | export FUNGRAPH3D_CONFIG_PATH=${FG_FOLDER}/openfungraph/dataset/dataconfigs/fungraph3d/fungraph3d.yaml 15 | export SCENEFUN3D_ROOT= # for SceneFun3D, this should include the dev / test split 16 | export SCENEFUN3D_CONFIG_PATH=${FG_FOLDER}/openfungraph/dataset/dataconfigs/scenefun3d/scenefun3d.yaml 17 | 18 | export SCENE_NAME= 19 | 20 | export THRESHOLD=1.2 21 | 22 | export CLASS_SET=ram 23 | 24 | export OPENAI_API_KEY= 25 | 26 | -------------------------------------------------------------------------------- /openfungraph/scenegraph/detection_fungraph3d.sh: -------------------------------------------------------------------------------- 1 | # generate object 2D detection 2 | CUDA_VISIBLE_DEVICES=0 python scripts/generate_gsa_results.py --dataset_root $FUNGRAPH3D_ROOT --dataset_config $FUNGRAPH3D_CONFIG_PATH --scene_id $SCENE_NAME --class_set $CLASS_SET --box_threshold 0.25 --text_threshold 0.25 --stride 1 --add_bg_classes --accumu_classes --exp_suffix withbg_allclasses 3 | 4 | # fuse general objects 5 | python slam/cfslam_pipeline_batch.py dataset_root=$FUNGRAPH3D_ROOT dataset_config=$FUNGRAPH3D_CONFIG_PATH stride=1 scene_id=$SCENE_NAME spatial_sim_type=overlap mask_conf_threshold=0.3 match_method=sim_sum sim_threshold=${THRESHOLD} dbscan_eps=0.1 gsa_variant=ram_withbg_allclasses skip_bg=False max_bbox_area_ratio=0.9 merge_overlap_thresh=0.9 save_suffix=overlap_maskconf0.3_bbox0.9_simsum${THRESHOLD}_dbscan.1 merge_visual_sim_thresh=0.75 merge_text_sim_thresh=0.7 6 | 7 | # detect 2D parts 8 | CUDA_VISIBLE_DEVICES=0 python scripts/generate_part_gsa_results.py --dataset_root $FUNGRAPH3D_ROOT --dataset_config $FUNGRAPH3D_CONFIG_PATH --scene_id $SCENE_NAME --class_set $CLASS_SET --box_threshold 0.15 --text_threshold 0.15 --stride 1 --add_bg_classes --accumu_classes --exp_suffix withbg_allclasses 9 | 10 | # fuse parts 11 | python slam/cfslam_pipeline_batch.py dataset_root=$FUNGRAPH3D_ROOT dataset_config=$FUNGRAPH3D_CONFIG_PATH stride=1 scene_id=$SCENE_NAME spatial_sim_type=overlap mask_conf_threshold=0.15 match_method=sim_sum sim_threshold=${THRESHOLD} dbscan_eps=0.1 gsa_variant=ram_withbg_allclasses skip_bg=False max_bbox_area_ratio=0.1 save_suffix=overlap_maskconf0.15_bbox0.1_simsum${THRESHOLD}_dbscan.1_parts part_reg=True 12 | 13 | python scripts/ana_rigid_objs.py --result_path $FUNGRAPH3D_ROOT'/'$SCENE_NAME'/pcd_saves/full_pcd_ram_withbg_allclasses_overlap_maskconf0.3_bbox0.9_simsum1.2_dbscan.1_post.pkl.gz' --part_result_path $FUNGRAPH3D_ROOT'/'$SCENE_NAME'/part/pcd_saves/full_pcd_ram_withbg_allclasses_overlap_maskconf0.15_bbox0.1_simsum1.2_dbscan.1_parts_post.pkl.gz'
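Note: the final `ana_rigid_objs.py` call above hard-codes `simsum1.2` in the result paths, while the fusion steps build their `save_suffix` from `${THRESHOLD}` (1.2 in `env_vars.bash.template`). Below is a minimal, hypothetical Python sketch (not part of the repo) of how those paths appear to be composed, so they stay consistent if you change `THRESHOLD`; the `_post` suffix is assumed to be appended by the pipeline's post-processing.

```python
# Hypothetical helper (not in the repo): rebuild the expected result paths from the
# environment variables used by the detection scripts above.
import os

root = os.environ["FUNGRAPH3D_ROOT"]      # or SCENEFUN3D_ROOT for the SceneFun3D script
scene = os.environ["SCENE_NAME"]
thr = os.environ.get("THRESHOLD", "1.2")

# Object-level map, assuming save_suffix=overlap_maskconf0.3_bbox0.9_simsum${THRESHOLD}_dbscan.1
obj_pkl = os.path.join(
    root, scene, "pcd_saves",
    f"full_pcd_ram_withbg_allclasses_overlap_maskconf0.3_bbox0.9_simsum{thr}_dbscan.1_post.pkl.gz",
)
# Part-level map, assuming save_suffix=overlap_maskconf0.15_bbox0.1_simsum${THRESHOLD}_dbscan.1_parts
part_pkl = os.path.join(
    root, scene, "part", "pcd_saves",
    f"full_pcd_ram_withbg_allclasses_overlap_maskconf0.15_bbox0.1_simsum{thr}_dbscan.1_parts_post.pkl.gz",
)
print(obj_pkl)
print(part_pkl)
```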
-------------------------------------------------------------------------------- /openfungraph/scenegraph/detection_scenefun3d.sh: -------------------------------------------------------------------------------- 1 | # generate object 2D detection 2 | CUDA_VISIBLE_DEVICES=0 python scripts/generate_gsa_results.py --dataset_root $SCENEFUN3D_ROOT --dataset_config $SCENEFUN3D_CONFIG_PATH --scene_id $SCENE_NAME --class_set $CLASS_SET --box_threshold 0.25 --text_threshold 0.25 --stride 1 --add_bg_classes --accumu_classes --exp_suffix withbg_allclasses 3 | 4 | # fuse general objects 5 | python slam/cfslam_pipeline_batch.py dataset_root=$SCENEFUN3D_ROOT dataset_config=$SCENEFUN3D_CONFIG_PATH stride=1 scene_id=$SCENE_NAME spatial_sim_type=overlap mask_conf_threshold=0.3 match_method=sim_sum sim_threshold=${THRESHOLD} dbscan_eps=0.1 gsa_variant=ram_withbg_allclasses skip_bg=False max_bbox_area_ratio=0.9 merge_overlap_thresh=0.9 save_suffix=overlap_maskconf0.3_bbox0.9_simsum${THRESHOLD}_dbscan.1 merge_visual_sim_thresh=0.75 merge_text_sim_thresh=0.7 6 | 7 | # detect 2D parts 8 | CUDA_VISIBLE_DEVICES=0 python scripts/generate_part_gsa_results.py --dataset_root $SCENEFUN3D_ROOT --dataset_config $SCENEFUN3D_CONFIG_PATH --scene_id $SCENE_NAME --class_set $CLASS_SET --box_threshold 0.15 --text_threshold 0.15 --stride 1 --add_bg_classes --accumu_classes --exp_suffix withbg_allclasses 9 | 10 | # fuse parts 11 | python slam/cfslam_pipeline_batch.py dataset_root=$SCENEFUN3D_ROOT dataset_config=$SCENEFUN3D_CONFIG_PATH stride=1 scene_id=$SCENE_NAME spatial_sim_type=overlap mask_conf_threshold=0.15 match_method=sim_sum sim_threshold=${THRESHOLD} dbscan_eps=0.1 gsa_variant=ram_withbg_allclasses skip_bg=False max_bbox_area_ratio=0.1 save_suffix=overlap_maskconf0.15_bbox0.1_simsum${THRESHOLD}_dbscan.1_parts part_reg=True 12 | 13 | python scripts/ana_rigid_objs.py --result_path $SCENEFUN3D_ROOT'/'$SCENE_NAME'/pcd_saves/full_pcd_ram_withbg_allclasses_overlap_maskconf0.3_bbox0.9_simsum1.2_dbscan.1_post.pkl.gz' --part_result_path $SCENEFUN3D_ROOT'/'$SCENE_NAME'/part/pcd_saves/full_pcd_ram_withbg_allclasses_overlap_maskconf0.15_bbox0.1_simsum1.2_dbscan.1_parts_post.pkl.gz' -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | datasets 2 | openfungraph/outputs 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | .pdm.toml 89 | 90 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 91 | __pypackages__/ 92 | 93 | # Celery stuff 94 | celerybeat-schedule 95 | celerybeat.pid 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | 127 | # pytype static type analyzer 128 | .pytype/ 129 | 130 | # Cython debug symbols 131 | cython_debug/ 132 | 133 | -------------------------------------------------------------------------------- /openfungraph/configs/slam_pipeline/base.yaml: -------------------------------------------------------------------------------- 1 | # Dataset 2 | dataset_root: /home/kuwajerw/NAS3/MyStuff/Projects/gradslam-vlm/data/ai2thor/ 3 | dataset_config: /home/kuwajerw/repos/CFSLAM/cfslam/dataset/dataconfigs/ai2thor/ai2thor.yaml 4 | scene_id: train_3_interact 5 | start: 0 6 | end: -1 7 | stride: 1 8 | image_height: null # if null, it will be determined by dataconfig 9 | image_width: null # if null, it will be determined by dataconfig 10 | 11 | # Input detections 12 | gsa_variant: ram 13 | detection_folder_name: gsa_detections_${gsa_variant} 14 | det_vis_folder_name: gsa_vis_${gsa_variant} 15 | color_file_name: gsa_classes_${gsa_variant} 16 | 17 | device: cuda 18 | 19 | use_iou: !!bool True 20 | spatial_sim_type: iou # "iou", "giou", "overlap" 21 | phys_bias: 0.0 22 | match_method: "sep_thresh" # "sep_thresh", "sim_sum" 23 | # Only when match_method=="sep_thresh" 24 | semantic_threshold: 0.5 25 | physical_threshold: 0.5 26 | # Only when match_method=="sim_sum" 27 | sim_threshold: 0 28 | 29 | # For contain_number 30 | use_contain_number: !!bool False 31 | contain_area_thresh: 0.95 32 | contain_mismatch_penalty: 0.5 33 | 34 | # Selection criteria on the 2D masks 35 | mask_area_threshold: 25 # mask with pixel area less than this will be skipped 36 | mask_conf_threshold: 0.2 # mask with lower confidence score will be skipped 37 | max_bbox_area_ratio: 1.0 # boxes with larger areas than this will be skipped 38 | skip_bg: !!bool True 39 | min_points_threshold: 16 # projected and sampled pcd with less points will be skipped 40 | 41 | # point cloud processing 42 | downsample_voxel_size: 0.01 43 | dbscan_remove_noise: !!bool True 44 | dbscan_eps: 0.05 45 | dbscan_min_points: 10 46 | 47 | # Selection criteria of the fused object point cloud 48 | 
obj_min_points: 0 49 | obj_min_detections: 9 50 | 51 | # For merge_overlap_objects() function 52 | merge_overlap_thresh: 0.7 # -1 means do not perform the merge_overlap_objects() 53 | merge_visual_sim_thresh: 0.7 # Merge only if the visual similarity is larger 54 | merge_text_sim_thresh: 0.7 # Merge only if the text cosine sim is larger 55 | 56 | # Periodically perform post-process operations every k frame 57 | # -1 means not perform them during the run. They are performed at the end anyway. 58 | denoise_interval: 20 # Run DBSCAN every k frame. This operation is heavy 59 | filter_interval: -1 # Filter objects that have too few associations or are too small 60 | merge_interval: -1 # Merge objects based on geometric and semantic similarity 61 | 62 | # Output point cloud 63 | save_pcd: !!bool True 64 | save_suffix: exp 65 | 66 | # Visualization 67 | debug_render: !!bool False # If True, the vis.run() will be called and used for debugging 68 | class_agnostic: !!bool False # If set, the color will be set by instance, rather than most common class 69 | 70 | render_camera_path: "replica_room0.json" 71 | 72 | # part recognition 73 | part_reg: !!bool False -------------------------------------------------------------------------------- /openfungraph/slam/mapping.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | from openfungraph.slam.slam_classes import MapObjectList, DetectionList 5 | from openfungraph.utils.general_utils import Timer 6 | from openfungraph.utils.ious import ( 7 | compute_iou_batch, 8 | compute_giou_batch, 9 | compute_3d_iou_accuracte_batch, 10 | compute_3d_giou_accurate_batch, 11 | ) 12 | from openfungraph.slam.utils import ( 13 | merge_obj2_into_obj1, 14 | compute_overlap_matrix_2set 15 | ) 16 | 17 | def compute_spatial_similarities(cfg, detection_list: DetectionList, objects: MapObjectList) -> torch.Tensor: 18 | ''' 19 | Compute the spatial similarities between the detections and the objects 20 | 21 | Args: 22 | detection_list: a list of M detections 23 | objects: a list of N objects in the map 24 | Returns: 25 | A MxN tensor of spatial similarities 26 | ''' 27 | det_bboxes = detection_list.get_stacked_values_torch('bbox') 28 | obj_bboxes = objects.get_stacked_values_torch('bbox') 29 | 30 | if cfg.spatial_sim_type == "iou": 31 | spatial_sim = compute_iou_batch(det_bboxes, obj_bboxes) 32 | elif cfg.spatial_sim_type == "giou": 33 | spatial_sim = compute_giou_batch(det_bboxes, obj_bboxes) 34 | elif cfg.spatial_sim_type == "iou_accurate": 35 | spatial_sim = compute_3d_iou_accuracte_batch(det_bboxes, obj_bboxes) 36 | elif cfg.spatial_sim_type == "giou_accurate": 37 | spatial_sim = compute_3d_giou_accurate_batch(det_bboxes, obj_bboxes) 38 | elif cfg.spatial_sim_type == "overlap": 39 | spatial_sim = compute_overlap_matrix_2set(cfg, objects, detection_list) 40 | spatial_sim = torch.from_numpy(spatial_sim).T 41 | else: 42 | raise ValueError(f"Invalid spatial similarity type: {cfg.spatial_sim_type}") 43 | 44 | return spatial_sim 45 | 46 | def compute_visual_similarities(cfg, detection_list: DetectionList, objects: MapObjectList) -> torch.Tensor: 47 | ''' 48 | Compute the visual similarities between the detections and the objects 49 | 50 | Args: 51 | detection_list: a list of M detections 52 | objects: a list of N objects in the map 53 | Returns: 54 | A MxN tensor of visual similarities 55 | ''' 56 | det_fts = detection_list.get_stacked_values_torch('clip_ft') # (M, D) 57 | obj_fts = 
objects.get_stacked_values_torch('clip_ft') # (N, D) 58 | 59 | det_fts = det_fts.unsqueeze(-1) # (M, D, 1) 60 | obj_fts = obj_fts.T.unsqueeze(0) # (1, D, N) 61 | 62 | visual_sim = F.cosine_similarity(det_fts, obj_fts, dim=1) # (M, N) 63 | 64 | return visual_sim 65 | 66 | def aggregate_similarities(cfg, spatial_sim: torch.Tensor, visual_sim: torch.Tensor) -> torch.Tensor: 67 | ''' 68 | Aggregate spatial and visual similarities into a single similarity score 69 | 70 | Args: 71 | spatial_sim: a MxN tensor of spatial similarities 72 | visual_sim: a MxN tensor of visual similarities 73 | Returns: 74 | A MxN tensor of aggregated similarities 75 | ''' 76 | if cfg.match_method == "sim_sum": 77 | sims = (1 + cfg.phys_bias) * spatial_sim + (1 - cfg.phys_bias) * visual_sim # (M, N) 78 | else: 79 | raise ValueError(f"Unknown matching method: {cfg.match_method}") 80 | 81 | return sims 82 | 83 | def merge_detections_to_objects( 84 | cfg, 85 | detection_list: DetectionList, 86 | objects: MapObjectList, 87 | agg_sim: torch.Tensor 88 | ) -> MapObjectList: 89 | # Iterate through all detections and merge them into objects 90 | for i in range(agg_sim.shape[0]): 91 | # If not matched to any object, add it as a new object 92 | if agg_sim[i].max() == float('-inf'): 93 | objects.append(detection_list[i]) 94 | # Merge with most similar existing object 95 | else: 96 | j = agg_sim[i].argmax() 97 | matched_det = detection_list[i] 98 | matched_obj = objects[j] 99 | merged_obj = merge_obj2_into_obj1(cfg, matched_obj, matched_det, run_dbscan=False) 100 | objects[j] = merged_obj 101 | 102 | return objects -------------------------------------------------------------------------------- /openfungraph/utils/general_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import numpy as np 4 | import time 5 | 6 | class Timer: 7 | def __init__(self, heading = "", verbose = True): 8 | self.verbose = verbose 9 | if not self.verbose: 10 | return 11 | self.heading = heading 12 | 13 | def __enter__(self): 14 | if not self.verbose: 15 | return self 16 | self.start = time.time() 17 | return self 18 | 19 | def __exit__(self, *args): 20 | if not self.verbose: 21 | return 22 | self.end = time.time() 23 | self.interval = self.end - self.start 24 | print(self.heading, self.interval) 25 | 26 | def to_numpy(tensor): 27 | if isinstance(tensor, np.ndarray): 28 | return tensor 29 | return tensor.detach().cpu().numpy() 30 | 31 | def to_tensor(numpy_array, device=None): 32 | if isinstance(numpy_array, torch.Tensor): 33 | return numpy_array 34 | if device is None: 35 | return torch.from_numpy(numpy_array) 36 | else: 37 | return torch.from_numpy(numpy_array).to(device) 38 | 39 | def to_scalar(d: np.ndarray | torch.Tensor | float) -> int | float: 40 | ''' 41 | Convert the d to a scalar 42 | ''' 43 | if isinstance(d, float): 44 | return d 45 | 46 | elif "numpy" in str(type(d)): 47 | assert d.size == 1 48 | return d.item() 49 | 50 | elif isinstance(d, torch.Tensor): 51 | assert d.numel() == 1 52 | return d.item() 53 | 54 | else: 55 | raise TypeError(f"Invalid type for conversion: {type(d)}") 56 | 57 | def prjson(input_json, indent=0): 58 | """ Pretty print a json object """ 59 | if not isinstance(input_json, list): 60 | input_json = [input_json] 61 | 62 | print("[") 63 | for i, entry in enumerate(input_json): 64 | print(" {") 65 | for j, (key, value) in enumerate(entry.items()): 66 | terminator = "," if j < len(entry) - 1 else "" 67 | if isinstance(value, str): 68 | 
formatted_value = value.replace("\\n", "\n").replace("\\t", "\t") 69 | print(' "{}": "{}"{}'.format(key, formatted_value, terminator)) 70 | else: 71 | print(f' "{key}": {value}{terminator}') 72 | print(" }" + ("," if i < len(input_json) - 1 else "")) 73 | print("]") 74 | 75 | def cfg_to_dict(input_cfg): 76 | """ Convert a json object to a dictionary representation """ 77 | # Ensure input is a list for uniform processing 78 | if not isinstance(input_cfg, list): 79 | input_cfg = [input_cfg] 80 | 81 | result = [] # Initialize the result list to hold our dictionaries 82 | 83 | for entry in input_cfg: 84 | entry_dict = {} # Dictionary to store current entry's data 85 | for key, value in entry.items(): 86 | # Replace escaped newline and tab characters in strings 87 | if isinstance(value, str): 88 | formatted_value = value.replace("\\n", "\n").replace("\\t", "\t") 89 | else: 90 | formatted_value = value 91 | # Add the key-value pair to the current entry dictionary 92 | entry_dict[key] = formatted_value 93 | # Append the current entry dictionary to the result list 94 | result.append(entry_dict) 95 | 96 | # Return the result in dictionary format if it's a single entry or list of dictionaries otherwise 97 | return result[0] if len(result) == 1 else result 98 | 99 | def measure_time(func): 100 | def wrapper(*args, **kwargs): 101 | start_time = time.time() 102 | # print(f"Starting {func.__name__}...") 103 | result = func(*args, **kwargs) # Call the function with any arguments it was called with 104 | end_time = time.time() 105 | elapsed_time = end_time - start_time 106 | print(f"Done! Execution time of {func.__name__} function: {elapsed_time:.2f} seconds") 107 | return result # Return the result of the function call 108 | return wrapper 109 | 110 | def save_hydra_config(hydra_cfg, exp_out_path): 111 | with open(exp_out_path / "config_params.json", "w") as f: 112 | json.dump(cfg_to_dict(hydra_cfg), f, indent=2) -------------------------------------------------------------------------------- /openfungraph/llava/llava_model_16.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | from PIL import Image 4 | 5 | from llava.constants import ( 6 | IMAGE_TOKEN_INDEX, 7 | DEFAULT_IMAGE_TOKEN, 8 | DEFAULT_IM_START_TOKEN, 9 | DEFAULT_IM_END_TOKEN, 10 | IMAGE_PLACEHOLDER, 11 | ) 12 | from llava.conversation import conv_templates, SeparatorStyle 13 | from llava.model.builder import load_pretrained_model 14 | from llava.utils import disable_torch_init 15 | from llava.mm_utils import ( 16 | process_images, 17 | tokenizer_image_token, 18 | get_model_name_from_path, 19 | ) 20 | 21 | from PIL import Image 22 | 23 | import requests 24 | from PIL import Image 25 | from io import BytesIO 26 | import re 27 | 28 | class LlavaModel16(): 29 | def __init__( 30 | self, 31 | model_path, 32 | model_base, 33 | conv_mode_input, 34 | ) -> None: 35 | disable_torch_init() 36 | 37 | model_name = get_model_name_from_path(model_path) 38 | tokenizer, model, image_processor, context_len = load_pretrained_model( 39 | model_path, model_base, model_name 40 | ) 41 | 42 | if "llama-2" in model_name.lower(): 43 | conv_mode = "llava_llama_2" 44 | elif "mistral" in model_name.lower(): 45 | conv_mode = "mistral_instruct" 46 | elif "v1.6-34b" in model_name.lower(): 47 | conv_mode = "chatml_direct" 48 | elif "v1" in model_name.lower(): 49 | conv_mode = "llava_v1" 50 | elif "mpt" in model_name.lower(): 51 | conv_mode = "mpt" 52 | else: 53 | conv_mode = "llava_v0" 54 | 55 | if 
conv_mode_input is not None and conv_mode != conv_mode_input: 56 | print( 57 | "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format( 58 | conv_mode, conv_mode_input, conv_mode_input 59 | ) 60 | ) 61 | conv_mode = conv_mode_input 62 | 63 | self.model = model 64 | self.tokenizer = tokenizer 65 | self.image_processor = image_processor 66 | self.context_len = context_len 67 | self.conv_mode = conv_mode 68 | 69 | def infer( 70 | self, 71 | query: str, 72 | images: list[Image.Image], 73 | top_p = None, 74 | num_beams: int = 1, 75 | max_new_tokens: int = 512, 76 | temperature: float = 0.0, 77 | ): 78 | qs = query 79 | image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN 80 | if IMAGE_PLACEHOLDER in qs: 81 | if self.model.config.mm_use_im_start_end: 82 | qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs) 83 | else: 84 | qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs) 85 | else: 86 | if self.model.config.mm_use_im_start_end: 87 | qs = image_token_se + "\n" + qs 88 | else: 89 | qs = DEFAULT_IMAGE_TOKEN + "\n" + qs 90 | 91 | conv = conv_templates[self.conv_mode].copy() 92 | conv.append_message(conv.roles[0], qs) 93 | conv.append_message(conv.roles[1], None) 94 | prompt = conv.get_prompt() 95 | 96 | image_sizes = [x.size for x in images] 97 | images_tensor = process_images( 98 | images, 99 | self.image_processor, 100 | self.model.config 101 | ).to(self.model.device, dtype=torch.float16) 102 | 103 | input_ids = ( 104 | tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") 105 | .unsqueeze(0) 106 | .cuda() 107 | ) 108 | 109 | with torch.inference_mode(): 110 | output_ids = self.model.generate( 111 | input_ids, 112 | images=images_tensor, 113 | image_sizes=image_sizes, 114 | do_sample=True if temperature > 0 else False, 115 | temperature=temperature, 116 | top_p=top_p, 117 | num_beams=num_beams, 118 | max_new_tokens=max_new_tokens, 119 | use_cache=True, 120 | ) 121 | 122 | outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 123 | return outputs -------------------------------------------------------------------------------- /openfungraph/scripts/ana_rigid_objs.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import pickle 3 | import gzip 4 | import argparse 5 | import numpy as np 6 | import open3d as o3d 7 | 8 | from openfungraph.slam.slam_classes import MapObjectList 9 | 10 | 11 | def get_parser(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--result_path", type=str, required=True) 14 | parser.add_argument("--part_result_path", type=str, required=True) 15 | 16 | return parser 17 | 18 | 19 | def get_classes_colors(classes): 20 | class_colors = {} 21 | 22 | # Generate a random color for each class 23 | for class_idx, class_name in enumerate(classes): 24 | # Generate random RGB values between 0 and 255 25 | r = np.random.randint(0, 256)/255.0 26 | g = np.random.randint(0, 256)/255.0 27 | b = np.random.randint(0, 256)/255.0 28 | 29 | # Assign the RGB values as a tuple to the class in the dictionary 30 | class_colors[class_name] = (r, g, b) 31 | 32 | class_colors[-1] = (0, 0, 0) 33 | 34 | return class_colors 35 | 36 | 37 | def compute_overlap_ratio(source, target, distance_threshold=0.02): 38 | # source: part 39 | # target: object 40 | 41 | # source_tree = o3d.geometry.KDTreeFlann(source) 42 | target_tree = o3d.geometry.KDTreeFlann(target) 43 | 44 | overlap_count = 0 45 | for point 
in source.points: 46 | [_, idx, _] = target_tree.search_radius_vector_3d(point, distance_threshold) 47 | if len(idx) > 0: 48 | overlap_count += 1 49 | 50 | overlap_ratio = overlap_count / len(source.points) 51 | return overlap_ratio 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = get_parser() 56 | args = parser.parse_args() 57 | 58 | result_path = args.result_path 59 | part_result_path = args.part_result_path 60 | 61 | with gzip.open(result_path, "rb") as f: 62 | results = pickle.load(f) 63 | 64 | with gzip.open(part_result_path, "rb") as fp: 65 | part_results = pickle.load(fp) 66 | 67 | objects = MapObjectList() 68 | objects.load_serializable(results['objects']) 69 | 70 | parts = MapObjectList() 71 | parts.load_serializable(part_results['objects']) 72 | 73 | # Run the post-processing filtering and merging if instructed to do so 74 | cfg = copy.deepcopy(results['cfg']) 75 | 76 | parts_interest = ["knob", "button", "handle"] 77 | 78 | rigid_inter_id_candidate = [] 79 | part_inter_id_candidate = [] 80 | 81 | for inter_idx, obj_inter in enumerate(objects): 82 | obj_inter['connected_parts'] = [] 83 | 84 | for inter_idx, obj_inter in enumerate(objects): 85 | obj_classes_inter = np.asarray(obj_inter['class_name']) 86 | values_inter, counts_inter = np.unique(obj_classes_inter, return_counts=True) 87 | obj_class_inter = values_inter[np.argmax(counts_inter)] 88 | tag = False 89 | for obj_idx, obj in enumerate(parts): 90 | obj_classes = np.asarray(obj['class_name']) 91 | values, counts = np.unique(obj_classes, return_counts=True) 92 | obj_class = values[np.argmax(counts)] 93 | if obj_class in parts_interest: 94 | # an interactable part 95 | # detect nearby objects of interest 96 | points_part = obj['pcd'] 97 | points_obj_inter = obj_inter['pcd'] 98 | iou = compute_overlap_ratio(points_part, points_obj_inter, 0.02) 99 | # fusion based on inter objects: 1 object many parts and object must be big enough 100 | obj_box_extent = obj_inter['bbox'].extent 101 | part_box_extent = obj['bbox'].extent 102 | if iou > 0.7 and obj_box_extent.mean() > 3 * part_box_extent.mean(): 103 | print(obj_class_inter, ' ', obj_class, ' ', iou) 104 | if 'connected_parts' not in obj_inter: 105 | # obj_inter['ori_id'] = inter_idx 106 | obj_inter['connected_parts'] = [] 107 | obj_inter['connected_parts'].append(obj_idx) 108 | part_inter_id_candidate.append(obj_idx) 109 | tag = True 110 | else: 111 | obj_inter['connected_parts'].append(obj_idx) 112 | part_inter_id_candidate.append(obj_idx) 113 | tag = True 114 | if tag: 115 | rigid_inter_id_candidate.append(inter_idx) 116 | 117 | updated_results = { 118 | 'objects': objects.to_serializable(), 119 | 'cfg': results['cfg'], 120 | 'class_names': results['class_names'], 121 | 'class_colors': results['class_colors'], 122 | 'inter_id_candidate': rigid_inter_id_candidate 123 | } 124 | 125 | save_path = result_path 126 | 127 | with gzip.open(save_path, "wb") as f: 128 | pickle.dump(updated_results, f) 129 | print(f"Saved full point cloud to {save_path}") 130 | 131 | updated_results = { 132 | 'objects': parts.to_serializable(), 133 | 'cfg': part_results['cfg'], 134 | 'class_names': part_results['class_names'], 135 | 'class_colors': part_results['class_colors'], 136 | 'part_inter_id_candidate': part_inter_id_candidate 137 | } 138 | 139 | save_path = part_result_path 140 | 141 | with gzip.open(save_path, "wb") as f: 142 | pickle.dump(updated_results, f) 143 | print(f"Saved full point cloud to {save_path}")
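For reference, a minimal sketch (not part of the repo) of how the updated pickle written by `ana_rigid_objs.py` can be loaded and inspected. It assumes the keys saved above (`objects`, `inter_id_candidate`) and the `connected_parts` field attached to each object; the file path is a placeholder.

```python
# Minimal inspection sketch (not in the repo); replace the path with your own result file.
import gzip
import pickle
import numpy as np

from openfungraph.slam.slam_classes import MapObjectList

with gzip.open("full_pcd_..._post.pkl.gz", "rb") as f:  # object-level result from ana_rigid_objs.py
    results = pickle.load(f)

objects = MapObjectList()
objects.load_serializable(results["objects"])

# Objects that were linked to at least one interactable part (knob / button / handle)
for idx in results["inter_id_candidate"]:
    obj = objects[idx]
    values, counts = np.unique(np.asarray(obj["class_name"]), return_counts=True)
    print(values[np.argmax(counts)], "-> connected part indices:", obj["connected_parts"])
```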
-------------------------------------------------------------------------------- /openfungraph/slam/slam_classes.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | import copy 3 | import matplotlib 4 | import torch 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import open3d as o3d 8 | 9 | def to_numpy(tensor): 10 | if isinstance(tensor, np.ndarray): 11 | return tensor 12 | return tensor.detach().cpu().numpy() 13 | 14 | def to_tensor(numpy_array, device=None): 15 | if isinstance(numpy_array, torch.Tensor): 16 | return numpy_array 17 | if device is None: 18 | return torch.from_numpy(numpy_array) 19 | else: 20 | return torch.from_numpy(numpy_array).to(device) 21 | 22 | class DetectionList(list): 23 | def get_values(self, key, idx:int=None): 24 | if idx is None: 25 | return [detection[key] for detection in self] 26 | else: 27 | return [detection[key][idx] for detection in self] 28 | 29 | def get_stacked_values_torch(self, key, idx:int=None): 30 | values = [] 31 | for detection in self: 32 | v = detection[key] 33 | if idx is not None: 34 | v = v[idx] 35 | if isinstance(v, o3d.geometry.OrientedBoundingBox) or \ 36 | isinstance(v, o3d.geometry.AxisAlignedBoundingBox): 37 | v = np.asarray(v.get_box_points()) 38 | if isinstance(v, np.ndarray): 39 | v = torch.from_numpy(v) 40 | values.append(v) 41 | return torch.stack(values, dim=0) 42 | 43 | def get_stacked_values_numpy(self, key, idx:int=None): 44 | values = self.get_stacked_values_torch(key, idx) 45 | return to_numpy(values) 46 | 47 | def __add__(self, other): 48 | new_list = copy.deepcopy(self) 49 | new_list.extend(other) 50 | return new_list 51 | 52 | def __iadd__(self, other): 53 | self.extend(other) 54 | return self 55 | 56 | def slice_by_indices(self, index: Iterable[int]): 57 | ''' 58 | Return a sublist of the current list by indexing 59 | ''' 60 | new_self = type(self)() 61 | for i in index: 62 | new_self.append(self[i]) 63 | return new_self 64 | 65 | def slice_by_mask(self, mask: Iterable[bool]): 66 | ''' 67 | Return a sublist of the current list by masking 68 | ''' 69 | new_self = type(self)() 70 | for i, m in enumerate(mask): 71 | if m: 72 | new_self.append(self[i]) 73 | return new_self 74 | 75 | def get_most_common_class(self) -> list[int]: 76 | classes = [] 77 | for d in self: 78 | values, counts = np.unique(np.asarray(d['class_name']), return_counts=True) 79 | most_common_class = values[np.argmax(counts)] 80 | classes.append(most_common_class) 81 | return classes 82 | 83 | def color_by_most_common_classes(self, colors_dict: dict[str, list[float]], color_bbox: bool=True): 84 | ''' 85 | Color the point cloud of each detection by the most common class 86 | ''' 87 | classes = self.get_most_common_class() 88 | for d, c in zip(self, classes): 89 | color = colors_dict[str(c)] 90 | d['pcd'].paint_uniform_color(color) 91 | if color_bbox: 92 | d['bbox'].color = color 93 | 94 | def color_by_instance(self): 95 | if len(self) == 0: 96 | # Do nothing 97 | return 98 | 99 | if "inst_color" in self[0]: 100 | for d in self: 101 | d['pcd'].paint_uniform_color(d['inst_color']) 102 | d['bbox'].color = d['inst_color'] 103 | else: 104 | cmap = matplotlib.colormaps.get_cmap("turbo") 105 | instance_colors = cmap(np.linspace(0, 1, len(self))) 106 | instance_colors = instance_colors[:, :3] 107 | for i in range(len(self)): 108 | self[i]['pcd'].paint_uniform_color(instance_colors[i]) 109 | self[i]['bbox'].color = instance_colors[i] 110 | 111 | 112 | class 
MapObjectList(DetectionList): 113 | def compute_similarities(self, new_clip_ft): 114 | ''' 115 | The input feature should be of shape (D, ), a one-row vector 116 | This is mostly for backward compatibility 117 | ''' 118 | # if it is a numpy array, make it a tensor 119 | new_clip_ft = to_tensor(new_clip_ft) 120 | 121 | # assuming cosine similarity for features 122 | clip_fts = self.get_stacked_values_torch('clip_ft') 123 | 124 | similarities = F.cosine_similarity(new_clip_ft.unsqueeze(0), clip_fts) 125 | # return similarities.squeeze() 126 | return similarities 127 | 128 | def to_serializable(self): 129 | s_obj_list = [] 130 | for obj in self: 131 | s_obj_dict = copy.deepcopy(obj) 132 | 133 | s_obj_dict['clip_ft'] = to_numpy(s_obj_dict['clip_ft']) 134 | s_obj_dict['text_ft'] = to_numpy(s_obj_dict['text_ft']) 135 | 136 | s_obj_dict['pcd_np'] = np.asarray(s_obj_dict['pcd'].points) 137 | s_obj_dict['bbox_np'] = np.asarray(s_obj_dict['bbox'].get_box_points()) 138 | s_obj_dict['pcd_color_np'] = np.asarray(s_obj_dict['pcd'].colors) 139 | 140 | del s_obj_dict['pcd'] 141 | del s_obj_dict['bbox'] 142 | 143 | s_obj_list.append(s_obj_dict) 144 | 145 | return s_obj_list 146 | 147 | def load_serializable(self, s_obj_list): 148 | assert len(self) == 0, 'MapObjectList should be empty when loading' 149 | for s_obj_dict in s_obj_list: 150 | try: 151 | new_obj = copy.deepcopy(s_obj_dict) 152 | 153 | new_obj['clip_ft'] = to_tensor(new_obj['clip_ft']) 154 | new_obj['text_ft'] = to_tensor(new_obj['text_ft']) 155 | 156 | new_obj['pcd'] = o3d.geometry.PointCloud() 157 | new_obj['pcd'].points = o3d.utility.Vector3dVector(new_obj['pcd_np']) 158 | new_obj['bbox'] = o3d.geometry.OrientedBoundingBox.create_from_points( 159 | o3d.utility.Vector3dVector(new_obj['bbox_np'])) 160 | new_obj['bbox'].color = new_obj['pcd_color_np'][0] 161 | new_obj['pcd'].colors = o3d.utility.Vector3dVector(new_obj['pcd_color_np']) 162 | 163 | del new_obj['pcd_np'] 164 | del new_obj['bbox_np'] 165 | del new_obj['pcd_color_np'] 166 | 167 | self.append(new_obj) 168 | except: 169 | continue -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open-Vocabulary Functional 3D Scene Graphs for Real-World Indoor Spaces 2 | 3 | [**Project Page**](https://openfungraph.github.io/) 4 | 5 | ![Splash Figure Top](./assets/teaser_top.jpg) 6 | ![Splash Figure](./assets/teaser.png) 7 | 8 | ## Setup 9 | 10 | ### Install the required libraries 11 | 12 | ```bash 13 | conda create -n openfungraph python=3.10 14 | conda activate openfungraph 15 | 16 | ##### Install Pytorch according to your own setup ##### 17 | # For example, if you have a GPU with CUDA 11.8 18 | # Note that this version is compatible with the LLaVA repo 19 | # Here we install cudatoolkit via Conda for installation of Grounded-SAM 20 | conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=11.8 cudatoolkit=11.8 -c pytorch -c nvidia 21 | 22 | # Install the Faiss library (CPU version should be fine) 23 | conda install -c pytorch faiss-cpu=1.7.4 mkl=2021 blas=1.0=mkl 24 | 25 | # Install Pytorch3D following the instructions at 26 | # https://github.com/facebookresearch/pytorch3d/blob/main/INSTALL.md 27 | # We recommend installing from a local clone to avoid conflicts 28 | 29 | # Install the required libraries 30 | pip install tyro open_clip_torch wandb h5py openai hydra-core distinctipy pyviz3d line_profiler 31 | 32 | # Install the gradslam package and its dependencies 33 | git
clone https://github.com/krrish94/chamferdist.git 34 | cd chamferdist 35 | pip install . 36 | cd .. 37 | git clone https://github.com/gradslam/gradslam.git 38 | cd gradslam 39 | git checkout conceptfusion 40 | pip install . 41 | ``` 42 | 43 | ### Install [Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything) package 44 | 45 | Follow the instructions on the original [repo](https://github.com/IDEA-Research/Grounded-Segment-Anything#install-without-docker). 46 | 47 | First, check out the package: 48 | 49 | ```bash 50 | git clone git@github.com:IDEA-Research/Grounded-Segment-Anything.git 51 | ``` 52 | 53 | Then, install the package following the commands listed in the original GitHub repo. You can skip the `Install osx` step and the "optional dependencies". 54 | 55 | During this process, you will need to set the `CUDA_HOME` to be where the CUDA toolkit is installed. 56 | The CUDA toolkit can be set up system-wide or within a conda environment. 57 | We tested it within a conda environment, i.e., installing [cudatoolkit-dev](https://anaconda.org/conda-forge/cudatoolkit-dev) with conda as in the commands above. 58 | 59 | ```bash 60 | # and you need to replace `export CUDA_HOME=/path/to/cuda-11.3/` with 61 | export CUDA_HOME=/path/to/anaconda3/envs/openfungraph/ 62 | ``` 63 | 64 | You also need to download `ram_swin_large_14m.pth`, `groundingdino_swint_ogc.pth`, and `sam_vit_h_4b8939.pth` following the instructions [here](https://github.com/IDEA-Research/Grounded-Segment-Anything#label-grounded-sam-with-ram-or-tag2text-for-automatic-labeling). 65 | 66 | After installation, set the path to Grounded-SAM as an environment variable. 67 | 68 | ```bash 69 | export GSA_PATH=/path/to/Grounded-Segment-Anything 70 | ``` 71 | 72 | ### Set up LLaVA 73 | 74 | Follow the instructions on the [LLaVA repo](https://github.com/haotian-liu/LLaVA) to set it up. We have tested with the model checkpoint `LLaVA-7B-v1.6`. 75 | 76 | ### Install this repo 77 | 78 | ```bash 79 | cd OpenFunGraph 80 | pip install -e . 81 | ``` 82 | 83 | ## Prepare dataset 84 | 85 | Download the [customized SceneFun3D dataset](https://huggingface.co/datasets/OpenFunGraph/SceneFun3D_Graph) and the newly recorded [FunGraph3D dataset](https://huggingface.co/datasets/OpenFunGraph/FunGraph3D). 86 | 87 | The file structure is described in each dataset's top-level repository. 88 | Note that the related paths should be set as in ``env_vars.bash.template``. 89 | 90 | ```bash 91 | export FUNGRAPH3D_ROOT= 92 | export FUNGRAPH3D_CONFIG_PATH=${FG_FOLDER}/openfungraph/dataset/dataconfigs/fungraph3d/fungraph3d.yaml 93 | export SCENEFUN3D_ROOT= # for SceneFun3D, this should include the dev / test split 94 | export SCENEFUN3D_CONFIG_PATH=${FG_FOLDER}/openfungraph/dataset/dataconfigs/scenefun3d/scenefun3d.yaml 95 | ``` 96 | 97 | OpenFunGraph can also be easily run on other datasets. 98 | See `dataset/datasets_common.py` for how to write your own dataloader. 99 | 100 | ## Run OpenFunGraph 101 | 102 | The env variables needed can be found in `env_vars.bash.template`. 103 | When following the guide below, change the variables accordingly for your setup. 104 | 105 | The following commands should be run in the `openfungraph` folder. 106 | 107 | ```bash 108 | cd openfungraph 109 | ``` 110 | 111 | ### Functional Scene Graph Node Detection 112 | 113 | ```bash 114 | export SCENE_NAME= 115 | 116 | bash scenegraph/detection_scenefun3d.sh (or *_fungraph3d.sh) 117 | ``` 118 | 119 | The above commands will save the 2D node detection and segmentation results.
120 | 121 | You can ignore the `There's a wrong phrase happen, this is because of our post-process merged wrong tokens, which will be modified in the future. We will assign it with a random label at this time.` message. 122 | 123 | ### 3D Functional Scene Graph Construction 124 | 125 | Ensure that the `openai` package is installed and that your API key is set. We recommend using GPT-4. 126 | ```bash 127 | export OPENAI_API_KEY= 128 | ``` 129 | 130 | ```bash 131 | CUDA_VISIBLE_DEVICES=0 python scenegraph/build_fungraph_whole_openai.py --dataset_root ${SCENEFUN3D_ROOT} ``or`` ${FUNGRAPH3D_ROOT} --scene_name ${SCENE_NAME} --mapfile ``/pcd_saves/full_pcd_ram_withbg_allclasses_overlap_maskconf0.3_bbox0.9_simsum1.2_dbscan.1_post.pkl.gz`` --part_file ``/part/pcd_saves/full_pcd_ram_withbg_allclasses_overlap_maskconf0.15_bbox0.1_simsum1.2_dbscan.1_parts_post.pkl.gz`` 132 | ``` 133 | 134 | ### Visualization and Evaluation 135 | 136 | After running the algorithm, you get three key modified assets: the object-level nodes ``/pcd_saves/full_pcd_ram_withbg_allclasses_overlap_maskconf0.3_bbox0.9_simsum1.2_dbscan.1_post.pkl.gz`` (the name may vary depending on the parameters you choose), the sub-object-level elements ``/part/pcd_saves/full_pcd_ram_withbg_allclasses_overlap_maskconf0.15_bbox0.1_simsum1.2_dbscan.1_parts_post.pkl.gz``, and the final graph edges ``/cfslam_funcgraph_edges.pkl`` (or the variant with confidence scores). 137 | 138 | Visualize them with: 139 | ```bash 140 | python scripts/pyviz3d_interactable_results.py --inter_result_path --part_result_path --edge_file --pc_path (--pose_path (only for SCENEFUN3D) /*_transform.npy) 141 | ``` 142 | 143 | Evaluation scripts: 144 | For node evaluation: 145 | ```bash 146 | python eval/eval_node.py --dataset --root_path --scene --video