├── .gitignore ├── LICENSE ├── README.md ├── data └── README.md ├── example_data ├── annotations │ └── example_input.json └── images │ └── example_dataset │ └── example_scene │ └── example_image.jpg ├── requirements.txt ├── robospatial ├── README.md ├── __init__.py ├── annotation_generator.py ├── configs │ ├── embodiedscan.yaml │ ├── example_config.yaml │ └── example_dataset.yaml ├── data_loader │ ├── README.md │ ├── __init__.py │ ├── base_loader.py │ ├── embodiedscan_loader.py │ └── example_loader.py ├── run_generation.py └── spatial_analysis │ ├── __init__.py │ ├── compatibility │ ├── compatibility.py │ └── compatibility_utils.py │ ├── configuration │ ├── configuration.py │ └── configuration_utils.py │ ├── context │ ├── context.py │ └── context_utils.py │ ├── grounding.py │ ├── obj_properties.py │ ├── relationship_utils.py │ ├── relationships.py │ └── topdown_map.py └── scripts ├── visualize_input.py └── visualize_output.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | 173 | # outputs 174 | outputs/ 175 | checkpoints/ 176 | wandb/ 177 | 178 | example_data/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024-Present, NVIDIA Corporation & affiliates. All rights reserved. 2 | 3 | 4 | ======================================================================= 5 | 6 | 1. Definitions 7 | 8 | "Licensor" means any person or entity that distributes its Work. 9 | 10 | "Software" means the original work of authorship made available under 11 | this License. 12 | 13 | "Work" means the Software and any additions to or derivative works of 14 | the Software that are made available under this License. 15 | 16 | The terms "reproduce," "reproduction," "derivative works," and 17 | "distribution" have the meaning as provided under U.S. copyright law; 18 | provided, however, that for the purposes of this License, derivative 19 | works shall not include works that remain separable from, or merely 20 | link (or bind by name) to the interfaces of, the Work. 21 | 22 | Works, including the Software, are "made available" under this License 23 | by including in or with the Work either (a) a copyright notice 24 | referencing the applicability of this License to the Work, or (b) a 25 | copy of this License. 26 | 27 | 2. License Grants 28 | 29 | 2.1 Copyright Grant. Subject to the terms and conditions of this 30 | License, each Licensor grants to you a perpetual, worldwide, 31 | non-exclusive, royalty-free, copyright license to reproduce, 32 | prepare derivative works of, publicly display, publicly perform, 33 | sublicense and distribute its Work and any resulting derivative 34 | works in any form. 35 | 36 | 3. 
Limitations 37 | 38 | 3.1 Redistribution. You may reproduce or distribute the Work only 39 | if (a) you do so under this License, (b) you include a complete 40 | copy of this License with your distribution, and (c) you retain 41 | without modification any copyright, patent, trademark, or 42 | attribution notices that are present in the Work. 43 | 44 | 3.2 Derivative Works. You may specify that additional or different 45 | terms apply to the use, reproduction, and distribution of your 46 | derivative works of the Work ("Your Terms") only if (a) Your Terms 47 | provide that the use limitation in Section 3.3 applies to your 48 | derivative works, and (b) you identify the specific derivative 49 | works that are subject to Your Terms. Notwithstanding Your Terms, 50 | this License (including the redistribution requirements in Section 51 | 3.1) will continue to apply to the Work itself. 52 | 53 | 3.3 Use Limitation. The Work and any derivative works thereof only 54 | may be used or intended for use non-commercially. Notwithstanding 55 | the foregoing, NVIDIA and its affiliates may use the Work and any 56 | derivative works commercially. As used herein, "non-commercially" 57 | means for research or evaluation purposes only. 58 | 59 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim 60 | against any Licensor (including any claim, cross-claim or 61 | counterclaim in a lawsuit) to enforce any patents that you allege 62 | are infringed by any Work, then your rights under this License from 63 | such Licensor (including the grant in Section 2.1) will terminate 64 | immediately. 65 | 66 | 3.5 Trademarks. This License does not grant any rights to use any 67 | Licensor's or its affiliates' names, logos, or trademarks, except 68 | as necessary to reproduce the notices described in this License. 69 | 70 | 3.6 Termination. If you violate any term of this License, then your 71 | rights under this License (including the grant in Section 2.1) will 72 | terminate immediately. 73 | 74 | 4. Disclaimer of Warranty. 75 | 76 | THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY 77 | KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF 78 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR 79 | NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER 80 | THIS LICENSE. 81 | 82 | 5. Limitation of Liability. 83 | 84 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL 85 | THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE 86 | SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, 87 | INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF 88 | OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK 89 | (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, 90 | LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER 91 | COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF 92 | THE POSSIBILITY OF SUCH DAMAGES. 
93 | 94 | ======================================================================= -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RoboSpatial: Teaching Spatial Understanding to 2D and 3D Vision-Language Models for Robotics 2 | 3 | [**🌐 Homepage**](https://chanh.ee/RoboSpatial/) | [**📖 arXiv**](https://arxiv.org/abs/2411.16537) | [**📂 Benchmark**](https://huggingface.co/datasets/chanhee-luke/RoboSpatial-Home) | [**📊 Evaluation**](https://github.com/chanhee-luke/RoboSpatial-Eval) 4 | 5 | **✨ CVPR 2025 (Oral) ✨** 6 | 7 | Authors: [Chan Hee Song](https://chanh.ee)1, [Valts Blukis](https://research.nvidia.com/person/valts-blukis)2, [Jonathan Tremblay](https://research.nvidia.com/person/jonathan-tremblay)2, [Stephen Tyree](https://research.nvidia.com/person/stephen-tyree)2, [Yu Su](https://ysu1989.github.io/)1, [Stan Birchfield](https://sbirchfield.github.io/)2 8 | 9 | 1 The Ohio State University 2 NVIDIA 10 | 11 | --- 12 | 13 | ## 🔔News 14 | 15 | - **🔥[2025-04-24]: Released the RoboSpatial data generation pipeline, RoboSpatial-Home dataset, and evaluation script!** 16 | 17 | --- 18 | 19 | **Project Components:** 20 | 21 | This repository contains the code for **generating** the spatial annotations used in the RoboSpatial dataset. 22 | 23 | * **Benchmark Dataset:** [**📂 RoboSpatial-Home**](https://huggingface.co/datasets/chanhee-luke/RoboSpatial-Home) 24 | * **Evaluation Script:** [**📊 RoboSpatial-Eval**](https://github.com/chanhee-luke/RoboSpatial-Eval) 25 | 26 | **Coming up!** 27 | 28 | - [ ] Unified data loader supporting [BOP datasets](https://bop.felk.cvut.cz/datasets/) and [GraspNet dataset](https://graspnet.net/). (Turn object pose estimation datasets into spatial QA!) 29 | - [ ] Support for additional scan datasets like [SCRREAM](https://sites.google.com/view/scrream/about). 30 | 31 | --- 32 | 33 | # RoboSpatial Annotation Generation 34 | 35 | This codebase generates rich spatial annotations for 3D scan datasets. While initially built using the [EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan) conventions, it is designed to be extensible to other data formats through custom data loaders (see [Data Loader Documentation](#data-loader-documentation)). It extracts various spatial relationships from image data and associated 3D information, including: 36 | 37 | * **Object Grounding:** Locating objects mentioned in text within the image. 38 | * **Spatial Context:** Identifying points in empty space relative to objects (e.g., "in front of the chair"). 39 | * **Spatial Configuration:** Describing the relative arrangement of multiple objects (e.g., "the chair is next to the table"). 40 | * **Spatial Compatibility:** Determining if an object *could* fit in a specific location. 41 | 42 | The generated annotations are saved in JSON format, one file per image. 43 | 44 | ## Prerequisites 45 | 46 | 1. **Python Environment:** Ensure you have a Python (3.8+) environment set up (e.g., using `conda` or `venv`). Required packages can be installed via `pip install -r requirements.txt`. 47 | 2. **Datasets:** You need access to the 3D scan datasets you intend to process. 48 | * **Note:** For specific instructions on downloading and setting up the **EmbodiedScan** dataset, please refer to the guide in [**`data/README.md`**](data/README.md). 49 | 3. 
**Configuration:** The main configuration file (e.g., `robospatial/configs/embodiedscan.yaml`) needs to be updated with paths relevant to your chosen data loader and dataset: 50 | * `data_loading.loader_class`: Specifies the Python class for your data loader (e.g., `data_loader.embodiedscan_loader.EmbodiedScanLoader`). 51 | * Dataset-specific paths (e.g., `image_root`, format-specific annotation files like `embodiedscan_ann`). Consult the configuration file and your data loader's requirements. See [Data Loader Documentation](#data-loader-documentation) for more details on adding custom formats. 52 | * `data_generation.output_dir`: The directory where the generated `.annotations.json` files will be saved. 53 | 54 | ## Running Annotation Generation 55 | 56 | The core script for generating annotations is `robospatial/run_generation.py`. 57 | 58 | **Running with Provided Example Data (Recommended First Step):** 59 | 60 | We provide a small example scene with input annotations and images in the `example_data/` directory. This allows you to test the generation pipeline without downloading large datasets. 61 | 62 | 1. **Navigate to the `robospatial` directory:** 63 | ```bash 64 | cd robospatial 65 | ``` 66 | 2. **Run the generation script:** 67 | ```bash 68 | python run_generation.py --config configs/example_dataset.yaml 69 | ``` 70 | This will process only the example scene defined in `example_dataset.yaml` and generate the annotations in the `example_data/example_qa` folder. 71 | 72 | **Running on Full Datasets:** 73 | 74 | Once you have confirmed the example works and have downloaded your target datasets: 75 | 76 | 1. **Configure your data loader:** Ensure the `data_loading` section in your chosen configuration file (e.g., `configs/example_dataset.yaml`) correctly points to your dataset paths and uses the appropriate `loader_class`. 77 | 2. **Run the script:** 78 | ```bash 79 | cd robospatial 80 | python run_generation.py --config configs/your_chosen_config.yaml 81 | ``` 82 | 83 | This command will process all scenes found by the data loader using the settings defined in `your_chosen_config.yaml`. 84 | 85 | **Command-Line Options:** 86 | 87 | * `--config <path_to_config>`: **(Required)** Specifies the path to the YAML configuration file. 88 | * `--scene <scene_name>`: Process only a single specific scene. 89 | ```bash 90 | python run_generation.py --config configs/embodiedscan.yaml --scene "scannet/scene0191_00" 91 | ``` 92 | * `--image <image_name>`: Process only a single specific image within the specified scene (requires `--scene`). Useful for debugging. 93 | ```bash 94 | python run_generation.py --config configs/embodiedscan.yaml --scene "scannet/scene0191_00" --image "00090.jpg" 95 | ``` 96 | * `--range <start_idx> <end_idx>`: Process a specific range of scenes based on their index in the loaded list (inclusive start, inclusive end). 97 | ```bash 98 | python run_generation.py --config configs/embodiedscan.yaml --range 0 10 # Process first 11 scenes 99 | ``` 100 | * `--num_workers <num>`: Specify the number of parallel worker threads to use for processing scenes. Overrides the `num_workers` setting in the config file. Defaults to `min(os.cpu_count(), 4)` if neither is provided. 101 | ```bash 102 | python run_generation.py --config configs/embodiedscan.yaml --num_workers 8 103 | ``` 104 | * `--dry-run`: Process only the first 5 images of each scene. Useful for quickly testing the pipeline. 
105 | ```bash 106 | python run_generation.py --config configs/embodiedscan.yaml --dry-run 107 | ``` 108 | 109 | ## Visualizing Input/Outputs 110 | 111 | Two scripts are provided in the `scripts/` directory for visualizing inputs/outputs: 112 | 113 | ### 1. Visualizing Input Data (`scripts/visualize_input.py`) 114 | 115 | Use this script to check if your input annotations (e.g., 3D bounding boxes from your dataset's original format, after conversion by your data loader) are being loaded and interpreted correctly. It reads the intermediate JSON format produced by the data loader for a single image and overlays the 3D bounding boxes onto the image. 116 | 117 | **Usage:** 118 | 119 | ```bash 120 | python scripts/visualize_input.py \ 121 | --image_path <path_to_image> \ 122 | --annotation_file <path_to_input_annotation_file> 123 | ``` 124 | 125 | * Replace `<path_to_image>` with the direct path to the image file. 126 | * Replace `<path_to_input_annotation_file>` with the path to the JSON file representing the *input* annotations for that image (this file's location and naming depend on your data loader implementation). 127 | 128 | **Example using the provided example data:** 129 | ```bash 130 | python scripts/visualize_input.py \ 131 | --image_path example_data/images/example_dataset/example_scene/example_image.jpg \ 132 | --annotation_file example_data/annotations/example_input.json 133 | ``` 134 | 135 | ### 2. Visualizing Generated Output (`scripts/visualize_output.py`) 136 | 137 | Use this script to debug and inspect the spatial relationships generated by `run_generation.py`. It reads the final `.annotations.json` file for a specific image and allows you to visualize different types of generated annotations, including object grounding and spatial relationships (context, configuration, compatibility). 138 | 139 | **Usage:** 140 | 141 | ```bash 142 | python scripts/visualize_output.py \ 143 | --image_path <path_to_image> \ 144 | --annotation_file <output_dir>/<dataset_name>/<scene_name>/<image_name>.annotations.json \ 145 | --object_3d_grounding \ 146 | --context 147 | ``` 148 | 149 | * Replace `<path_to_image>` with the direct path to the image file. 150 | * Replace `<output_dir>` with the path used in your configuration's `data_generation.output_dir`. 151 | * Adjust `<dataset_name>`, `<scene_name>`, and `<image_name>` to match the specific output file you want to visualize. 152 | * Include flags like `--object_2d_grounding`, `--object_3d_grounding`, `--context`, `--configuration`, or `--compatibility` to select what to visualize. Use the `--verbose` or `-v` flag for more detailed output. Refer to the script's internal documentation (`--help`) for detailed controls and options. 153 | 154 | **Example using the provided example data (run the generation first):** 155 | ```bash 156 | python scripts/visualize_output.py \ 157 | --image_path example_data/images/example_dataset/example_scene/example_image.jpg \ 158 | --annotation_file example_data/example_qa/example_scene/example_image.jpg.annotations.json \ 159 | --object_3d_grounding \ 160 | --context 161 | ``` 162 | 163 | ## Data Loader Documentation 164 | 165 | This project supports adding custom data loaders to handle different 3D dataset formats. The configuration file (`data_loading.loader_class`) specifies which loader to use. 
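
If you want to see how that dotted `loader_class` string becomes a loader instance, the sketch below shows one minimal way to resolve it with `importlib`. The function name `build_loader` and the choice to pass the full config dictionary to the loader are illustrative assumptions rather than the exact wiring in `run_generation.py`; it also assumes you run it from the `robospatial/` directory so the `data_loader` package is importable.

```python
# Minimal sketch (not the exact run_generation.py code): resolve the
# "module.ClassName" string from the config and instantiate the loader.
import importlib

import yaml


def build_loader(config_path):
    with open(config_path) as f:
        config = yaml.safe_load(f)

    loader_path = config["data_loading"]["loader_class"]  # e.g. "data_loader.example_loader.ExampleLoader"
    module_name, class_name = loader_path.rsplit(".", 1)
    loader_cls = getattr(importlib.import_module(module_name), class_name)

    # BaseLoader subclasses are initialized with the config (see data_loader/README.md);
    # whether they expect the full dict or only the data_loading section may vary.
    return loader_cls(config)


loader = build_loader("configs/example_dataset.yaml")
```
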
166 | 167 | For detailed instructions on the expected interface for a data loader and how to implement your own, please refer to the README within the data loader directory: [**`robospatial/data_loader/README.md`**](robospatial/data_loader/README.md) 168 | 169 | ## Project Structure 170 | 171 | For a detailed explanation of the annotation generation logic and hyperparameters within the `spatial_analysis` modules, please refer to the [**`robospatial/README.md`**](robospatial/README.md). 172 | 173 | * `robospatial/`: Main source code directory. 174 | * `configs/`: Contains YAML configuration files (e.g., `example_config.yaml`). 175 | * `data_loader/`: Contains modules for loading and interfacing with different 3D datasets. Includes examples like `embodiedscan_loader.py` and can be extended with custom loaders. See the [README](robospatial/data_loader/README.md) in this directory for details. 176 | * `spatial_analysis/`: Modules performing the core spatial reasoning and annotation generation logic. 177 | * `annotation_generator.py`: Orchestrates the generation process for a single scene by calling functions from `spatial_analysis`. 178 | * `run_generation.py`: Main script to run the annotation generation across datasets/scenes based on configuration. 179 | 180 | ## Output Files 181 | 182 | * **`<output_dir>/<dataset_name>/<scene_name>/<image_name>.annotations.json`**: The primary output. Contains the generated spatial annotations for a single image. 183 | * **`generation_progress.json`**: Stores a list of scenes that have been successfully processed. This allows the script to resume if interrupted. Located in the directory where `run_generation.py` is executed. 184 | * **`generation_stats.json`**: Contains aggregated statistics about the generated annotations (e.g., counts of each annotation type) overall and per-dataset. Located in the directory where `run_generation.py` is executed. 185 | 186 | ## Acknowledgements 187 | 188 | We thank the authors of [EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan/tree/main) for providing their unified annotations for various 3D scan datasets, which served as the foundation for this project's data loading capabilities. 189 | 190 | ## Contact 191 | - Luke Song: song.1855@osu.edu 192 | - NVIDIA internal: Valts Blukis (vblukis@nvidia.com), Jonathan Tremblay (jtremblay@nvidia.com) 193 | - Or GitHub Issues! 194 | 195 | ## Citation 196 | 197 | **BibTeX:** 198 | ```bibtex 199 | @inproceedings{song2025robospatial, 200 | author = {Song, Chan Hee and Blukis, Valts and Tremblay, Jonathan and Tyree, Stephen and Su, Yu and Birchfield, Stan}, 201 | title = {{RoboSpatial}: Teaching Spatial Understanding to {2D} and {3D} Vision-Language Models for Robotics}, 202 | booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 203 | year = {2025}, 204 | note = {Oral Presentation}, 205 | } 206 | ``` 207 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Processing EmbodiedScan Data 2 | 3 | To use the EmbodiedScan dataset with this project, you first need to download and process the raw data according to the instructions provided by the original EmbodiedScan authors. 4 | 5 | ## 1. 
Download and Preprocess Raw Data 6 | 7 | Follow the steps outlined in the official EmbodiedScan data preparation guide: 8 | [https://github.com/OpenRobotLab/EmbodiedScan/tree/main/data](https://github.com/OpenRobotLab/EmbodiedScan/tree/main/data) 9 | 10 | Specifically, you need to complete steps 1 through 5 and step 7: 11 | 1. Download ScanNet v2 data. 12 | 2. Download 3RScan data. 13 | 3. Download Matterport3D data. 14 | 4. Download ARKitScenes data. 15 | 5. Download EmbodiedScan annotations (`.pkl` files). 16 | 7. Extract images for ScanNet and 3RScan using the provided scripts (`generate_image_scannet.py` and `generate_image_3rscan.py`). 17 | 18 | **Note:** You do **not** need to perform step 6 (extracting occupancy annotations) for this project. 19 | 20 | Ensure your final data directory structure matches the one specified in the EmbodiedScan README. 21 | 22 | ## 2. Update Configuration File 23 | 24 | Once the data is downloaded and processed, you need to update the configuration file to point to the correct locations on your system. 25 | 26 | Edit the `robospatial/configs/embodiedscan.yaml` file. 27 | 28 | Update the following paths under the `data_loading` section: 29 | - `image_root`: Set this to the directory where the extracted images (e.g., `scannet/posed_images`, `3rscan//sequence`) are located. The specific structure might depend on how you organized the datasets downloaded in step 1. 30 | - `embodiedscan_ann`: Update the `train`, `val`, and `test` paths to point to the downloaded `.pkl` annotation files (from step 5). 31 | 32 | Example relevant section in `robospatial/configs/embodiedscan.yaml`: 33 | 34 | ```yaml 35 | data_loading: 36 | # ... other settings ... 37 | image_root: /path/to/your/processed/image/data # <- UPDATE THIS 38 | embodiedscan_ann: 39 | train: /path/to/your/EmbodiedScan/data/embodiedscan_infos_train.pkl # <- UPDATE THIS 40 | val: /path/to/your/EmbodiedScan/data/embodiedscan_infos_val.pkl # <- UPDATE THIS 41 | test: /path/to/your/EmbodiedScan/data/embodiedscan_infos_test.pkl # <- UPDATE THIS 42 | # ... other settings ... 43 | ``` 44 | 45 | After completing these steps, you should be able to load and use the EmbodiedScan dataset with the project. 
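
As an optional sanity check before running the full pipeline, the hedged snippet below (run from the repository root) verifies that the annotation paths you entered in `embodiedscan.yaml` exist and that the `.pkl` files can be unpickled. It makes no assumptions about the internal structure of the EmbodiedScan info files beyond them being pickle-loadable; the printed top-level keys depend on the EmbodiedScan release you downloaded.

```python
# Optional sanity check: confirm the annotation .pkl paths in
# robospatial/configs/embodiedscan.yaml exist and can be unpickled.
import pickle

import yaml

with open("robospatial/configs/embodiedscan.yaml") as f:
    cfg = yaml.safe_load(f)

for split, path in cfg["data_loading"]["embodiedscan_ann"].items():
    try:
        with open(path, "rb") as pkl_file:
            ann = pickle.load(pkl_file)
    except FileNotFoundError:
        print(f"{split}: MISSING {path} (update embodiedscan.yaml)")
        continue
    summary = sorted(ann.keys()) if isinstance(ann, dict) else type(ann).__name__
    print(f"{split}: OK {path} -> {summary}")
```
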
46 | -------------------------------------------------------------------------------- /example_data/annotations/example_input.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": "example_dataset", 3 | "scene_name": "example_scene", 4 | "image_name": "example_image.jpg", 5 | "image_size": [ 6 | 1280, 7 | 800 8 | ], 9 | "visible_instance_ids": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7 18 | ], 19 | "camera_annotations": { 20 | "extrinsic": [ 21 | [ 22 | 1.0, 23 | 0.0, 24 | 0.0, 25 | 0.0 26 | ], 27 | [ 28 | 0.0, 29 | -0.6427876096865394, 30 | 0.766044443118978, 31 | 0.0 32 | ], 33 | [ 34 | 0.0, 35 | -0.766044443118978, 36 | -0.6427876096865394, 37 | 0.0 38 | ], 39 | [ 40 | 0.0, 41 | 0.0, 42 | 0.0, 43 | 1.0 44 | ] 45 | ], 46 | "intrinsic": [ 47 | [ 48 | 669.9725341796875, 49 | 0.0, 50 | 640.0, 51 | 0.0 52 | ], 53 | [ 54 | 0.0, 55 | 669.9725341796875, 56 | 400.0, 57 | 0.0 58 | ], 59 | [ 60 | 0.0, 61 | 0.0, 62 | 1.0, 63 | 0.0 64 | ], 65 | [ 66 | 0.0, 67 | 0.0, 68 | 0.0, 69 | 1.0 70 | ] 71 | ], 72 | "axis_align_matrix": [ 73 | [ 74 | 1.0, 75 | 0.0, 76 | 0.0, 77 | 0.0 78 | ], 79 | [ 80 | 0.0, 81 | 1.0, 82 | 0.0, 83 | 0.0 84 | ], 85 | [ 86 | 0.0, 87 | 0.0, 88 | 1.0, 89 | 0.0 90 | ], 91 | [ 92 | 0.0, 93 | 0.0, 94 | 0.0, 95 | 1.0 96 | ] 97 | ] 98 | }, 99 | "objects": [ 100 | { 101 | "Name": "Brown potato.", 102 | "bbox_3d": [ 103 | [ 104 | 0.0965536352212597, 105 | 0.6457258245959925, 106 | -0.708612940810858, 107 | 0.12224379445579278, 108 | 0.14591369211531313, 109 | 0.07074560840853938, 110 | -1.11690709649976, 111 | 0.0, 112 | 0.0 113 | ] 114 | ] 115 | }, 116 | { 117 | "Name": "Yellow banana.", 118 | "bbox_3d": [ 119 | [ 120 | 0.6281575669935243, 121 | 0.8318574965411679, 122 | -0.7111925077538456, 123 | 0.11816268404379843, 124 | 0.20877171553860133, 125 | 0.06713331741369433, 126 | 3.499465406810771, 127 | 0.0, 128 | 0.0 129 | ] 130 | ] 131 | }, 132 | { 133 | "Name": "Light blue cylindrical cup.", 134 | "bbox_3d": [ 135 | [ 136 | 0.24272731261345826, 137 | 0.5509605710209379, 138 | -0.7126530522734699, 139 | 0.13114139619345888, 140 | 0.14536641809246212, 141 | 0.10336367309178618, 142 | 2.398133451820688, 143 | 0.0, 144 | 0.0 145 | ] 146 | ] 147 | }, 148 | { 149 | "Name": "Green and white bok choy.", 150 | "bbox_3d": [ 151 | [ 152 | -0.16947051050441583, 153 | 0.48852897068541257, 154 | -0.7355282463047628, 155 | 0.17038007531409313, 156 | 0.25531524115276, 157 | 0.07396988788683145, 158 | -1.0229929055844709, 159 | 0.0, 160 | 0.0 161 | ] 162 | ] 163 | }, 164 | { 165 | "Name": "White circular alarm clock.", 166 | "bbox_3d": [ 167 | [ 168 | -0.29906058568573957, 169 | 0.7773581812536778, 170 | -0.6563527562208662, 171 | 0.11525623601554463, 172 | 0.14794219955016966, 173 | 0.13351970865123064, 174 | -1.453594536933811, 175 | 0.0, 176 | 0.0 177 | ] 178 | ] 179 | }, 180 | { 181 | "Name": "Orange juice carton with orange and white colors, \"ORANGE JUICE\" text.", 182 | "bbox_3d": [ 183 | [ 184 | -0.11910513774912884, 185 | 0.6559001853803491, 186 | -0.6265974918806398, 187 | 0.16297827806663062, 188 | 0.15375061767211012, 189 | 0.2198072386901998, 190 | -1.3278610167121037, 191 | 0.0, 192 | 0.0 193 | ] 194 | ] 195 | }, 196 | { 197 | "Name": "Heart-shaped pink eraser.", 198 | "bbox_3d": [ 199 | [ 200 | -0.11139781936853714, 201 | 0.6552562452466136, 202 | -0.572456284198834, 203 | 0.05915619125851447, 204 | 0.11470179230647526, 205 | 0.08474071881173639, 206 | -1.1605898554992926, 207 | 0.0, 208 | 0.0 209 | ] 210 | ] 211 
| }, 212 | { 213 | "Name": "Wooden pallet crate", 214 | "bbox_3d": [ 215 | [ 216 | 0.5996147851768805, 217 | 0.9048717598769047, 218 | -0.5165727925513247, 219 | 0.3457252111932344, 220 | 0.3813317410243554, 221 | 0.32189707409624985, 222 | -1.538087589994403, 223 | 0.0, 224 | 0.0 225 | ] 226 | ] 227 | } 228 | ] 229 | } 230 | -------------------------------------------------------------------------------- /example_data/images/example_dataset/example_scene/example_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RoboSpatial/59b091d7694a724d3a46bb2b636d1bc49b899eb9/example_data/images/example_dataset/example_scene/example_image.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML 2 | numpy 3 | tqdm 4 | opencv-python 5 | matplotlib 6 | open3d 7 | -------------------------------------------------------------------------------- /robospatial/README.md: -------------------------------------------------------------------------------- 1 | # RoboSpatial Annotation Generation Logic 2 | 3 | For those who wants to hack the codebase! 4 | 5 | ## Annotation Generation Details 6 | 7 | This section provides a more detailed overview of the logic used to generate each type of spatial annotation and highlights the key configuration parameters found in `configs/example_config.yaml` (and other configuration files) that control this process. 8 | 9 | ### 1. Object Grounding (`spatial_analysis/grounding.py`) 10 | 11 | * **Purpose:** Generates a tight 2D axis-aligned bounding box (`clipped_bbox`) encompassing all visible pixels of an object in the image. 12 | * **Logic:** 13 | * Relies on a pre-calculated 2D boolean `occupancy_map` for each object, which indicates the precise pixels covered by the object's 3D model when projected onto the image. 14 | * It finds the minimum and maximum `x` (column) and `y` (row) coordinates within this occupancy map. 15 | * These bounds directly define the `[xmin, ymin, xmax, ymax]` coordinates of the 2D bounding box. 16 | * **Key Parameters:** None directly in the configuration for this step; it depends on the accuracy of the input 3D models and the camera parameters used to generate the `occupancy_map`. 17 | 18 | ### 2. Spatial Context (`spatial_analysis/context/context.py`) 19 | 20 | * **Purpose:** Samples points in the empty space surrounding a *reference object* and categorizes them based on their spatial relationship (infront, behind, left, right) in three different frames: `objectcentric`, `cameracentric`, and `worldcentric`. 21 | * **Logic:** 22 | 1. Calculates empty space on the floor using a top-down 2D grid based on environment geometry. 23 | 2. Identifies empty grid points within a specific distance (`threshold`) from the reference object's 2D footprint. 24 | 3. Projects these candidate 3D points (at the object's base height) onto the image, filtering those outside the view or behind the camera. 25 | 4. Checks if the projected 2D points are occluded by *other* objects using a pre-computed environment occupancy map. 26 | 5. Categorizes the non-occluded points based on their position relative to the reference object in the three frames (using object orientation for object-centric, and pixel/depth coordinates for camera/world-centric). 27 | 6. Randomly samples up to `num_samples` non-occluded points for each valid category (frame + direction). 
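
To make step 5 above concrete, the sketch below shows a stripped-down version of the camera-centric test only: a candidate point is labeled left/right from its projected pixel column and infront/behind from its camera-frame depth, both relative to the reference object's center. The function name, the assumption that `extrinsic` maps world to camera coordinates, and the sign conventions are illustrative; the actual implementation in `spatial_analysis/context/context.py` additionally handles the object-centric and world-centric frames, occlusion filtering, and sampling.

```python
# Simplified sketch of the camera-centric part of step 5. Assumes `extrinsic`
# is a 4x4 world-to-camera matrix and `intrinsic` is a 3x3 or 4x4 pinhole matrix.
import numpy as np


def camera_centric_direction(point_world, ref_center_world, extrinsic, intrinsic):
    extrinsic = np.asarray(extrinsic, dtype=float)
    K = np.asarray(intrinsic, dtype=float)[:3, :3]

    def project(p_world):
        p_cam = extrinsic @ np.append(np.asarray(p_world, dtype=float), 1.0)  # world -> camera
        uvw = K @ p_cam[:3]                                                   # camera -> image plane
        return uvw[0] / uvw[2], p_cam[2]                                      # pixel column u, depth

    u_pt, depth_pt = project(point_world)
    u_ref, depth_ref = project(ref_center_world)

    horizontal = "left" if u_pt < u_ref else "right"            # smaller column -> further left in the image
    frontal = "infront" if depth_pt < depth_ref else "behind"   # smaller depth -> closer to the camera
    return horizontal, frontal
```
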
28 | * **Key Parameters (`configs/example_config.yaml` -> `data_generation.generation_options`):** 29 | * `context_threshold`: Maximum distance (in world units, e.g., meters) from the reference object's footprint to consider sampling points. 30 | * `context_grid_resolution`: The size of each cell in the temporary top-down 2D grid used for finding nearby empty space. Smaller values are more precise but computationally more expensive. 31 | * `context_num_samples`: The maximum number of points to sample and output for each valid category (e.g., max 10 points for 'camera_centric' 'left'). 32 | 33 | ### 3. Spatial Configuration (`spatial_analysis/configuration/configuration.py`) 34 | 35 | * **Purpose:** Determines the pairwise spatial relationship between two objects (`obj1` relative to `obj2`) across the three reference frames (camera, world, object). Relationships include left/right, infront/behind, above/below, and overlapping. 36 | * **Logic:** 37 | 1. Calculates various geometric metrics for both objects (projected 2D bounds, average visible depth, world Z bounds, etc.) using their individual pre-computed `occupancy_map`s. 38 | 2. **Camera/World-centric:** Compares these metrics. The `strictness` parameter controls the comparison method: 39 | * `'strict'`: Uses the absolute min/max bounds. Requires clear separation; considers objects overlapping if their projected bounds intersect at all. Sensitive to partial occlusions. 40 | * `'lenient'`: Uses projected centers, average visible depths, and average Z coordinates. More robust to partial occlusion but might misclassify tightly packed objects. 41 | 3. **Object-centric:** Uses the Separating Axis Theorem (SAT) on the 3D OBBs to check for overlap. If not overlapping, it determines the direction based on the relative position of `obj1`'s center projected onto `obj2`'s local forward and right axes. Above/below still uses world Z coordinates. 42 | * **Key Parameters (`configs/example_config.yaml` -> `data_generation.generation_options`):** 43 | * `spatial_configuration_strictness`: (`'strict'` or `'lenient'`) Selects the comparison logic for camera-centric and world-centric frames. Default is `'lenient'`. 44 | * `pairwise_relationship_mode`: (`'unique_categories_only'` or `'all_visible_objects'`) Determines which pairs of objects are considered for configuration analysis. `'unique_categories_only'` only considers pairs where each object is the only instance of its category visible, while `'all_visible_objects'` considers all permutations of visible objects. 45 | 46 | ### 4. Spatial Compatibility (`spatial_analysis/compatibility/compatibility.py`) 47 | 48 | * **Purpose:** Assesses whether one object (`obj_a`) *could* be placed in the empty space relative to another (`obj_b`) without collision. It checks directions like left, right, in front, behind, and specifically `on_top`. 49 | * **Logic:** 50 | 1. Samples potential placement points around `obj_b` using the Spatial Context logic (`get_point_in_space_relative_to_object`), using a dynamic threshold based on the sizes of `obj_a` and `obj_b`. 51 | 2. For each sampled point, it simulates placing `obj_a` horizontally centered at that point's 2D location. 52 | 3. It checks for collisions between the placed `obj_a` (potentially with a `buffer_ratio`) and: 53 | * The static environment (using a 2D occupancy grid). 54 | * The reference object `obj_b` (maintaining a `min_distance`). 55 | 4. 
A relationship (e.g., 'left') is considered compatible (`True`) if *any* sampled point corresponding to that relationship allows `obj_a` to fit. 56 | 5. A separate, simpler check (`can_fit_on_top`) determines the 'on_top' relationship by comparing the horizontal dimensions of `obj_a` and `obj_b`, but only if `obj_a` is placeable and `obj_b` has a flat surface. 57 | * **Key Parameters (`configs/example_config.yaml` -> `data_generation.generation_options`):** 58 | * `compatibility_grid_resolution`: Resolution of the 2D grid used for collision checking against the environment. 59 | * `compatibility_num_samples`: How many potential placement points to sample around `obj_b`. 60 | * `compatibility_min_distance`: The minimum required distance (in world units) between the placed `obj_a` and the reference `obj_b`. 61 | * `compatibility_buffer_ratio`: A ratio applied to `obj_a`'s dimensions during collision checks, effectively adding a safety margin. 0 means no buffer, 0.1 means 10% buffer. 62 | * `context_threshold`: The *base* threshold used for sampling points (dynamically increased based on object sizes). 63 | 64 | --- 65 | 66 | ## Project Structure 67 | 68 | * `configs/`: Contains YAML configuration files (e.g., `example_config.yaml`). 69 | * `data_loader/`: Modules for loading and interfacing with different 3D datasets. Includes `embodiedscan_loader.py` and a [README](data_loader/README.md) explaining how to add custom loaders. 70 | * `spatial_analysis/`: Modules performing the core spatial reasoning and annotation generation logic. 71 | * `context/`: Logic for spatial context (points relative to an object). 72 | * `configuration/`: Logic for spatial configuration (relative position between objects). 73 | * `compatibility/`: Logic for spatial compatibility (fitting assessment). 74 | * `grounding.py`: Logic for 2D object grounding. 75 | * `relationships.py`: High-level wrappers for spatial analysis functions. 76 | * `relationship_utils.py`: Utility functions for geometry and projections. 77 | * `topdown_map.py`: Functions for creating 2D top-down occupancy grids. 78 | * `obj_properties.py`: Lists defining object properties (e.g., `items_with_face`). 79 | * `annotation_generator.py`: Orchestrates the generation process for a single scene. 80 | * `run_generation.py`: Main script to run annotation generation across datasets/scenes. 81 | 82 | ## Output Files 83 | 84 | * **`<output_dir>/<dataset_name>/<scene_name>/<image_name>.annotations.json`**: The primary output. Contains the generated spatial annotations for a single image, structured by type (grounding, unary relations, pairwise relations). 85 | * **`generation_progress.json`**: Stores a map of datasets to lists of scene names that have been successfully processed. Allows the script to resume if interrupted. Located in the directory where `run_generation.py` is executed. 86 | * **`generation_stats.json`**: Contains aggregated statistics about the generated annotations (e.g., counts of each annotation type) overall and per-dataset. Located in the directory where `run_generation.py` is executed. 87 | -------------------------------------------------------------------------------- /robospatial/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /robospatial/annotation_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | """Core annotation generation logic for a single scene. 11 | 12 | This module defines the `generate_and_save_annotations` function, which is responsible 13 | for processing all images within a given scene. It calculates various spatial 14 | relationships and grounding information based on object data (OBBs, categories) 15 | and camera parameters. 16 | 17 | It utilizes functions from the `spatial_analysis` package to compute: 18 | - Object Grounding: Bounding boxes in 2D. 19 | - Spatial Context: Points relative to an object (e.g., in front, behind). 20 | - Spatial Compatibility: Fit assessment (e.g., can A fit on B). 21 | - Spatial Configuration: Relative positioning (e.g., left/right, above/below). 22 | 23 | The generated annotations are saved as JSON files, one per processed image. 24 | This module is typically called by a higher-level script (e.g. `run_generation.py`) 25 | that handles dataset iteration and overall workflow management. 26 | """ 27 | 28 | 29 | import itertools 30 | import os 31 | import json 32 | import cv2 33 | from collections import defaultdict 34 | import numpy as np 35 | from tqdm import tqdm 36 | 37 | from spatial_analysis.grounding import get_object_grounding 38 | from spatial_analysis.relationships import get_spatial_configuration, get_spatial_compatibility, get_spatial_context 39 | from spatial_analysis.relationship_utils import calculate_occupied_pixels 40 | 41 | 42 | 43 | # --- Main Annotation Generation Function --- 44 | 45 | def generate_and_save_annotations(loader, dataset_name, scene_name, images_ann_dict, config, num_workers): 46 | """ 47 | Generates and saves annotations for a scene based on the configuration. 48 | Handles multiple annotation types: localization, compatibility, point_grounding, bbox_grounding. 
49 | """ 50 | 51 | # --- Statistics Initialization --- 52 | stats = defaultdict(int) 53 | stats['num_total_images'] = 0 54 | 55 | # --- Read Compatibility Check Configs --- 56 | comp_grid_res = config["data_generation"]["generation_options"]["compatibility_grid_resolution"] 57 | comp_min_distance = config["data_generation"]["generation_options"]["compatibility_min_distance"] 58 | comp_buffer_ratio = config["data_generation"]["generation_options"]["compatibility_buffer_ratio"] 59 | comp_num_samples = config["data_generation"]["generation_options"]["compatibility_num_samples"] 60 | 61 | # --- Read Spatial Context Configs --- 62 | context_threshold = config["data_generation"]["generation_options"]["context_threshold"] 63 | context_grid_res = config["data_generation"]["generation_options"]["context_grid_resolution"] 64 | context_num_samples = config["data_generation"]["generation_options"]["context_num_samples"] 65 | 66 | # --- Read Spatial Configuration Strictness --- 67 | spatial_config_strictness = config["data_generation"]["generation_options"]["spatial_configuration_strictness"] 68 | 69 | # --- Read Pairwise Relationship Mode --- 70 | pairwise_mode = config["data_generation"]["generation_options"]["pairwise_relationship_mode"] 71 | 72 | 73 | # --- Generate Annotations --- 74 | # Determine the iterator based on whether tqdm should be used 75 | image_iterator = images_ann_dict.items() 76 | if num_workers <= 1: 77 | # Wrap with tqdm only if single-threaded 78 | image_iterator = tqdm(image_iterator, desc=f"Processing images in {scene_name}", leave=False) 79 | 80 | for image_name, image_ann in image_iterator: 81 | 82 | # Initial setup 83 | relationships_to_generate = config["data_generation"]["generation_options"]["spatial_relationship_types"] 84 | extrinsic = image_ann['extrinsic'] 85 | intrinsic = image_ann['intrinsic'] 86 | 87 | # --- Image Setup --- 88 | image_path = os.path.join(config["data_loading"]["image_root"], image_ann["img_path"]) # Use path as identifier 89 | image_file = cv2.imread(image_path) 90 | 91 | if image_file is None: 92 | print(f"Warning: Could not read image {image_path}. Skipping.") 93 | continue 94 | h, w, _ = image_file.shape 95 | image_size = (w, h) 96 | 97 | # Process visible objects in the image 98 | vis_objs, unique_vis_categories, multi_vis_categories, floor_bound, all_objs = loader.list_objects(dataset_name, scene_name, image_ann) 99 | 100 | if len(all_objs) == 0: 101 | print(f"Warning: No objects detected in image {image_name}. Skipping image.") 102 | continue 103 | 104 | # Get all OBBs 105 | obbs = {obj["name"]: obj["obb"] for obj in all_objs.values() if "name" in obj and "obb" in obj} 106 | 107 | # --- Precompute Environment Occupancy Maps (Combined and Individual) --- 108 | # Pass list of object dictionaries to calculate_occupied_pixels 109 | objects_for_occupancy = [obj for obj in all_objs.values() if 'obb' in obj and 'name' in obj] 110 | if not objects_for_occupancy: 111 | print(f"Warning: No objects with OBB and name found for occupancy calculation in {image_name}. 
Skipping image.") 112 | continue # Or handle appropriately 113 | 114 | env_occupancy_map, individual_occupancy_maps = calculate_occupied_pixels( 115 | objects_for_occupancy, extrinsic, intrinsic, image_size 116 | ) 117 | 118 | # --- Annotation Generation --- 119 | spatial_relationships = { 120 | "unary_relations": [], 121 | "pairwise_relations": [] 122 | } 123 | # Initialize defaultdict to create a dictionary with empty lists for grounding keys 124 | object_grounding = [] 125 | generated_something_for_image = False 126 | 127 | # 1. Generate Grounding Annotations (per object type) 128 | if 'spatial_context' in relationships_to_generate or 'object_grounding' in relationships_to_generate: 129 | 130 | for obj_name, obj in vis_objs.items(): 131 | 132 | category = obj["category"] 133 | category = obj.get("category") 134 | obj_map = individual_occupancy_maps.get(obj_name) # Get precomputed map 135 | 136 | if category is None or obj_map is None: 137 | print(f"Warning: Skipping object {obj_name} due to missing category or precomputed occupancy map.") 138 | continue 139 | 140 | # No need to differentiate unique/multi here as we iterate through vis_objs directly 141 | 142 | #NOTE object grounding handles both single and multi instance objects 143 | if 'object_grounding' in relationships_to_generate: 144 | grounding_info = get_object_grounding(obj, obj_map) 145 | 146 | if grounding_info: 147 | results = { 148 | "name": obj_name, 149 | "category": category, 150 | "bbox": grounding_info["clipped_bbox"], 151 | "bbox_3d": obj["bbox_3d"], 152 | } 153 | object_grounding.append(results) 154 | generated_something_for_image = True 155 | stats['num_object_grounding_generated'] += 1 156 | 157 | #NOTE spatial context handles single instance objects 158 | if 'spatial_context' in relationships_to_generate and category in unique_vis_categories: 159 | # Filter obbs to exclude the current object 160 | context_obbs = [obb for name, obb in obbs.items() if name != obj_name] 161 | # Pass precomputed maps to get_spatial_context 162 | points_2d, points_3d, generated = get_spatial_context( 163 | obj, extrinsic, intrinsic, floor_bound, context_obbs, image_size, image_path, 164 | individual_occupancy_maps=individual_occupancy_maps, # Pass individual maps 165 | env_occupancy_map=env_occupancy_map, # Pass combined env map 166 | threshold=context_threshold, # Pass configured threshold 167 | grid_resolution=context_grid_res, # Pass configured grid resolution 168 | num_samples=context_num_samples, # Pass configured num samples 169 | ) 170 | if generated: 171 | results = { 172 | "name": obj_name, 173 | "category": category, 174 | "point_space_2d": points_2d, 175 | "point_space_3d": points_3d, 176 | } 177 | spatial_relationships["unary_relations"].append(results) 178 | generated_something_for_image = True 179 | stats['num_spatial_context_generated'] += 1 180 | 181 | # 2. 
Generate Relationship Annotations (per object pair) 182 | generate_pairwise = 'spatial_configuration' in relationships_to_generate or 'spatial_compatibility' in relationships_to_generate 183 | objects_available_for_pairwise = (pairwise_mode == 'unique_categories_only' and len(unique_vis_categories) >= 2) or \ 184 | (pairwise_mode == 'all_visible_objects' and len(vis_objs) >= 2) 185 | 186 | 187 | if generate_pairwise and objects_available_for_pairwise: 188 | # Determine the iterator based on the mode 189 | if pairwise_mode == 'unique_categories_only': 190 | iterator = itertools.permutations(unique_vis_categories, 2) 191 | get_obj = lambda cat: vis_objs[cat] # Function to get object by category 192 | # Use unique_vis_categories which are typically names/keys in vis_objs for unique items 193 | iterator = itertools.permutations(unique_vis_categories, 2) 194 | # Need to handle potential KeyError if a category name isn't directly a key in vis_objs 195 | # Assuming unique_vis_categories contains keys that *are* in vis_objs 196 | get_obj = lambda cat_key: vis_objs.get(cat_key) 197 | else: # pairwise_mode == 'all_visible_objects' 198 | iterator = itertools.permutations(vis_objs.keys(), 2) 199 | get_obj = lambda key: vis_objs[key] # Function to get object by key 200 | get_obj = lambda key: vis_objs.get(key) 201 | 202 | 203 | for item1_key, item2_key in iterator: 204 | obj1 = get_obj(item1_key) 205 | obj2 = get_obj(item2_key) 206 | 207 | # Skip if objects couldn't be retrieved (e.g., bad key from unique_vis_categories) 208 | if obj1 is None or obj2 is None: 209 | print(f"Warning: Could not retrieve objects for pair ({item1_key}, {item2_key}). Skipping.") 210 | continue 211 | 212 | # Get object names 213 | obj1_name = obj1["name"] 214 | obj2_name = obj2["name"] 215 | # Get object names and categories safely 216 | obj1_name = obj1.get("name") 217 | obj2_name = obj2.get("name") 218 | obj1_cat = obj1.get("category") 219 | obj2_cat = obj2.get("category") 220 | 221 | if not all([obj1_name, obj2_name, obj1_cat, obj2_cat]): 222 | print(f"Warning: Missing name or category for objects in pair ({item1_key}, {item2_key}). 
Skipping.") 223 | continue 224 | 225 | pair_result = { 226 | "pair": (obj1_name, obj2_name), 227 | "pair_category": (obj1["category"], obj2["category"]), 228 | "pair_category": (obj1_cat, obj2_cat), 229 | } 230 | 231 | 232 | if 'spatial_configuration' in relationships_to_generate: 233 | # Pass individual maps to get_spatial_configuration 234 | config_rels = get_spatial_configuration( 235 | obj1, obj2, extrinsic, intrinsic, image_size, individual_occupancy_maps,spatial_config_strictness) 236 | pair_result["spatial_configuration"] = config_rels 237 | generated_something_for_image = True 238 | stats['num_spatial_configuration_pairs'] += 1 239 | 240 | if 'spatial_compatibility' in relationships_to_generate: 241 | # Filter obbs to exclude obj1 242 | compatibility_obbs = [obb for name, obb in obbs.items() if name != obj1_name] 243 | # Pass individual and combined maps to get_spatial_compatibility 244 | comp_rels = get_spatial_compatibility( 245 | obj1, obj2, extrinsic, intrinsic, floor_bound, compatibility_obbs, image_size, image_path, 246 | individual_occupancy_maps=individual_occupancy_maps, # Pass individual maps 247 | env_occupancy_map=env_occupancy_map, # Pass combined env map 248 | grid_resolution=comp_grid_res, # Pass configured grid resolution 249 | num_samples=comp_num_samples, # Pass configured num samples 250 | min_distance=comp_min_distance, # Pass configured min distance 251 | buffer_ratio=comp_buffer_ratio # Pass configured buffer ratio 252 | ) 253 | pair_result["spatial_compatibility"] = comp_rels 254 | generated_something_for_image = True 255 | stats['num_spatial_compatibility_pairs'] += 1 256 | 257 | if len(pair_result) > 2: # Check if more than just pair info was added 258 | spatial_relationships["pairwise_relations"].append(pair_result) 259 | 260 | # --- Save Results --- 261 | if generated_something_for_image: 262 | stats['num_total_images'] += 1 263 | 264 | image_results = { 265 | "dataset": dataset_name, 266 | "scene_name": scene_name, 267 | "image_identifier": image_name, 268 | "image_path": image_path, 269 | "image_size": image_size, 270 | "depth_path": image_ann.get("depth_path", ""), 271 | "visible_instance_ids": image_ann.get('visible_instance_ids', []), 272 | } 273 | 274 | cam_ann = {} 275 | for key in ['extrinsic', 'intrinsic']: 276 | if key in image_ann and image_ann[key] is not None: 277 | cam_ann[key] = image_ann[key].tolist() 278 | image_results["camera_annotations"] = cam_ann 279 | 280 | if object_grounding: 281 | image_results["object_grounding"] = object_grounding 282 | if spatial_relationships: 283 | image_results["spatial_relationships"] = spatial_relationships 284 | 285 | folder_path = os.path.join(config["data_generation"]["output_dir"], scene_name) 286 | os.makedirs(folder_path, exist_ok=True) 287 | 288 | output_suffix = config.get("output_suffix", ".annotations.json") 289 | file_name = f"{image_ann['image_basename']}{output_suffix}" 290 | file_path = os.path.join(folder_path, file_name) 291 | 292 | with open(file_path, 'w') as json_file: 293 | json.dump(image_results, json_file, indent=4) 294 | 295 | # --- Return Scene Statistics --- 296 | scene_ann_stats = { 297 | 'dataset_name': dataset_name, 298 | 'scene_name': scene_name, 299 | 'num_processed_images': stats['num_total_images'], 300 | 'num_spatial_configuration_pairs': stats['num_spatial_configuration_pairs'], 301 | 'num_spatial_compatibility_pairs': stats['num_spatial_compatibility_pairs'], 302 | 'num_object_grounding_generated': stats['num_object_grounding_generated'], 303 | 
'num_spatial_context_generated': stats['num_spatial_context_generated'], 304 | } 305 | 306 | return scene_ann_stats -------------------------------------------------------------------------------- /robospatial/configs/embodiedscan.yaml: -------------------------------------------------------------------------------- 1 | data_generation: 2 | # Number of parallel worker threads for processing scenes. 3 | # Defaults to min(os.cpu_count(), 4) if not specified or overridden by CLI --num_workers. 4 | num_workers: 8 5 | progress_file: generation_progress.json 6 | stats_file: generation_stats.json 7 | output_suffix: .annotations.json 8 | 9 | generation_options: 10 | spatial_relationship_types: 11 | - object_grounding 12 | - spatial_context 13 | - spatial_configuration 14 | - spatial_compatibility 15 | 16 | # Settings for spatial context point generation 17 | context_threshold: 0.5 # Distance threshold from reference object 18 | context_grid_resolution: 0.1 # Grid resolution for empty space check during context generation 19 | context_num_samples: 50 # Number of points to sample for spatial context 20 | 21 | # Settings for spatial compatibility checks 22 | compatibility_grid_resolution: 0.1 # Grid resolution for can_fit checks 23 | compatibility_min_distance: 0.2 # Minimum distance between objects in can_fit checks 24 | compatibility_buffer_ratio: 0.5 # Buffer ratio for can_fit checks 25 | compatibility_num_samples: 50 # Number of samples for point space in can_fit checks 26 | 27 | # Strictness level for spatial configuration checks ('strict' or 'lenient') 28 | spatial_configuration_strictness: 'strict' 29 | 30 | # Mode for calculating pairwise relationships ('unique_categories_only' or 'all_visible_objects') 31 | pairwise_relationship_mode: 'unique_categories_only' # Default to current behavior 32 | 33 | output_dir: /path/to/output/dir/ 34 | 35 | # Data Loader Settings 36 | data_loading: 37 | verbose: True 38 | datasets: 39 | - scannet 40 | - matterport3d 41 | - 3rscan 42 | - arkitscenes 43 | loader_class: data_loader.embodiedscan_loader.EmbodiedScanLoader 44 | annotation_key: embodiedscan_ann 45 | split: # Which splits to use, for EmbodiedScan we use train and val 46 | - train 47 | - val 48 | 49 | image_root: /path/to/your/processed/image/data 50 | embodiedscan_ann: 51 | train: /path/to/embodiedscan_infos_train.pkl 52 | val: /path/to/embodiedscan_infos_val.pkl 53 | test: /path/to/embodiedscan_infos_test.pkl 54 | -------------------------------------------------------------------------------- /robospatial/configs/example_config.yaml: -------------------------------------------------------------------------------- 1 | # Example configuration with minimal required fields for using a custom loader 2 | 3 | # Top-level structure often separates different concerns like loading and generation 4 | 5 | data_loading: 6 | # --- Required Fields --- 7 | 8 | # Specifies the Python class for your custom data loader. (Required) 9 | # Format: "module_name.ClassName" 10 | loader_class: "data_loader.example_loader.ExampleLoader" 11 | 12 | # List of dataset identifiers that this configuration applies to. (Required) 13 | # These identifiers must match keys under the data_loading section 14 | # where dataset-specific parameters are defined. 15 | datasets: 16 | - example_dataset 17 | 18 | # --- Dataset Specific Configuration --- 19 | # Parameters required by the specific loader ('ExampleLoader' in this case). 20 | # These fields are essential *for this loader*. 
21 | annotation_dir: "/path/to/your/dataset/annotations/" 22 | image_root: "/path/to/your/dataset/images/" 23 | 24 | # Specifies which data split(s) to process (e.g., train, validation, test). 25 | # If omitted, the loader might default to a specific split or load all. 26 | # split: 27 | # - "train" 28 | # - "val" 29 | 30 | # Enable verbose logging output during data loading. (Optional, defaults may vary) 31 | # verbose: True 32 | 33 | # Add any *other* parameters required only by your specific CustomDatasetLoader here. 34 | 35 | 36 | data_generation: 37 | # Base directory where all generated QA data will be saved. 38 | output_dir: "/path/to/your/output/qa_data" 39 | 40 | # Number of parallel worker threads for processing scenes. 41 | # If omitted or null, defaults to os.cpu_count() (or 4 if count is unavailable). 42 | # Can be overridden by the --num_workers command-line argument. 43 | num_workers: 1 # Example: Set to 1 to disable parallel processing, or e.g., 4 to use 4 threads. 44 | 45 | # Suffix to append to output annotation filenames. 46 | output_suffix: ".annotations.json" 47 | 48 | # Path to the file used for saving and resuming generation progress. 49 | progress_file: "generation_progress_example.json" 50 | 51 | # Path to the file where generation statistics will be saved. 52 | stats_file: "generation_stats_example.json" 53 | 54 | # --- QA Generation Options --- 55 | # These options control the types and specifics of the Question-Answering pairs generated. 56 | generation_options: 57 | # List of spatial relationship/QA types to generate. 58 | # Common types might include: 59 | # - object_grounding: Questions about the location/existence of specific objects. 60 | # - spatial_context: Questions about objects relative to empty space or general areas. 61 | # - spatial_configuration: Questions about the arrangement of multiple objects. 62 | # - spatial_compatibility: Questions about whether objects *could* fit somewhere. 63 | spatial_relationship_types: 64 | - "object_grounding" 65 | - "spatial_context" 66 | - "spatial_configuration" 67 | - "spatial_compatibility" 68 | # Add any custom QA types your system supports 69 | 70 | # --- Settings for Specific QA Types --- 71 | 72 | # Threshold distance (in meters) from a reference object when generating 73 | # points for "spatial_context" questions. Points further than this are ignored. 74 | context_threshold: 0.5 75 | 76 | # Grid resolution (in meters) used for checking empty space when sampling 77 | # points for "spatial_context" questions. 78 | context_grid_resolution: 0.1 79 | 80 | # Number of points to sample around reference objects for "spatial_context" questions. 81 | context_num_samples: 50 82 | 83 | # Grid resolution (in meters) used for collision checking (e.g., "can this object fit here?") 84 | # in "spatial_compatibility" questions. 85 | compatibility_grid_resolution: 0.1 86 | 87 | # Minimum distance (in meters) between objects when checking for "spatial_compatibility" questions. 88 | compatibility_min_distance: 0 89 | 90 | # Buffer ratio for "spatial_compatibility" checks. 91 | # This is the ratio of how much of the buffer zone can be occupied by other objects. 92 | # For example, if the buffer ratio is 0.6, then the buffer zone can be occupied by other objects 93 | # up to 60% of the time. 94 | compatibility_buffer_ratio: 0.6 95 | 96 | # Number of points to sample on object surfaces or within volumes for 97 | # collision/fitting checks in "spatial_compatibility" questions. 
98 | compatibility_num_samples: 50 99 | 100 | # Strictness level for "spatial_configuration" checks. 101 | # 'strict': Requires precise matching of object arrangements. 102 | # 'lenient': Allows for some tolerance in positions/orientations. 103 | spatial_configuration_strictness: 'lenient' 104 | 105 | # Mode for calculating pairwise relationships between objects (used in several QA types). 106 | # 'unique_categories_only': Considers relationships only between objects of different categories. 107 | # 'all_visible_objects': Considers relationships between all pairs of visible objects. 108 | pairwise_relationship_mode: 'unique_categories_only' -------------------------------------------------------------------------------- /robospatial/configs/example_dataset.yaml: -------------------------------------------------------------------------------- 1 | data_generation: 2 | num_workers: 1 3 | progress_file: generation_progress_example.json # Use a different progress file 4 | stats_file: generation_stats_example.json # Use a different stats file 5 | output_suffix: .annotations.json 6 | 7 | generation_options: 8 | spatial_relationship_types: 9 | - object_grounding 10 | - spatial_context 11 | - spatial_configuration 12 | - spatial_compatibility 13 | 14 | # Settings for spatial context point generation 15 | context_threshold: 0.5 16 | context_grid_resolution: 0.1 17 | context_num_samples: 50 18 | 19 | # Settings for spatial compatibility checks 20 | compatibility_grid_resolution: 0.1 21 | compatibility_min_distance: 0 22 | compatibility_buffer_ratio: 0.6 23 | compatibility_num_samples: 50 24 | 25 | # Strictness level for spatial configuration checks 26 | spatial_configuration_strictness: lenient 27 | 28 | # Mode for calculating pairwise relationships 29 | pairwise_relationship_mode: unique_categories_only 30 | 31 | # Adjust output directory as needed 32 | output_dir: ../example_data/example_qa 33 | 34 | # Data Loader Settings for Example Dataset JSON Annotations 35 | data_loading: 36 | verbose: True 37 | # Define the dataset name(s) you want to load. 38 | # The loader will look for this key in the 'dataset' field within the JSON files. 39 | datasets: 40 | - example_dataset 41 | 42 | # Specify the modified ExampleLoader class 43 | loader_class: data_loader.example_loader.ExampleLoader 44 | 45 | # Specify the directory containing the JSON annotation files 46 | annotation_dir: ../example_data/annotations/ 47 | 48 | # Specify the directory containing the images 49 | image_root: ../example_data/images/ 50 | -------------------------------------------------------------------------------- /robospatial/data_loader/README.md: -------------------------------------------------------------------------------- 1 | # Implementing a Custom Data Loader 2 | 3 | This document outlines the steps and requirements for implementing a custom data loader compatible with the RoboSpatial annotation generation pipeline. 4 | 5 | ## Overview 6 | 7 | The data loader is responsible for interfacing with your specific dataset format and providing the necessary information (scenes, images, object instances, metadata) to the generation pipeline. To ensure compatibility, your custom loader must inherit from the `BaseLoader` abstract base class (`robospatial.data_loader.base_loader.BaseLoader`) and implement its required methods. 8 | 9 | ## BaseLoader Interface 10 | 11 | Your custom loader class must implement the following methods: 12 | 13 | ### `__init__(self, config)` 14 | 15 | * **Purpose:** Initializes the data loader. 
This typically involves loading annotations, setting up paths, and potentially pre-processing metadata.
16 | * **Args:**
17 | * `config (dict)`: A dictionary containing the `data_loading` section from the configuration file (e.g., `configs/embodiedscan.yaml`). This allows access to dataset paths, annotation file locations, selected datasets, splits, and other relevant parameters.
18 | * **Implementation Notes:**
19 | * Use the `config` dictionary to locate and load your dataset's annotation files.
20 | * Store necessary metadata, such as class labels and mappings, as instance variables.
21 | * Organize the loaded data in a way that facilitates efficient retrieval by the other methods (e.g., nested dictionaries keyed by dataset and scene name, as seen in `EmbodiedScanLoader`).
22 | 
23 | ### `list_scenes(self, dataset_list)`
24 | 
25 | * **Purpose:** Provides a generator that yields information about each scene within the specified datasets.
26 | * **Args:**
27 | * `dataset_list (list)`: A list of dataset names (strings) requested by the pipeline (e.g., `['scannet', '3rscan']`).
28 | * **Returns:**
29 | * `generator`: Yields tuples of `(dataset_name, scene_idx, scene_name)`.
30 | * `dataset_name (str)`: The name of the dataset the scene belongs to.
31 | * `scene_idx (int)`: A unique index for the scene within its dataset (can be a simple counter).
32 | * `scene_name (str)`: A unique identifier for the scene, often including the dataset prefix (e.g., `'scannet/scene0000_00'`). This name is used in subsequent calls.
33 | 
34 | ### `list_images(self, dataset_name, scene_name)`
35 | 
36 | * **Purpose:** Lists all images (or viewpoints) associated with a specific scene.
37 | * **Args:**
38 | * `dataset_name (str)`: The name of the dataset.
39 | * `scene_name (str)`: The unique identifier of the scene (obtained from `list_scenes`).
40 | * **Returns:**
41 | * `dict`: A dictionary where keys are unique image identifiers (e.g., `'<scene_name>/<image_basename>'`) and values are dictionaries containing image-specific annotations. Each image annotation dictionary **must** include:
42 | * `extrinsic` (or equivalent): 4x4 Transformation matrix (e.g., NumPy array or list of lists) from camera coordinates to the global/world coordinate system of the scene.
43 | ```python
44 | # Example:
45 | [[ -0.9897, 0.1085, 0.0927, 1.2120],
46 | [ -0.0330, 0.4577, -0.8884, 0.3075],
47 | [ -0.1388, -0.8824, -0.4494, 1.4804],
48 | [ 0. , 0. , 0. , 1. ]]
49 | ```
50 | * `intrinsic`: 4x4 Camera intrinsics matrix (e.g., NumPy array or list of lists).
51 | ```python
52 | # Example:
53 | [[ 1170.18, 0. , 647.75, 0. ],
54 | [ 0. , 1170.18, 483.75, 0. ],
55 | [ 0. , 0. , 1. , 0. ],
56 | [ 0. , 0. , 0. , 1. ]]
57 | ```
58 | * `img_path`: Absolute or relative path to the image file. `img_path` is joined with the `image_root` path from the config file.
59 | * Any other metadata required by `list_objects` (e.g., `visible_instance_ids` in `EmbodiedScanLoader`).
60 | 
61 | ### `list_objects(self, dataset_name, scene_name, image_ann)`
62 | 
63 | * **Purpose:** Identifies and processes object instances visible from a specific viewpoint (image). It organizes objects based on visibility and category, handles duplicate categories, and calculates scene bounds.
64 | * **Args:**
65 | * `dataset_name (str)`: The name of the dataset.
66 | * `scene_name (str)`: The unique identifier of the scene.
67 | * `image_ann (dict)`: The annotation dictionary for a single image, obtained from the output of `list_images`.
68 | * **Returns:** 69 | * `tuple`: A 5-element tuple containing: 70 | 1. `vis_objs (dict)`: Dictionary of *visible*, *non-environmental* objects. 71 | * Keys: Object category name. If multiple instances of the same category are visible, append an index (e.g., `'chair_0'`, `'chair_1'`). Environmental objects like 'wall', 'floor', 'ceiling', and generic 'object' categories should be excluded. 72 | * Values: Instance annotation dictionaries. Each dictionary should contain at least: 73 | * `category (str)`: The original object category label. 74 | * `name (str)`: The potentially indexed name used as the key in `vis_objs`. 75 | * `bbox_3d` (or equivalent, optional but recommended): The original 3D bounding box representation from your dataset (e.g., 9 DoF parameters: center, size, orientation). While the pipeline primarily uses the `obb` for calculations, this original `bbox_3d` is saved in the final annotations if provided. 76 | * `obb`: The Open3D `OrientedBoundingBox` representation (`open3d.geometry.OrientedBoundingBox`). **This is crucial for spatial relationship calculations.** Your `list_objects` implementation is responsible for creating this, often by converting from `bbox_3d` (see `EmbodiedScanLoader` line ~241 for an example using `_9dof_to_box`) or by generating it directly if your dataset provides OBBs. 77 | 2. `unique_vis_categories (set)`: A set of category names (strings) for objects that appear *exactly once* in the `vis_objs` dictionary (excluding environmental/generic categories). 78 | 3. `multi_vis_categories (set)`: A set of category names (strings) for objects that appear *multiple times* in the `vis_objs` dictionary (excluding environmental/generic categories). 79 | 4. `floor_bound (list)`: A list containing two `numpy.ndarray`s representing the minimum and maximum coordinates `[min_bound, max_bound]` that encompass the floor and all non-environmental objects. This is often derived from the combined OBBs of relevant objects. 80 | 5. `all_objs (dict)`: Dictionary of *all* non-environmental objects associated with the *scene* (not just the current view), keyed by their potentially indexed name (e.g., 'chair_0'). 81 | Used for occupancy map generation or other downstream tasks. The structure mirrors `vis_objs` but includes objects not necessarily visible in the current `image_ann`. 82 | Each object dictionary must contain at least `category`, `name`, and `obb`. Including `bbox_3d` is recommended if available. 83 | *Note: Depending on your dataset structure, you might populate this similarly to `vis_objs` based on `visible_instance_ids` or load all scene objects separately.* 84 | 85 | ## Configuration 86 | 87 | To use your custom data loader, update the `data_loading` section in your configuration file (e.g., `configs/example_config.yaml`): 88 | 89 | ```yaml 90 | data_loading: 91 | # ... other settings ... 92 | loader_class: path.to.your.module.YourCustomLoaderClassName # Update this line 93 | # Provide any custom keys your loader's __init__ needs 94 | your_custom_annotation_path: 95 | train: /path/to/your/train_annotations.pkl 96 | val: /path/to/your/val_annotations.pkl 97 | # ... other dataset-specific paths or parameters ... 98 | ``` 99 | 100 | * Set `loader_class` to the fully qualified Python path of your custom loader class. 101 | * Ensure any necessary configuration parameters (like annotation file paths) needed by your loader's `__init__` method are present in the `data_loading` section. 
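
For orientation, the sketch below shows the overall shape of a loader that satisfies this interface. It is a minimal, hypothetical skeleton, not part of the pipeline: the assumed on-disk layout (one JSON file per image under `annotation_dir`), the field names (`camera_annotations`, `objects`, `center`, `size`), the class name `MyDatasetLoader`, and the axis-aligned OBB construction are placeholder assumptions you would replace with your dataset's actual format (e.g., by converting a 9DoF `bbox_3d` with `_9dof_to_box`).

```python
import glob
import json
import os
from collections import defaultdict

import numpy as np
import open3d as o3d

from data_loader.base_loader import BaseLoader


class MyDatasetLoader(BaseLoader):
    """Hypothetical loader: one JSON annotation file per image."""

    def __init__(self, config):
        self.verbose = config.get("verbose", False)
        # {dataset_name: {scene_name: {image_id: raw_annotation_dict}}}
        self.data = defaultdict(lambda: defaultdict(dict))
        pattern = os.path.join(config["annotation_dir"], "**", "*.json")
        for path in glob.glob(pattern, recursive=True):
            with open(path, "r") as f:
                ann = json.load(f)
            self.data[ann["dataset"]][ann["scene_name"]][ann["image_name"]] = ann

    def list_scenes(self, dataset_list):
        for dataset_name in dataset_list:
            for scene_idx, scene_name in enumerate(self.data.get(dataset_name, {})):
                yield dataset_name, scene_idx, scene_name

    def list_images(self, dataset_name, scene_name):
        images = {}
        for image_id, ann in self.data[dataset_name][scene_name].items():
            images[image_id] = {
                "image_basename": os.path.basename(image_id),
                "img_path": os.path.join(scene_name, image_id),  # joined with image_root by the pipeline
                "extrinsic": np.array(ann["camera_annotations"]["extrinsic"]),  # camera -> world (4x4)
                "intrinsic": np.array(ann["camera_annotations"]["intrinsic"]),  # camera -> image (4x4)
                "objects": ann["objects"],  # carried along so list_objects can use it
            }
        return images

    def list_objects(self, dataset_name, scene_name, image_ann):
        vis_objs, all_obbs = {}, []
        counts = defaultdict(int)
        for obj in image_ann["objects"]:
            category = obj["category"]
            if category in ("wall", "floor", "ceiling", "object"):
                continue  # environmental / generic objects are excluded from vis_objs
            # Placeholder OBB: axis-aligned box built from center + size; replace with
            # your dataset's real orientation handling (e.g., _9dof_to_box for 9DoF boxes).
            obb = o3d.geometry.OrientedBoundingBox(
                np.array(obj["center"]), np.eye(3), np.array(obj["size"]))
            name = f"{category}_{counts[category]}"
            counts[category] += 1
            vis_objs[name] = {"category": category, "name": name, "obb": obb}
            all_obbs.append(obb)

        unique_vis_categories = {c for c, n in counts.items() if n == 1}
        multi_vis_categories = {c for c, n in counts.items() if n > 1}
        # Singleton categories are keyed by their bare category name, as the pipeline expects.
        for c in unique_vis_categories:
            inst = vis_objs.pop(f"{c}_0")
            inst["name"] = c
            vis_objs[c] = inst

        floor_bound = None
        if all_obbs:
            pts = np.vstack([np.asarray(b.get_box_points()) for b in all_obbs])
            floor_bound = [pts.min(axis=0), pts.max(axis=0)]

        # Annotations here are per-image, so all_objs simply mirrors vis_objs.
        return vis_objs, unique_vis_categories, multi_vis_categories, floor_bound, dict(vis_objs)
```

Note how duplicate categories receive `_0`, `_1`, ... suffixes while singletons keep their bare category name, matching the `vis_objs` contract described above.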
102 | 103 | ## Example 104 | 105 | Refer to `data_loader.embodiedscan_loader.EmbodiedScanLoader` for a concrete implementation example using datasets like ScanNet, Matterport3D, and 3RScan. 106 | 107 | Additionally, refer to `data_loader.example_loader.py` for a simpler implementation tailored specifically to the JSON annotation format found in the `example_data/` directory. This loader demonstrates how to handle the example annotations provided for testing the pipeline. 108 | 109 | ## Visualizing Your Loader Output 110 | 111 | To verify that your custom data loader is producing the correct outputs (specifically the object instances with their 3D bounding boxes and camera parameters), you can use the provided visualization script: `scripts/visualize_input.py`. 112 | 113 | **Purpose:** 114 | 115 | This script takes an image file and a corresponding intermediate annotation JSON file (similar to those in `example_data/annotations/`, representing the data your loader would prepare for a single image) as input. It reads the camera parameters (`extrinsic`, `intrinsic`) and the object information (specifically `bbox_3d`) from the JSON. It then projects the 3D bounding boxes onto the 2D image and displays the result. 116 | 117 | This helps you visually confirm: 118 | 119 | * Camera parameters (`extrinsic`, `intrinsic`) are correct. 120 | * Oriented object bounding boxes (derived from `bbox_3d`) align with the objects in the image. 121 | * The data format your loader prepares is being interpreted correctly before passing it to the main pipeline. 122 | 123 | **Important Note:** 124 | 125 | The provided visualization script, `scripts/visualize_input.py`, is designed to help debug your custom loader's output *before* running the full generation pipeline. It reads an intermediate JSON file (like those in `example_data/annotations/`) which represents the data your loader passes for a single image. 126 | 127 | Currently, this script expects the JSON to contain an `objects` array. For each object in this array, it specifically looks for a `bbox_3d` field containing a list with 9 DoF parameters (center, size, rotation) as its first element. It uses these parameters to generate an Open3D `OrientedBoundingBox` (`obb`) via the `_9dof_to_box` function for visualization. 128 | 129 | * **If your custom loader generates an intermediate JSON where the 3D bounding box information is stored differently (e.g., different format within `bbox_3d`, different field name, or only providing a pre-computed `obb`),** you will need to modify the `visualize_single_image` function in `scripts/visualize_input.py` (around line 195) to correctly parse your data and create the `o3d_box` for drawing. 130 | -------------------------------------------------------------------------------- /robospatial/data_loader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /robospatial/data_loader/base_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. 
Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | from abc import ABC, abstractmethod 12 | 13 | class BaseLoader(ABC): 14 | """ 15 | Abstract base class for dataset loaders. 16 | 17 | This class defines the interface that dataset loaders must implement 18 | to be compatible with the annotation generation pipeline. 19 | """ 20 | 21 | @abstractmethod 22 | def __init__(self, config): 23 | """ 24 | Initialize the dataset loader. 25 | 26 | Args: 27 | config (dict): Configuration dictionary containing dataset parameters. 28 | """ 29 | pass 30 | 31 | @abstractmethod 32 | def list_scenes(self, dataset_list): 33 | """ 34 | List all scenes available in the specified datasets. 35 | 36 | Args: 37 | dataset_list (list): List of dataset names to query. 38 | 39 | Returns: 40 | generator: Yields tuples of (dataset_name, scene_idx, scene_name). 41 | """ 42 | pass 43 | 44 | @abstractmethod 45 | def list_images(self, dataset_name, scene_name): 46 | """ 47 | List all images available in the specified scene. 48 | 49 | Args: 50 | dataset_name (str): Name of the dataset. 51 | scene_name (str): Name of the scene (e.g., 'scannet/scene00191_00'). 52 | 53 | Returns: 54 | dict: Dictionary of image annotations keyed by image name (e.g., '/'), 55 | each containing at minimum: 56 | - extrinsic: Camera to global transformation matrix. 57 | - intrinsic: Camera to image transformation matrix. 58 | - img_path: Path to the image file. 59 | - (If needed) axis_align_matrix: Matrix to align to world coordinates. 60 | """ 61 | pass 62 | 63 | @abstractmethod 64 | def list_objects(self, dataset_name, scene_name, image_ann): 65 | """ 66 | List all object instances visible in an image in the specified scene. 67 | 68 | Processes visible objects in an image and organizes them. 69 | 70 | Args: 71 | dataset_name (str): Name of the dataset. 72 | scene_name (str): Name of the scene. 73 | image_ann (dict): Image annotation dictionary from list_images. 74 | 75 | Returns: 76 | tuple: A 5-element tuple containing: 77 | - vis_objs (dict): Dictionary of visible, non-environmental objects. 78 | Keys are categories (indexed if duplicates exist, e.g., 'chair_0'). 79 | Values are instance dictionaries. 80 | - unique_vis_categories (set): Set of categories for objects appearing only once 81 | (excluding environmental/generic object categories). 82 | - multi_vis_categories (set): Set of categories for objects appearing multiple times 83 | (excluding environmental/generic object categories). 84 | - floor_bound (list): Min and max floor boundaries derived from object OBBs, 85 | as [min_bound, max_bound]. 86 | - all_objs (dict): Dictionary of all non-environmental objects (floor, wall, ceiling excluded), 87 | keyed by their potentially indexed name (e.g., 'chair_0'). 88 | Used for occupancy map calculation or other downstream tasks. 89 | """ 90 | pass -------------------------------------------------------------------------------- /robospatial/data_loader/embodiedscan_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. 
Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | # 10 | # This code is partially adapted from 11 | # https://github.com/OpenRobotLab/EmbodiedScan/blob/main/embodiedscan/explorer.py 12 | # under the Apache 2.0 license. 13 | 14 | 15 | import os 16 | import pickle 17 | from collections import defaultdict 18 | 19 | import numpy as np 20 | from data_loader.base_loader import BaseLoader 21 | 22 | from spatial_analysis.relationship_utils import _9dof_to_box 23 | 24 | 25 | class EmbodiedScanLoader(BaseLoader): 26 | """ 27 | Loader for EmbodiedScan datasets (3RScan, ScanNet, Matterport3D). 28 | 29 | Inherits from BaseLoader and implements its interface methods. 30 | """ 31 | 32 | def __init__(self, config): 33 | """ 34 | Initialize the EmbodiedScan loader. 35 | 36 | Args: 37 | config (dict): Data loader configuration dictionary. 38 | """ 39 | self.verbose = config["verbose"] 40 | 41 | if self.verbose: 42 | print('Loading EmbodiedScan...') 43 | 44 | # Get annotation key name from config 45 | annotation_key = config.get("annotation_key", "embodiedscan_ann") 46 | 47 | # Get splits from config 48 | splits = config.get("split") 49 | 50 | # Load annotation files based on splits 51 | ann_files = [] 52 | if annotation_key in config: 53 | for split in splits: 54 | if split in config[annotation_key]: 55 | ann_files.append(config[annotation_key][split]) 56 | 57 | self.ann_files = ann_files 58 | 59 | 60 | self.metainfo = None 61 | ## Load embodiedscan annotated scan datasets (scannet, matterport3d, 3rscan, arkitscenes) 62 | data_list = [] 63 | for file in self.ann_files: 64 | with open(file, 'rb') as f: 65 | data = pickle.load(f) 66 | 67 | if self.metainfo is None: 68 | self.metainfo = data['metainfo'] 69 | else: 70 | assert self.metainfo == data['metainfo'] 71 | 72 | data_list += data['data_list'] 73 | 74 | 75 | if isinstance(self.metainfo['categories'], list): 76 | self.classes = self.metainfo['categories'] 77 | self.id_to_index = {i: i for i in range(len(self.classes))} 78 | elif isinstance(self.metainfo['categories'], dict): 79 | self.classes = list(self.metainfo['categories'].keys()) 80 | self.id_to_index = { 81 | i: self.classes.index(classes) 82 | for classes, i in self.metainfo['categories'].items() 83 | } 84 | 85 | # Check if certain scan exists 86 | self.data = defaultdict(dict) 87 | for data in data_list: 88 | 89 | splits = data['sample_idx'].split('/') # sample_idx is scene name 90 | dataset_name = splits[0] 91 | 92 | data['dataset'] = dataset_name 93 | if dataset_name == 'scannet': 94 | region = splits[1] 95 | dirpath = os.path.join(config['image_root'], dataset_name, 'posed_images', 96 | region) 97 | elif dataset_name == '3rscan': 98 | region = splits[1] 99 | dirpath = os.path.join(config['image_root'], dataset_name, region) 100 | elif dataset_name == 'matterport3d': 101 | building, region = splits[1], splits[2] 102 | dirpath = os.path.join(config['image_root'], dataset_name, 103 | building) 104 | else: 105 | region = splits[1] 106 | dirpath = os.path.join(self.data_root[dataset_name], region) 107 | if os.path.exists(dirpath): 108 | # scene_name is the scene name in the dataset with dataset name prepended if it is not already present 109 | scene_name = data['sample_idx'] 110 | if not data['sample_idx'].startswith(dataset_name): 111 | scene_name = f"{dataset_name}/{data['sample_idx']}" 112 | self.data[dataset_name][scene_name] = 
data 113 | # self.dataset_stats = {} 114 | # for dataset, data in self.data.items(): 115 | # self.dataset_stats[dataset] = len(data) 116 | 117 | if self.verbose: 118 | for dataset_name, data in self.data.items(): 119 | print(f"Loaded {len(data)} scenes from {dataset_name}") 120 | print('Loading complete') 121 | 122 | def list_scenes(self, dataset_list): 123 | """ 124 | Implementation of BaseLoader.list_scenes for EmbodiedScan datasets. 125 | 126 | Args: 127 | dataset_list (list): List of dataset names to query. 128 | 129 | Returns: 130 | generator: Yields tuples of (dataset_name, scene_idx, scene_name). 131 | #NOTE scene_name is / 132 | """ 133 | for dataset_name in dataset_list: 134 | if dataset_name in self.data: 135 | for scene_idx, scene_name in enumerate(self.data[dataset_name]): 136 | yield dataset_name, scene_idx, scene_name 137 | 138 | def list_images(self, dataset_name, scene_name): 139 | """ 140 | Implementation of BaseLoader.list_images for EmbodiedScan datasets. 141 | 142 | Args: 143 | dataset_name (str): Name of the dataset. 144 | scene_name (str): Name of the scene. Example: scannet/scene00191_00 145 | 146 | Returns: 147 | list: List of image annotations. 148 | """ 149 | if scene_name not in self.data[dataset_name]: 150 | if self.verbose: 151 | print(f"Warning: Scene {scene_name} not found in annotations") 152 | return [] 153 | 154 | # Extract scene-wide annotations 155 | axis_align_matrix = np.array(self.data[dataset_name][scene_name]['axis_align_matrix']) # scene wide 156 | if "cam2img" in self.data[dataset_name][scene_name]: 157 | cam2img = np.array(self.data[dataset_name][scene_name]['cam2img']) # scene wide 158 | else: 159 | cam2img = np.array(self.data[dataset_name][scene_name]['images'][0]['cam2img']) # Some scenes have cam2img in images 160 | if "depth_cam2img" in self.data[dataset_name][scene_name]: 161 | depth_cam2img = np.array(self.data[dataset_name][scene_name]['depth_cam2img']) # scene wide 162 | else: 163 | depth_cam2img = [] 164 | 165 | # Add scene-wide annotations to each image annotation 166 | image_annotations = {} 167 | for image_ann in self.data[dataset_name][scene_name]['images']: 168 | # Add dataset and scene information to image annotation 169 | image_ann['dataset'] = dataset_name 170 | image_ann['scene'] = scene_name 171 | image_ann['image_basename'] = os.path.basename(image_ann["img_path"]) #NOTE Actual image filename 172 | image_ann['extrinsic'] = axis_align_matrix @ image_ann['cam2global'] # Camera to world 173 | image_ann['intrinsic'] = cam2img # Camera to image 174 | image_ann['cam2img'] = cam2img 175 | image_ann['axis_align_matrix'] = axis_align_matrix 176 | image_ann['cam2global'] = image_ann['cam2global'] 177 | image_ann['depth_cam2img'] = depth_cam2img 178 | image_ann['depth_path'] = image_ann['depth_path'] 179 | image_ann['visible_instance_ids'] = image_ann['visible_instance_ids'] 180 | image_name = scene_name + "/" + image_ann['image_basename'] 181 | image_annotations[image_name] = image_ann #NOTE Image name is / 182 | 183 | return image_annotations 184 | 185 | def list_objects(self, dataset_name, scene_name, image_ann): 186 | """ 187 | Implementation of BaseLoader.list_objects for EmbodiedScan datasets. 188 | 189 | Processes visible objects in an image and organizes them into multiple categories: 190 | - unique_vis_categories: Objects that appear exactly once (dictionary keyed by category) 191 | - multi_vis_categories: Objects that appear multiple times (dictionary keyed by category_0, category_1, etc.) 
192 | - vis_objs: All visible objects 193 | - all_objs: All non-environmental objects 194 | 195 | Also calculates the floor boundaries for the scene. 196 | 197 | Args: 198 | dataset_name (str): Name of the dataset. 199 | scene_name (str): Name of the scene. 200 | image_ann (dict): Image annotation dictionary. 201 | 202 | Returns: 203 | tuple: A 5-element tuple containing: 204 | - vis_objs (dict): Dictionary of visible, non-environmental objects. Keys are categories (indexed if duplicates exist, e.g., 'chair_0', 'chair_1'). Values are instance dictionaries. 205 | - unique_vis_categories (set): Set of categories for objects appearing only once (excluding environmental/object categories). 206 | - multi_vis_categories (set): Set of categories for objects appearing multiple times (excluding environmental/object categories). 207 | - floor_bound (list): Min and max floor boundaries as [min_bound, max_bound]. 208 | - all_objs (dict): Dictionary of all non-environmental objects (floor, wall, ceiling excluded), keyed by their potentially indexed name (e.g., 'chair_0'). Used for occupancy map calculation. 209 | """ 210 | # Get visible instance ids from image annotation 211 | #NOTE you can use different ways to get this. 212 | visible_instance_ids = image_ann['visible_instance_ids'] 213 | 214 | if scene_name not in self.data[dataset_name]: 215 | if self.verbose: 216 | print(f"Warning: Scene {scene_name} not found in annotations") 217 | return {}, set(), set(), [], {} # Return empty structures matching the new return type 218 | 219 | # First pass to count occurrences of each non-environmental category 220 | category_total_counts = defaultdict(int) 221 | for i in visible_instance_ids: 222 | instance = self.data[dataset_name][scene_name]['instances'][i] 223 | category = self.classes[self.id_to_index[instance['bbox_label_3d']]] 224 | # Exclude environmental or generic object categories from indexed naming 225 | if category not in ["wall", "ceiling", "floor", "object"]: 226 | category_total_counts[category] += 1 227 | 228 | # Process instances to create the unified vis_objs dictionary 229 | vis_objs = {} 230 | unique_vis_categories = set() 231 | multi_vis_categories = set() 232 | category_indices = defaultdict(int) # To track current index for duplicate categories 233 | env_objs = {} 234 | all_objs = {} # Still needed for floor bounding box calculation 235 | 236 | for i in visible_instance_ids: 237 | instance = self.data[dataset_name][scene_name]['instances'][i] 238 | category = self.classes[self.id_to_index[instance['bbox_label_3d']]] 239 | instance["category"] = category # Keep original label name in instance dict 240 | instance["obb"] = _9dof_to_box(instance["bbox_3d"]) # We use Open3D obb for all spatial relationships 241 | 242 | # Handle environmental objects (for floor calculation) 243 | if category in ["floor", "wall", "ceiling"]: 244 | env_objs[category] = instance 245 | 246 | # Parse categories to handle duplicates, assume there is only one floor, wall, and ceiling 247 | total_count = category_total_counts[category] 248 | if total_count == 1: 249 | obj_key = category 250 | instance["name"] = obj_key 251 | unique_vis_categories.add(category) 252 | else: 253 | current_index = category_indices[category] 254 | obj_key = f"{category}_{current_index}" 255 | instance["name"] = obj_key 256 | multi_vis_categories.add(category) 257 | category_indices[category] += 1 258 | 259 | # Add to vis_objs if it is not an environmental object 260 | if category not in ["wall", "ceiling", "floor", "object"]: 261 | 
vis_objs[obj_key] = instance 262 | 263 | # Get all objects for occupancy map calculation 264 | if category not in ["floor", "wall", "ceiling"]: 265 | all_objs[obj_key] = instance 266 | 267 | # Track all non-floor/wall/ceiling objects for OBB calculation - This part is now handled above 268 | # if category not in ["floor", "wall", "ceiling"]: 269 | # all_objs[i] = instance # Use original instance id as key 270 | 271 | all_obbs = [obj["obb"] for obj in all_objs.values()] 272 | 273 | # Create floor box representation automatically 274 | # Ensure floor object exists before accessing it 275 | if "floor" in env_objs: 276 | floor_obj = env_objs["floor"] 277 | floor_obb = _9dof_to_box(floor_obj["bbox_3d"]) 278 | min_bound = np.min([box.get_min_bound() for box in all_obbs + [floor_obb]], axis=0) 279 | max_bound = np.max([box.get_max_bound() for box in all_obbs + [floor_obb]], axis=0) 280 | floor_bound = [min_bound, max_bound] 281 | else: 282 | # Handle cases where there might not be a floor object detected/annotated 283 | if len(all_obbs) > 0: 284 | min_bound = np.min([box.get_min_bound() for box in all_obbs], axis=0) 285 | max_bound = np.max([box.get_max_bound() for box in all_obbs], axis=0) 286 | floor_bound = [min_bound, max_bound] # Use bounds from other objects if floor is missing 287 | else: 288 | floor_bound = None 289 | 290 | return vis_objs, unique_vis_categories, multi_vis_categories, floor_bound, all_objs 291 | -------------------------------------------------------------------------------- /robospatial/data_loader/example_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | import os 12 | import json 13 | from collections import defaultdict 14 | import glob 15 | 16 | import numpy as np 17 | from data_loader.base_loader import BaseLoader 18 | 19 | from spatial_analysis.relationship_utils import _9dof_to_box 20 | 21 | # Top-level function for defaultdict factory (picklable) 22 | def nested_dict_factory(): 23 | return defaultdict(dict) 24 | 25 | class ExampleLoader(BaseLoader): 26 | """ 27 | Loader for example data from JSON annotation files. 28 | 29 | Inherits from BaseLoader and implements its interface methods. 30 | """ 31 | 32 | def __init__(self, config): 33 | """ 34 | Initialize the Example loader. 35 | 36 | Args: 37 | config (dict): Data loader configuration dictionary. 38 | Expected keys: 39 | - annotation_dir (str): Path to the directory containing JSON annotation files. 40 | - verbose (bool): Optional verbosity flag. 
41 | """ 42 | self.verbose = config.get("verbose", False) 43 | self.config = config 44 | 45 | if self.verbose: 46 | print('Loading Example Dataset JSON annotations...') 47 | annotation_dir = config.get("annotation_dir") 48 | if not annotation_dir or not os.path.isdir(annotation_dir): 49 | raise ValueError("Config must contain a valid 'annotation_dir' pointing to the JSON annotations.") 50 | 51 | self.data = defaultdict(nested_dict_factory) # Use the named function 52 | # Recursively find all json files in the annotation_dir 53 | json_files = glob.glob(os.path.join(annotation_dir, '**', '*.json'), recursive=True) 54 | 55 | 56 | if not json_files: 57 | print(f"Warning: No JSON files found in {annotation_dir}") 58 | return 59 | 60 | for file_path in json_files: 61 | with open(file_path, 'r') as f: 62 | image_data = json.load(f) 63 | 64 | # Validate basic structure (can be expanded) 65 | required_keys = ['dataset', 'scene_name', 'image_name', 'objects', 'camera_annotations'] 66 | if not all(k in image_data for k in required_keys): 67 | if self.verbose: 68 | print(f"Warning: Skipping file {file_path} due to missing one or more required keys: {required_keys}.") 69 | continue 70 | if not all(k in image_data['camera_annotations'] for k in ['extrinsic', 'intrinsic']): 71 | if self.verbose: 72 | print(f"Warning: Skipping file {file_path} due to missing extrinsic/intrinsic in camera_annotations.") 73 | continue 74 | 75 | dataset_name = image_data['dataset'] 76 | # Use the scene name provided in the JSON 77 | scene_name = image_data['scene_name'] 78 | image_name = image_data['image_name'] 79 | 80 | # Store the loaded data, grouping by dataset and scene 81 | # self.data[dataset_name][scene_name] will hold a dict of {image_identifier: image_data} 82 | self.data[dataset_name][scene_name][image_name] = image_data 83 | # except Exception as e: 84 | # if self.verbose: 85 | # print(f"Warning: Skipping file {file_path} due to error: {e}") 86 | 87 | if self.verbose: 88 | total_scenes = 0 89 | total_images = 0 90 | for dataset_name, scenes in self.data.items(): 91 | print(f"Loaded {len(scenes)} scenes from {dataset_name}") 92 | total_scenes += len(scenes) 93 | for scene_name, images in scenes.items(): 94 | total_images += len(images) 95 | print(f'Loading complete. Total scenes: {total_scenes}, Total images: {total_images}') 96 | 97 | 98 | def list_scenes(self, dataset_list): 99 | """ 100 | Implementation of BaseLoader.list_scenes for Example Dataset dataset loaded from JSON. 101 | 102 | Args: 103 | dataset_list (list): List of dataset names to query. 104 | 105 | Returns: 106 | generator: Yields tuples of (dataset_name, scene_idx, scene_name). 107 | """ 108 | 109 | for dataset_name in dataset_list: 110 | if dataset_name in self.data: 111 | # Ensure consistent scene indexing if needed, otherwise enumerate keys 112 | # Using enumerate(self.data[dataset_name]) provides a simple index 113 | # The scene_name now directly comes from the JSON data structure keys 114 | for scene_idx, scene_name in enumerate(self.data[dataset_name]): 115 | yield dataset_name, scene_idx, scene_name 116 | 117 | def list_images(self, dataset_name, scene_name): 118 | """ 119 | Implementation of BaseLoader.list_images for Example Dataset loaded from JSON. 120 | 121 | Args: 122 | dataset_name (str): Name of the dataset. 123 | scene_name (str): Name of the scene. Example: gr00t/deduped_data_normal 124 | 125 | Returns: 126 | dict: Dictionary of image annotations, keyed by image_identifier. 
127 | """ 128 | if dataset_name not in self.data or scene_name not in self.data[dataset_name]: 129 | if self.verbose: 130 | print(f"Warning: Scene {scene_name} not found in dataset {dataset_name}") 131 | return {} 132 | 133 | scene_images_data = self.data[dataset_name][scene_name] 134 | image_annotations = {} 135 | 136 | for image_identifier, image_data in scene_images_data.items(): 137 | image_ann = {} 138 | # Basic info from the loaded JSON data 139 | image_ann['dataset'] = dataset_name 140 | image_ann['scene'] = scene_name 141 | image_ann['image_identifier'] = image_identifier 142 | image_ann['img_path'] = dataset_name + "/" + scene_name + "/" + image_identifier #NOTE image_path is combined with image_root from config to create the absolute image path 143 | image_ann['image_basename'] = os.path.basename(image_data['image_name']) 144 | image_ann['image_size'] = image_data.get('image_size') # Optional 145 | 146 | # Camera parameters from the loaded JSON data 147 | cam_ann = image_data.get('camera_annotations', {}) # Presence checked in __init__ 148 | image_ann['extrinsic'] = np.array(cam_ann.get('extrinsic')) 149 | image_ann['intrinsic'] = np.array(cam_ann.get('intrinsic')) 150 | image_ann['objects'] = image_data.get('objects') 151 | 152 | # visible_instance_ids is no longer the primary way to get objects for list_objects 153 | # image_ann['visible_instance_ids'] = image_data.get('visible_instance_ids', []) 154 | 155 | # Use image_identifier as the key 156 | image_annotations[image_identifier] = image_ann 157 | 158 | 159 | 160 | return image_annotations 161 | 162 | def list_objects(self, dataset_name, scene_name, image_ann): 163 | """ 164 | Implementation of BaseLoader.list_objects for Example Dataset from JSON. 165 | 166 | Processes objects listed in the 'object_grounding' field of the image annotation. 167 | 168 | Args: 169 | dataset_name (str): Name of the dataset (provides context). 170 | scene_name (str): Name of the scene (provides context). 171 | image_ann (dict): Image annotation dictionary (from list_images). 172 | 173 | Returns: 174 | tuple: A 5-element tuple containing: 175 | - vis_objs (dict): Dictionary of visible, non-environmental objects. Keys are categories (indexed if duplicates exist, e.g., 'chair_0', 'chair_1'). Values are instance dictionaries with 'obb' and 'name'. 176 | - unique_vis_categories (set): Set of original category names for objects appearing only once (excluding environmental/object categories). 177 | - multi_vis_categories (set): Set of original category names for objects appearing multiple times (excluding environmental/object categories). 178 | - floor_bound (list): Min and max floor boundaries [min_bound, max_bound] calculated from *this image's* non-environmental objects and floor (if present). Can be None. 179 | - all_objs (dict): Dictionary of all non-environmental objects in this image, keyed by their potentially indexed name (same as vis_objs in this implementation). 
180 | """ 181 | image_identifier = image_ann.get('image_identifier', 'unknown_image') # For logging 182 | objects = image_ann.get('objects', []) 183 | 184 | if not objects: 185 | # Return empty structures matching the expected return type 186 | return {}, set(), set(), None, {} 187 | 188 | # First pass to count occurrences of each non-environmental category in this image 189 | category_total_counts = defaultdict(int) 190 | parsed_objects = [] # Store parsed objects temporarily 191 | for obj_data in objects: 192 | category = obj_data.get("Name") #NOTE Name is the category name for gr00t 193 | bbox_3d_list = obj_data.get("bbox_3d") 194 | 195 | # Basic validation 196 | if not category: 197 | if self.verbose: print(f"Warning: Skipping object with missing Name in image {image_identifier}") 198 | continue 199 | if not bbox_3d_list or not isinstance(bbox_3d_list, list) or not bbox_3d_list: 200 | if self.verbose: print(f"Warning: Skipping object '{category}' with missing or invalid bbox_3d in image {image_identifier}") 201 | continue 202 | 203 | 204 | # Assuming bbox_3d is 9 DOF 205 | bbox_3d_params = bbox_3d_list[0] 206 | # Validate that we indeed have 9 parameters 207 | if not isinstance(bbox_3d_params, list) or len(bbox_3d_params) != 9: 208 | if self.verbose: print(f"Warning: Skipping object '{category}' due to invalid bbox_3d params (expected 9DOF, got {len(bbox_3d_params)}) in image {image_identifier}. Params: {bbox_3d_params}") 209 | continue 210 | 211 | # Removed padding logic as bbox_3d is guaranteed 9DOF. 212 | 213 | instance = { 214 | "name": category, #NOTE name == category for gr00t 215 | "category": category, # Original name from JSON 216 | "bbox_3d": bbox_3d_params, # Store the 9DoF params 217 | } 218 | 219 | try: 220 | # Calculate OBB immediately 221 | instance["obb"] = _9dof_to_box(bbox_3d_params) 222 | parsed_objects.append(instance) # Add to list for further processing 223 | # Count non-environmental categories (case-insensitive check) 224 | if category.lower() not in ["wall", "ceiling", "floor", "object"]: 225 | category_total_counts[category] += 1 # Count using original name 226 | except ValueError as e: 227 | # Catch potential errors from _9dof_to_box if params are still invalid 228 | if self.verbose: 229 | print(f"Error converting bbox for object '{category}' in image {image_identifier}: {e}. 
Params: {bbox_3d_params}") 230 | except Exception as e: # Catch other potential exceptions 231 | if self.verbose: 232 | print(f"Unexpected error processing object '{category}' in image {image_identifier}: {e}") 233 | 234 | 235 | # Process parsed instances to create the final dictionaries 236 | vis_objs = {} 237 | unique_vis_categories = set() 238 | multi_vis_categories = set() 239 | category_indices = defaultdict(int) # To track current index for duplicate categories 240 | env_objs = {} 241 | all_objs_for_bounds = [] # Collect OBBs for floor calculation 242 | 243 | for instance in parsed_objects: 244 | category = instance["category"] # Original name 245 | 246 | # Handle environmental objects (for floor calculation) 247 | # Use lower case for comparison to identify type 248 | cat_lower = category.lower() 249 | if cat_lower in ["floor", "wall", "ceiling"]: 250 | # Assuming only one of each environmental object per image annotation 251 | # Store with lowercase key for easy lookup 252 | env_objs[cat_lower] = instance 253 | if cat_lower == "floor": 254 | # Add floor OBB for bound calculation if it exists 255 | if "obb" in instance: 256 | all_objs_for_bounds.append(instance["obb"]) 257 | continue # Skip adding env objects to vis_objs/multi/unique sets 258 | 259 | # Process non-environmental objects (already counted) 260 | if category in category_total_counts: 261 | total_count = category_total_counts[category] 262 | if total_count == 1: 263 | obj_key = category # Use original name as key 264 | instance["name"] = obj_key # Store potentially indexed name 265 | unique_vis_categories.add(category) # Store original category name 266 | else: 267 | # Use original category name for indexing 268 | current_index = category_indices[category] 269 | obj_key = f"{category}_{current_index}" 270 | instance["name"] = obj_key # Store potentially indexed name 271 | multi_vis_categories.add(category) # Store original category name 272 | category_indices[category] += 1 273 | 274 | vis_objs[obj_key] = instance 275 | # Add OBB for floor calculation if it exists 276 | if "obb" in instance: 277 | all_objs_for_bounds.append(instance["obb"]) 278 | 279 | 280 | # Calculate floor bounds based on OBBs from this image (non-env + floor if present) 281 | floor_bound = None 282 | if all_objs_for_bounds: 283 | try: 284 | # Combine points from all relevant OBBs 285 | all_points = np.vstack([box.get_box_points() for box in all_objs_for_bounds]) 286 | min_bound = np.min(all_points, axis=0) 287 | max_bound = np.max(all_points, axis=0) 288 | 289 | # The bounds derived this way represent the extent of the objects considered. 290 | floor_bound = [min_bound.tolist(), max_bound.tolist()] 291 | except Exception as e: 292 | if self.verbose: 293 | print(f"Error calculating floor bounds for image {image_identifier}: {e}") 294 | floor_bound = None # Indicate failure 295 | 296 | 297 | # `all_objs` in the original return signature was intended for occupancy map calculation, 298 | # usually containing all non-environmental objects in the scene. 299 | # In this JSON-based, image-level loading, it effectively becomes the same as `vis_objs` 300 | # as we only process objects visible/annotated in the current image JSON. 
301 | all_objs = vis_objs.copy() 302 | 303 | return vis_objs, unique_vis_categories, multi_vis_categories, floor_bound, all_objs 304 | -------------------------------------------------------------------------------- /robospatial/run_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | """Main entry script for generating spatial annotations from 3D scan datasets. 11 | 12 | This script orchestrates the annotation generation process based on a YAML 13 | configuration file. It handles: 14 | - Parsing command-line arguments for configuration path, scene filtering (range, specific scene, specific image), and dry runs. 15 | - Loading the dataset configuration and initializing the appropriate data loader. 16 | - Iterating through specified datasets and scenes. 17 | - Calling the `generate_and_save_annotations` function from `annotation_generator.py` 18 | to perform the core annotation generation for each scene. 19 | - Tracking generation progress across scenes and saving it periodically. 20 | - Aggregating and saving final statistics (overall and per-dataset). 21 | 22 | Supported annotation types (configured via YAML) include: 23 | - Object Grounding 24 | - Spatial Context 25 | - Spatial Configuration 26 | - Spatial Compatibility 27 | 28 | Usage: 29 | python robospatial/run_generation.py --config path/to/your/config.yaml [options] 30 | """ 31 | 32 | # run_generation.py 33 | # Main entry script to generate annotations from 3D scan datasets 34 | # Supports flexible configuration of annotation types (object grounding, spatial context, spatial configuration, spatial compatibility) 35 | 36 | import argparse 37 | import yaml 38 | import os 39 | import json 40 | import importlib 41 | import concurrent.futures 42 | from tqdm import tqdm 43 | from collections import defaultdict 44 | 45 | # Import the new generator function 46 | from annotation_generator import generate_and_save_annotations 47 | 48 | def parse_args(): 49 | parser = argparse.ArgumentParser(description="Parse configuration file for annotation generation.") 50 | parser.add_argument('--config', type=str, default="configs/base_local.yaml", help='Path to the configuration YAML file.') 51 | parser.add_argument('--range', type=int, nargs=2, help='Range of scene indices to process (inclusive start, exclusive end).') 52 | parser.add_argument('--scene', type=str, help='Specific scene name to process (e.g., "scannet/scene0190_00").') # New argument for specific scene 53 | parser.add_argument('--image', type=str, help='Specific image basename (e.g., "rgb_00010") to process within the specified scene for debugging.') # New argument for specific image 54 | parser.add_argument('--num_workers', type=int, help='Number of worker threads to use for processing scenes.') 55 | parser.add_argument('--dry_run', action='store_true', help='Enable dry run mode (processes only the first 5 images per scene).') 56 | args = parser.parse_args() 57 | 58 | if args.range: 59 | start, end = args.range 60 | # Make range inclusive by adding 1 to end for 
Python range behavior 61 | args.range = range(start, end + 1) # Store as a range object 62 | if start > end: 63 | parser.error("Start of range must not be greater than end.") 64 | 65 | return args 66 | 67 | def load_config(config_file): 68 | with open(config_file, 'r') as file: 69 | config = yaml.safe_load(file) 70 | # Add default output suffix if not present 71 | if "output_suffix" not in config.get("data_generation", {}): 72 | if "data_generation" not in config: 73 | config["data_generation"] = {} 74 | config["data_generation"]["output_suffix"] = ".annotations.json" 75 | return config 76 | 77 | def create_loader(config): 78 | """Create a loader instance based on configuration.""" 79 | # Default to EmbodiedScanLoader if not specified 80 | loader_class_path = config.get("loader_class") 81 | if loader_class_path is None: 82 | raise ValueError("loader_class not specified in config[data_loading]") 83 | 84 | module_name, class_name = loader_class_path.rsplit('.', 1) 85 | module = importlib.import_module(module_name) 86 | loader_class = getattr(module, class_name) 87 | 88 | # Create the loader instance with config data 89 | loader = loader_class(config) 90 | 91 | return loader 92 | 93 | # Define the scene processing function outside run or nested inside run 94 | def process_scene(args_tuple): 95 | loader, dataset_name, scene_idx, scene_name, config, specific_image, dry_run, num_workers = args_tuple # Unpack num_workers 96 | 97 | # Only print if not multi-threaded 98 | if num_workers <= 1: 99 | tqdm.write(f"\nProcessing {dataset_name} scene: {scene_name} ({scene_idx+1})") # Note: total count isn't readily available here 100 | 101 | try: # Add try/except block for robustness in threads 102 | images_ann_dict_full = loader.list_images(dataset_name, scene_name) 103 | 104 | # Filter images if a specific image name is provided 105 | if specific_image: 106 | if specific_image in images_ann_dict_full: 107 | images_ann_dict = {specific_image: images_ann_dict_full[specific_image]} 108 | if num_workers <= 1: 109 | tqdm.write(f" - Specific image requested: Processing only '{specific_image}'.") 110 | else: 111 | if num_workers <= 1: 112 | tqdm.write(f" - Warning: Specific image '{specific_image}' not found in scene '{scene_name}'. Skipping scene.") 113 | return dataset_name, scene_name, None # Return None for stats if skipped 114 | 115 | # Limit images if dry_run is enabled AND no specific image was requested 116 | elif dry_run and len(images_ann_dict_full) > 5: 117 | images_ann_dict = dict(list(images_ann_dict_full.items())[:5]) 118 | if num_workers <= 1: 119 | tqdm.write(f" - Dry run enabled: Processing only the first 5 images out of {len(images_ann_dict_full)}.") 120 | else: 121 | images_ann_dict = images_ann_dict_full 122 | 123 | # Print total only if not dry run or specific image, and not multi-threaded 124 | if num_workers <= 1: 125 | if not dry_run and not specific_image: 126 | tqdm.write(f" - Listed {len(images_ann_dict_full)} total images") 127 | elif dry_run and not specific_image: # Also print total if dry run 128 | tqdm.write(f" - Listed {len(images_ann_dict_full)} total images") 129 | 130 | if not images_ann_dict: 131 | if num_workers <= 1: 132 | tqdm.write(f"Warning: No images found for scene {scene_name}. 
Skipping.") 133 | return dataset_name, scene_name, None # Return None for stats if skipped 134 | 135 | scene_stats = generate_and_save_annotations( 136 | loader, 137 | dataset_name, 138 | scene_name, 139 | images_ann_dict, 140 | config, 141 | num_workers 142 | ) 143 | if num_workers <= 1: 144 | tqdm.write(f"Finished scene {scene_name}. Stats: {dict(scene_stats)}") 145 | return dataset_name, scene_name, scene_stats 146 | except Exception as e: 147 | # Always write errors, regardless of num_workers 148 | tqdm.write(f"Error processing scene {scene_name}: {e}") 149 | # Optionally re-raise or log the full traceback 150 | import traceback 151 | # Use tqdm.write for traceback as well 152 | tqdm.write(f"Traceback for error in scene {scene_name}:\n{traceback.format_exc()}") 153 | return dataset_name, scene_name, None # Indicate failure 154 | 155 | 156 | def run(config, specific_scene=None, dry_run=False, specific_image=None, num_workers_arg=None): # Added num_workers_arg 157 | # Normal execution path 158 | print("Starting annotation generation with configuration:") 159 | print(yaml.dump(config, indent=2)) 160 | 161 | # --- Determine Number of Workers --- 162 | num_workers = num_workers_arg # CLI argument takes precedence 163 | if num_workers is None: 164 | num_workers = config.get("data_generation", {}).get("num_workers") 165 | if num_workers is None: 166 | num_workers = 1 # Default to 1 167 | print(f"Number of workers not specified, defaulting to {num_workers}") 168 | print(f"Using {num_workers} worker threads.") 169 | 170 | 171 | # --- Dataset Loading --- 172 | dataset_list = config["data_loading"]["datasets"] 173 | 174 | if not dataset_list: 175 | print("Error: No valid datasets specified. Please include valid datasets in the config.") 176 | return 177 | 178 | # Create the loader instance 179 | loader = create_loader(config["data_loading"]) 180 | print(f"Loader initialized.") 181 | 182 | 183 | # --- Statistics Initialization --- 184 | total_stats = defaultdict(lambda: defaultdict(int)) 185 | overall_stats = defaultdict(int) 186 | generated_something = False 187 | progress_file_path = config["data_generation"].get("progress_file", "generation_progress.json") 188 | completed_scenes_map = defaultdict(list) 189 | 190 | # Load progress if file exists 191 | if os.path.exists(progress_file_path): 192 | with open(progress_file_path, 'r') as f: 193 | loaded_progress = json.load(f) 194 | if isinstance(loaded_progress, dict): 195 | completed_scenes_map.update(loaded_progress) 196 | else: 197 | print(f"Warning: Progress file {progress_file_path} has unexpected format. 
Starting fresh.") 198 | 199 | # --- Prepare Scene List --- 200 | print("\n--- Preparing Scene List ---") 201 | scene_list_all = list(loader.list_scenes(dataset_list)) 202 | print(f"Found {len(scene_list_all)} total scenes across specified datasets.") 203 | 204 | scenes_to_process_info = [] 205 | skipped_count = 0 206 | for idx, (dataset_name, scene_idx, scene_name) in enumerate(scene_list_all): 207 | # Apply filters 208 | if specific_scene and scene_name != specific_scene: 209 | skipped_count += 1 210 | continue 211 | if config.get("range") and idx not in config["range"]: 212 | skipped_count += 1 213 | continue 214 | if scene_name in completed_scenes_map.get(dataset_name, []): 215 | skipped_count += 1 216 | continue 217 | 218 | # If not skipped, add to list 219 | scenes_to_process_info.append((loader, dataset_name, idx, scene_name, config, specific_image, dry_run, num_workers)) 220 | 221 | if skipped_count > 0: 222 | print(f"Skipped {skipped_count} scenes (due to filters: specific_scene, range, or already completed).") 223 | print(f"Processing {len(scenes_to_process_info)} scenes.") 224 | 225 | if not scenes_to_process_info: 226 | print("No scenes left to process based on filters.") 227 | # Skip the rest if nothing to process 228 | print("\n--- Generation Complete ---") 229 | print("No new annotations were generated.") 230 | return 231 | 232 | 233 | # --- Generation Loop (Parallelized) --- 234 | print("\n--- Processing Scenes ---") 235 | generated_something = False # Reset here, check results later 236 | 237 | with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor: 238 | # Submit tasks and store Future objects 239 | futures = [executor.submit(process_scene, args) for args in scenes_to_process_info] 240 | 241 | # Process results as they complete 242 | for future in tqdm(concurrent.futures.as_completed(futures), total=len(scenes_to_process_info), desc="Processing Scenes"): 243 | try: 244 | result = future.result() # Get the result from the completed future 245 | # --- Aggregation logic using 'result' --- 246 | if result: # Check if result is not None (i.e., processing didn't fail or skip) 247 | dataset_name, scene_name, scene_stats = result 248 | if scene_stats is not None: # Check if stats were successfully generated 249 | # Aggregate stats 250 | for key, value in scene_stats.items(): 251 | if isinstance(value, (int, float)): 252 | total_stats[dataset_name][key] += value 253 | overall_stats[key] += value 254 | generated_something = True 255 | completed_scenes_map[dataset_name].append(scene_name) 256 | # else: scene processing might have skipped internally or failed 257 | # --- End of aggregation logic --- 258 | except Exception as exc: 259 | # Handle exceptions raised within the process_scene function 260 | # Find the arguments that caused the exception for better logging (optional, requires mapping futures back to args) 261 | tqdm.write(f'\nError: A scene generation task generated an exception: {exc}') 262 | # Optionally log the full traceback 263 | # import traceback 264 | # tqdm.write(f"Traceback:\n{traceback.format_exc()}") 265 | 266 | # --- Final Statistics and Cleanup --- 267 | print("\n--- Generation Complete ---") 268 | if generated_something: 269 | print("\n--- Overall Statistics ---") 270 | print(json.dumps(overall_stats, indent=4)) 271 | 272 | print("\n--- Per-Dataset Statistics ---") 273 | print(json.dumps(total_stats, indent=4)) 274 | 275 | with open(progress_file_path, 'w') as f: 276 | json.dump(completed_scenes_map, f, indent=4) 277 | print(f"Final 
progress saved to {progress_file_path}") 278 | 279 | stats_file_path = config["data_generation"].get("stats_file", "generation_stats.json") 280 | final_stats_data = { 281 | "overall_stats": overall_stats, 282 | "per_dataset_stats": total_stats 283 | } 284 | with open(stats_file_path, 'w') as f: 285 | json.dump(final_stats_data, f, indent=4) 286 | print(f"Final statistics saved to {stats_file_path}") 287 | else: 288 | print("No new annotations were generated.") 289 | 290 | return 291 | 292 | if __name__ == "__main__": 293 | args = parse_args() 294 | config = load_config(args.config) 295 | if args.range: 296 | config["range"] = args.range 297 | # Pass num_workers from args 298 | run(config, specific_scene=args.scene, dry_run=args.dry_run, specific_image=args.image, num_workers_arg=args.num_workers) -------------------------------------------------------------------------------- /robospatial/spatial_analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RoboSpatial/59b091d7694a724d3a46bb2b636d1bc49b899eb9/robospatial/spatial_analysis/__init__.py -------------------------------------------------------------------------------- /robospatial/spatial_analysis/compatibility/compatibility.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | import numpy as np 11 | 12 | from spatial_analysis.context.context import get_point_in_space_relative_to_object 13 | from spatial_analysis.topdown_map import get_empty_space 14 | from spatial_analysis.compatibility.compatibility_utils import can_fit_at_point 15 | 16 | DEBUG_FIT=False 17 | 18 | def can_fit_object_a_in_relation_to_b( 19 | floor_bound, 20 | environment_boxes, 21 | obj_a, # Dictionary of the object being placed (contains name, obb, etc.) 22 | obj_b, # Dictionary of the reference object (contains name, obb, etc.) 23 | have_face, 24 | extrinsic, 25 | intrinsic, 26 | image_size, 27 | image_path, 28 | grid_resolution, 29 | num_samples, 30 | individual_occupancy_maps, 31 | env_occupancy_map, 32 | threshold=0.5, 33 | min_distance=0.2, 34 | buffer_ratio=0.3 35 | ): 36 | """Checks if object A (the target object) can be placed in empty space relative to object B (the reference object). 37 | 38 | The function operates in several steps: 39 | 1. Calculates the available empty space in the environment using `get_empty_space`, generating a 2D grid representation. 40 | 2. Determines a dynamic threshold for empty space sampling based on the sizes of object A and B. 41 | 3. Samples a set of potential placement points (`num_samples`) around object B in various directions 42 | (infront, behind, left, right) across different reference frames (object-centric, camera-centric, world-centric) 43 | using `get_point_in_space_relative_to_object`. This sampling considers precomputed occupancy maps. 44 | 4. 
For each sampled 3D point corresponding to a specific frame and direction, it checks if object A's 45 | oriented bounding box (OBB) can be placed at that point without colliding with the environment or object B using `can_fit_at_point`. This check utilizes the 2D occupancy grid. 46 | 5. Aggregates the results, indicating whether *at least one* valid placement position was found for 47 | each frame/direction combination. 48 | 49 | Args: 50 | floor_bound (list): A list defining the bounding box of the walkable floor area, used for empty space calculation. 51 | environment_boxes (list): A list of open3d.geometry.OrientedBoundingBox objects representing static obstacles 52 | in the environment. 53 | obj_a (dict): Dictionary representing object A (the object being placed), must contain 'obb' (open3d OBB) 54 | and 'name' (str). 55 | obj_b (dict): Dictionary representing the reference object B, must contain 'obb' (open3d OBB) and 'name' (str). 56 | have_face (bool): Indicates if object B has a defined 'face' or primary orientation, affecting object-centric sampling. 57 | extrinsic (np.ndarray): 4x4 camera extrinsic matrix (camera-to-world transformation). 58 | intrinsic (np.ndarray): 3x3 or 4x4 camera intrinsic matrix (only the top-left 3x3 portion is used if 4x4). 59 | image_size (tuple): Size of the image (width, height), used for camera-centric calculations. 60 | image_path (str): Path to the associated scene image, primarily used for debugging visualizations within called functions. 61 | grid_resolution (float): The resolution (e.g., meters per grid cell) of the 2D occupancy grid used for collision checks. 62 | num_samples (int): The number of candidate points to sample around object B for potential placement checks. 63 | individual_occupancy_maps (dict): Precomputed 2D occupancy numpy arrays for each individual dynamic object (including A and B). 64 | Keys are object names, values are the occupancy map arrays. 65 | env_occupancy_map (np.ndarray): Precomputed combined 2D occupancy numpy array representing the static environment. 66 | threshold (float, optional): Base distance threshold used in empty space calculation. This is dynamically adjusted based on 67 | object sizes. Defaults to 0.5. 68 | 69 | Returns: 70 | dict: A nested dictionary indicating whether a valid placement was found for object A relative to object B 71 | for each combination of reference frame and direction. 72 | Example: `{'objectcentric': {'infront': True, 'behind': False, ...}, 'cameracentric': {...}, ...}` 73 | `True` means at least one valid point was found for that relative position. 74 | """ 75 | empty_areas, grid, occupied = get_empty_space(floor_bound, environment_boxes, grid_resolution) 76 | 77 | box_a = obj_a['obb'] # Extract OBB from obj_a dictionary 78 | obj_a_name = obj_a['name'] # Extract name from obj_a dictionary 79 | box_b = obj_b['obb'] # Extract OBB from obj_b dictionary 80 | obj_b_name = obj_b['name'] # Extract name from obj_b dictionary 81 | 82 | # Adjust the sampling distance threshold based on the average horizontal size of the two objects. 83 | max_extent_a = np.max(box_a.extent[:2]) # Max extent in world x-y plane 84 | max_extent_b = np.max(box_b.extent[:2]) # Max extent in world x-y plane 85 | dynamic_threshold = threshold + (max_extent_a + max_extent_b) / 2 86 | 87 | # Sample potential placement points around obj_b using the precomputed occupancy information. 
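    # `get_point_in_space_relative_to_object` (defined in context.py) is expected to return,
    # as its third value, a nested dict keyed first by reference frame ('objectcentric',
    # 'cameracentric', 'worldcentric') and then by direction ('infront', 'behind', 'left',
    # 'right'), where each leaf is a list of candidate 3D points; the fitting loop further
    # below iterates over exactly this structure.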
88 | _, _, visible_points_3d_all, _ = get_point_in_space_relative_to_object( 89 | floor_bound, environment_boxes, 90 | ref_obj=obj_b, 91 | extrinsic=extrinsic, intrinsic=intrinsic, image_size=image_size, 92 | have_face=have_face, num_samples=num_samples, 93 | individual_occupancy_maps=individual_occupancy_maps, 94 | env_occupancy_map=env_occupancy_map, 95 | threshold=dynamic_threshold, grid_resolution=grid_resolution, 96 | image_path=image_path, 97 | empty_areas=empty_areas, grid=grid, occupied=occupied, 98 | ) 99 | 100 | results = {} 101 | 102 | # Check placement possibility for each defined reference frame and direction 103 | frames_to_check = ['objectcentric', 'cameracentric', 'worldcentric'] 104 | directions_to_check = ['infront', 'behind', 'left', 'right'] 105 | 106 | for frame in frames_to_check: 107 | results[frame] = {} 108 | for direction in directions_to_check: 109 | # Retrieve the list of 3D candidate points sampled for this specific frame/direction 110 | points_in_direction_3d = visible_points_3d_all.get(frame, {}).get(direction, []) 111 | 112 | if not points_in_direction_3d: 113 | results[frame][direction] = False # No candidate points found for this relative position 114 | continue 115 | 116 | # Check if obj_a fits at any of the sampled points without collision 117 | can_fit = False 118 | for point_3d in points_in_direction_3d: 119 | # Check collision using the 2D grid, environment OBBs, and the reference object B's OBB 120 | if can_fit_at_point(grid, box_a, occupied, point_3d[:2], environment_boxes, box_b, min_distance=min_distance, buffer_ratio=buffer_ratio, box_name=obj_a_name, box_b_name=obj_b_name, frame=frame, direction=direction, DEBUG_FIT=DEBUG_FIT): 121 | can_fit = True 122 | break # Found a valid spot for this direction 123 | 124 | results[frame][direction] = can_fit 125 | 126 | if DEBUG_FIT: 127 | # Print final fitting results if debug flag is enabled 128 | print(f"Can fit results for {obj_a_name} relative to {obj_b_name}:") 129 | print(results) 130 | 131 | return results 132 | 133 | 134 | def can_fit_on_top(top_box, base_box): 135 | """Determines if the top OrientedBoundingBox can fit horizontally on top of the base OrientedBoundingBox. 136 | 137 | Args: 138 | top_box (o3d.geometry.OrientedBoundingBox): The bounding box of the object to be placed on top. 139 | base_box (o3d.geometry.OrientedBoundingBox): The bounding box of the base object. 140 | 141 | Returns: 142 | bool: True if the top box's x and y extents are less than or equal to the base box's, 143 | False otherwise. 144 | """ 145 | base_extent = base_box.extent 146 | top_extent = top_box.extent 147 | 148 | # Simple check: Top object's horizontal dimensions must be <= base object's dimensions. 149 | # Assumes alignment of the boxes' principal axes with the world axes for this check. 150 | if (top_extent[0] <= base_extent[0] and 151 | top_extent[1] <= base_extent[1]): 152 | result = True 153 | else: 154 | result = False 155 | return result -------------------------------------------------------------------------------- /robospatial/spatial_analysis/compatibility/compatibility_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. 
Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | import numpy as np 12 | from scipy.spatial import ConvexHull 13 | import matplotlib.path as mpath 14 | import open3d as o3d 15 | 16 | from spatial_analysis.relationship_utils import project_to_floor 17 | 18 | 19 | 20 | def is_hull_within_bounds(hull_vertices, bounds): 21 | """Check if a convex hull is completely within given bounds. 22 | 23 | Args: 24 | hull_vertices (np.ndarray): Vertices of the convex hull 25 | bounds (tuple): (x_min, x_max, y_min, y_max) 26 | 27 | Returns: 28 | bool: True if hull is within bounds 29 | """ 30 | x_min, x_max, y_min, y_max = bounds 31 | hull_min = np.min(hull_vertices, axis=0) 32 | hull_max = np.max(hull_vertices, axis=0) 33 | return (hull_min[0] >= x_min and hull_max[0] <= x_max and 34 | hull_min[1] >= y_min and hull_max[1] <= y_max) 35 | 36 | 37 | 38 | 39 | 40 | def can_fit_at_point(grid, box, occupied, point, environment_boxes, box_b, min_distance=0.02, buffer_ratio=0.3, box_name="Object A", box_b_name="Object B", frame=None, direction=None, DEBUG_FIT=False): 41 | """Check if an object can fit at a given point, trying different rotations. 42 | 43 | Args: 44 | grid: The floor grid (meshgrid output: grid[0]=X, grid[1]=Y) 45 | box: The object's bounding box 46 | occupied: Occupancy grid (boolean, same shape as grid[0]) 47 | point: The point (x, y) to try placing at 48 | environment_boxes: List of all environment boxes 49 | box_b: The reference object box 50 | min_distance: Minimum distance required between objects (in meters) 51 | buffer_ratio: Ratio of buffer zone to object size 52 | box_name: Name of the object being placed 53 | box_b_name: Name of the reference object 54 | frame (str, optional): The reference frame for visualization context. 55 | direction (str, optional): The direction for visualization context. 56 | """ 57 | x_coords, y_coords = grid 58 | h, w = x_coords.shape 59 | x_min, y_min = x_coords.min(), y_coords.min() 60 | x_max, y_max = x_coords.max(), y_coords.max() 61 | bounds = (x_min, x_max, y_min, y_max) 62 | 63 | grid_res_x = 0 64 | grid_res_y = 0 65 | if w > 1: 66 | grid_res_x = x_coords[0, 1] - x_coords[0, 0] 67 | if h > 1: 68 | grid_res_y = y_coords[1, 0] - y_coords[0, 0] 69 | 70 | if grid_res_x > 1e-6 and grid_res_y > 1e-6: 71 | grid_resolution = min(grid_res_x, grid_res_y) 72 | elif grid_res_x > 1e-6: 73 | grid_resolution = grid_res_x 74 | elif grid_res_y > 1e-6: 75 | grid_resolution = grid_res_y 76 | else: 77 | grid_resolution = 0 # Indicate failure to determine resolution 78 | print(f"Warning: Could not determine valid grid resolution for buffer calculation.") 79 | 80 | buffer_size_in_cells = 0 # Default buffer size in grid cells 81 | if grid_resolution > 1e-6: # Check if grid resolution is valid 82 | buffer_size_in_cells = int(np.ceil(min_distance / grid_resolution)) 83 | else: 84 | print(f"Warning: Invalid grid resolution ({grid_resolution}). 
Setting buffer_size_in_cells to 0.") 85 | 86 | try: 87 | original_projected_points = project_to_floor(box) 88 | # Calculate the 2D center of the projected points 89 | box_center_2d = np.mean(original_projected_points, axis=0) 90 | # Calculate the initial hull 91 | initial_hull = ConvexHull(original_projected_points) 92 | # Store hull vertices relative to the projected center 93 | relative_hull_vertices = original_projected_points[initial_hull.vertices] - box_center_2d 94 | except Exception as e: 95 | return False # Cannot proceed if base hull fails 96 | 97 | rotations = [0, np.pi/4, np.pi/2] 98 | 99 | for rotation_idx, rotation in enumerate(rotations): 100 | # --- Transform the precomputed hull --- 101 | cos_theta = np.cos(rotation) 102 | sin_theta = np.sin(rotation) 103 | rotation_matrix_2d = np.array([ 104 | [cos_theta, -sin_theta], 105 | [sin_theta, cos_theta] 106 | ]) 107 | # Apply rotation to relative vertices 108 | rotated_vertices = relative_hull_vertices @ rotation_matrix_2d.T # Note the transpose for point rotation 109 | # Apply translation (move center to the target 'point') 110 | hull_vertices = rotated_vertices + point[:2] # Use only x,y from point 111 | 112 | # --- Check 1: Is hull within grid bounds? --- 113 | if not is_hull_within_bounds(hull_vertices, bounds): 114 | if DEBUG_FIT: 115 | # Need a 3D rotation matrix for visualization 116 | rotation_matrix_3d = np.array([ 117 | [cos_theta, -sin_theta, 0], 118 | [sin_theta, cos_theta, 0], 119 | [0, 0, 1] 120 | ]) 121 | visualize_placement(grid, box, occupied, point, environment_boxes, box_b, 122 | rotation_matrix_3d, f"Rotation {np.degrees(rotation):.0f}° - Out of Bounds", 123 | bounds, box_name, box_b_name, frame, direction) 124 | continue 125 | 126 | # --- Check 2: Hull Occupancy (Vectorized) --- 127 | path = mpath.Path(hull_vertices) 128 | 129 | # Find the bounding box of the hull to minimize grid points checked 130 | hull_min_x, hull_min_y = np.min(hull_vertices, axis=0) 131 | hull_max_x, hull_max_y = np.max(hull_vertices, axis=0) 132 | 133 | # Convert hull bounds to grid indices (clamp to grid dimensions) 134 | min_ix = np.clip(int(np.floor((hull_min_x - x_min) / grid_res_x)) if grid_res_x > 1e-6 else 0, 0, w - 1) 135 | max_ix = np.clip(int(np.ceil((hull_max_x - x_min) / grid_res_x)) if grid_res_x > 1e-6 else w - 1, 0, w - 1) 136 | min_iy = np.clip(int(np.floor((hull_min_y - y_min) / grid_res_y)) if grid_res_y > 1e-6 else 0, 0, h - 1) 137 | max_iy = np.clip(int(np.ceil((hull_max_y - y_min) / grid_res_y)) if grid_res_y > 1e-6 else h - 1, 0, h - 1) 138 | 139 | # Create subset of grid points and indices within the hull's bounding box 140 | sub_x, sub_y = np.meshgrid(np.arange(min_ix, max_ix + 1), np.arange(min_iy, max_iy + 1)) 141 | sub_points_x = x_coords[sub_y.ravel(), sub_x.ravel()] 142 | sub_points_y = y_coords[sub_y.ravel(), sub_x.ravel()] 143 | sub_grid_points = np.vstack((sub_points_x, sub_points_y)).T 144 | sub_grid_indices = (sub_y.ravel(), sub_x.ravel()) # Indices into the original 'occupied' grid 145 | 146 | if sub_grid_points.size == 0: 147 | if DEBUG_FIT: 148 | # Need a 3D rotation matrix for visualization 149 | rotation_matrix_3d = np.array([ 150 | [cos_theta, -sin_theta, 0], 151 | [sin_theta, cos_theta, 0], 152 | [0, 0, 1] 153 | ]) 154 | visualize_placement(grid, box, occupied, point, environment_boxes, box_b, 155 | rotation_matrix_3d, f"Rotation {np.degrees(rotation):.0f}° - No Grid Points", 156 | bounds, box_name, box_b_name, frame, direction) 157 | continue # Hull is likely too small or outside grid center 
area 158 | 159 | # Check which subset points are inside the actual hull polygon 160 | inside_hull_mask_flat = path.contains_points(sub_grid_points) 161 | 162 | # Get the indices of the grid cells that are inside the hull 163 | hull_indices_flat = tuple(idx[inside_hull_mask_flat] for idx in sub_grid_indices) 164 | 165 | # Check occupancy for points *inside* the hull 166 | occupied_inside_hull = occupied[hull_indices_flat] 167 | num_occupied_inside = np.sum(occupied_inside_hull) 168 | total_cells_inside = len(occupied_inside_hull) 169 | 170 | # --- Check 3: Buffer Zone Occupancy --- 171 | buffer_occupied_cells = 0 172 | checked_buffer_indices = set() # Keep track of checked indices to avoid double counting 173 | 174 | # Only check buffer if hull itself is not significantly occupied 175 | if total_cells_inside == 0 or num_occupied_inside / total_cells_inside <= 0.0: # Do not allow overlap 176 | # Iterate through the grid cells *inside* the hull 177 | for iy_hull, ix_hull in zip(*hull_indices_flat): 178 | # Check the neighborhood (buffer) around this hull cell 179 | for dy in range(-buffer_size_in_cells, buffer_size_in_cells + 1): 180 | for dx in range(-buffer_size_in_cells, buffer_size_in_cells + 1): 181 | if dx == 0 and dy == 0: 182 | continue # Skip the cell itself 183 | 184 | ix_buffer = ix_hull + dx 185 | iy_buffer = iy_hull + dy 186 | 187 | # Check if the buffer cell index is valid and hasn't been checked 188 | if 0 <= ix_buffer < w and 0 <= iy_buffer < h: 189 | buffer_idx_tuple = (iy_buffer, ix_buffer) 190 | if buffer_idx_tuple not in checked_buffer_indices: 191 | # Check if this buffer cell is outside the hull but occupied 192 | buffer_point = (x_coords[iy_buffer, ix_buffer], y_coords[iy_buffer, ix_buffer]) 193 | if not path.contains_point(buffer_point) and occupied[iy_buffer, ix_buffer]: 194 | buffer_occupied_cells += 1 195 | checked_buffer_indices.add(buffer_idx_tuple) 196 | 197 | # --- Final Decision for this rotation --- 198 | # Conditions: 199 | # 1. Hull must have some cells under it. 200 | # 2. Significant overlap inside the hull is not allowed. 201 | # 3. Significant occupation in the buffer zone is not allowed. 202 | fit_this_rotation = False 203 | overlap_ratio = 0 204 | current_buffer_ratio = 0 205 | 206 | if total_cells_inside > 0: 207 | overlap_ratio = num_occupied_inside / total_cells_inside 208 | current_buffer_ratio = buffer_occupied_cells / total_cells_inside # Compare buffer count to hull size 209 | 210 | # Do not allow overlap and limited buffer occupation (e.g. 
< 50%) 211 | # These thresholds might need tuning based on grid resolution and object sizes 212 | if overlap_ratio <= 0.0 and current_buffer_ratio < buffer_ratio: 213 | fit_this_rotation = True 214 | 215 | # Debug visualization 216 | if DEBUG_FIT: 217 | result_text = "SUCCESS" if fit_this_rotation else "FAILED" 218 | # Need a 3D rotation matrix for visualization 219 | rotation_matrix_3d = np.array([ 220 | [cos_theta, -sin_theta, 0], 221 | [sin_theta, cos_theta, 0], 222 | [0, 0, 1] 223 | ]) 224 | visualize_placement(grid, box, occupied, point, environment_boxes, box_b, 225 | rotation_matrix_3d, f"Rotation {np.degrees(rotation):.0f}° - {result_text} (Overlap: {overlap_ratio:.2f}, Buffer: {current_buffer_ratio:.2f})", 226 | bounds, box_name, box_b_name, frame, direction) 227 | 228 | if fit_this_rotation: 229 | return True # Found a valid rotation 230 | 231 | return False 232 | 233 | 234 | 235 | def visualize_placement(grid, box, occupied, point, environment_boxes, box_b, rotation=None, step_name="", bounds=None, box_name="Object A", box_b_name="Object B", frame=None, direction=None): 236 | """Visualize the placement attempt in a top-down view. 237 | 238 | Args: 239 | grid: The floor grid 240 | box: The object's bounding box 241 | occupied: Occupancy grid 242 | point: The point to try placing at 243 | environment_boxes: List of all environment boxes 244 | box_b: The reference object box 245 | rotation: Current rotation being tried 246 | step_name: Name of the current step 247 | bounds: Grid bounds (x_min, x_max, y_min, y_max) 248 | box_name: Name of the object being placed 249 | box_b_name: Name of the reference object 250 | frame (str, optional): The reference frame (e.g., 'objectcentric'). 251 | direction (str, optional): The direction within the frame (e.g., 'infront'). 
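        Note:
            This helper is only invoked when the DEBUG_FIT flag (defined in compatibility.py and
            passed through can_fit_at_point) is enabled. It imports matplotlib locally and opens a
            blocking window via plt.show(), so it is intended for interactive debugging rather than
            batch generation runs.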
252 | """ 253 | import matplotlib.pyplot as plt 254 | from matplotlib.patches import Rectangle, Polygon 255 | 256 | # Create figure 257 | plt.figure(figsize=(10, 10)) 258 | 259 | # Create color-coded grid 260 | color_grid = np.zeros((*occupied.shape, 3), dtype=np.uint8) 261 | color_grid[occupied] = [255, 0, 0] # Red for occupied 262 | color_grid[~occupied] = [0, 255, 0] # Green for empty 263 | 264 | # Plot the grid 265 | plt.imshow(color_grid, 266 | extent=[grid[0].min(), grid[0].max(), grid[1].min(), grid[1].max()], 267 | origin='lower') 268 | 269 | # Plot environment boxes 270 | for env_box in environment_boxes: 271 | if env_box == box_b: 272 | color = 'magenta' # Reference box 273 | label = box_b_name 274 | else: 275 | color = 'red' 276 | label = 'Environment' 277 | corners = project_to_floor(env_box) 278 | plt.fill(corners[:, 0], corners[:, 1], color, alpha=0.3, label=label) 279 | plt.plot(corners[:, 0], corners[:, 1], color) 280 | 281 | # Plot the current placement attempt 282 | if rotation is not None: 283 | translated_box = o3d.geometry.OrientedBoundingBox() 284 | translated_box.center = np.array([point[0], point[1], box.extent[2] / 2]) 285 | translated_box.R = rotation 286 | translated_box.extent = box.extent 287 | 288 | corners = project_to_floor(translated_box) 289 | plt.fill(corners[:, 0], corners[:, 1], 'blue', alpha=0.3, label=box_name) 290 | plt.plot(corners[:, 0], corners[:, 1], 'blue') 291 | 292 | # Plot the point 293 | plt.scatter(point[0], point[1], c='yellow', s=100, marker='*', label='Target Point') 294 | 295 | # Add title and labels 296 | title = f"Placement Attempt: {step_name}" 297 | if frame and direction: 298 | title += f" ({frame} - {direction} relative to {box_b_name})" 299 | else: 300 | title += f" (relative to {box_b_name})" 301 | plt.title(title) 302 | plt.xlabel('X') 303 | plt.ylabel('Y') 304 | 305 | # Show grid bounds if provided 306 | if bounds: 307 | x_min, x_max, y_min, y_max = bounds 308 | plt.axvline(x=x_min, color='k', linestyle='--') 309 | plt.axvline(x=x_max, color='k', linestyle='--') 310 | plt.axhline(y=y_min, color='k', linestyle='--') 311 | plt.axhline(y=y_max, color='k', linestyle='--') 312 | 313 | plt.grid(True) 314 | plt.legend() 315 | plt.show() -------------------------------------------------------------------------------- /robospatial/spatial_analysis/configuration/configuration.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | import numpy as np 12 | 13 | from spatial_analysis.configuration.configuration_utils import get_object_metrics 14 | 15 | 16 | def check_spatial_configuration_relationships(obj1, obj2, extrinsic, intrinsic, image_size, individual_occupancy_maps, strictness='lenient'): 17 | """Calculates the spatial relationship between two 3D objects (obj1 relative to obj2) from multiple perspectives. 18 | 19 | This function determines relationships like 'left', 'right', 'in front of', 'behind', 'above', 20 | 'below', and 'overlapping' based on the objects' oriented bounding boxes (OBBs). 
It leverages 21 | pre-calculated object metrics (pixel projections, depth ranges, world coordinates) obtained 22 | via `get_object_metrics`. 23 | 24 | Relationships are assessed in three reference frames: 25 | 1. **camera_centric**: Based on the objects' 2D projections onto the image plane (for left/right) 26 | and their depth relative to the camera (for in front/behind). Vertical relationships (above/below) 27 | in this frame are determined using world Z coordinates. 28 | 2. **world_centric**: Uses the same logic as camera_centric for horizontal and depth relationships 29 | in this implementation, but explicitly defines overlap based on world Z-axis separation. 30 | 3. **object_centric**: Determines relationships based on the relative position of obj1's center 31 | with respect to obj2's orientation (forward and right vectors). It uses the Separating Axis 32 | Theorem (SAT) to check for OBB overlap in 3D, influencing directional judgments. Vertical 33 | relationships (above/below) use the world Z coordinates. 34 | 35 | The `strictness` parameter controls the calculation logic for camera_centric and world_centric: 36 | - **'strict'**: Requires clear separation based on the minimum and maximum bounds of the objects' 37 | metrics (pixel coordinates, depth values, world Z). Objects are considered overlapping if their 38 | bounds intersect, even slightly. This mode is sensitive to full visibility. 39 | - **'lenient'**: Uses object centers (for pixel projection), average visible depth, and a combination 40 | of average/min/max world Z coordinates. It's more robust to partial occlusions or near overlaps. It may still use strict bounds checks in ambiguous cases (e.g., very close average depths). 41 | 42 | Args: 43 | obj1 (dict): First object, containing at least 'name' (str) and 'obb' (open3d.geometry.OrientedBoundingBox). 44 | obj2 (dict): Second object, with the same structure as obj1. 45 | extrinsic (np.ndarray): 4x4 extrinsic matrix representing the camera-to-world transformation. 46 | intrinsic (np.ndarray): 3x3 or 4x4 camera intrinsic matrix. Only the top-left 3x3 portion is used. 47 | image_size (tuple): A tuple representing the image size as (width, height). 48 | individual_occupancy_maps (dict): A dictionary containing precomputed individual occupancy maps for objects, 49 | used by `get_object_metrics`. Keys should match object names. 50 | strictness (str, optional): The mode for relationship checks ('strict' or 'lenient'). Defaults to 'lenient'. 51 | 52 | Returns: 53 | dict: A dictionary containing boolean spatial relationships for each reference frame ('camera_centric', 54 | 'world_centric', 'object_centric'). Each frame contains keys: 'left', 'right', 'infront', 55 | 'behind', 'above', 'below', 'overlapping'. 
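        For reference, the returned dictionary has the shape sketched below (the boolean values
        are illustrative only; actual results depend on the scene geometry and the chosen
        strictness):

            {
                "camera_centric": {"left": True, "right": False, "infront": False,
                                   "behind": True, "above": False, "below": False,
                                   "overlapping": False},
                "world_centric": {"left": True, "right": False, "infront": False,
                                  "behind": True, "above": False, "below": False,
                                  "overlapping": False},
                "object_centric": {"left": False, "right": True, "infront": False,
                                   "behind": False, "above": False, "below": False,
                                   "overlapping": False},
            }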
56 | """ 57 | EPSILON = 1e-6 # Tolerance for floating point comparisons 58 | 59 | 60 | box1_metrics = get_object_metrics(obj1, extrinsic, intrinsic, image_size, individual_occupancy_maps) 61 | box2_metrics = get_object_metrics(obj2, extrinsic, intrinsic, image_size, individual_occupancy_maps) 62 | 63 | # Clamp negative world Z values to 0 based on the assumption that nothing is below ground 64 | for metrics in [box1_metrics, box2_metrics]: 65 | if metrics.get('world_z_min') is not None and metrics['world_z_min'] < 0: 66 | metrics['world_z_min'] = 0.0 67 | if metrics.get('world_z_max') is not None and metrics['world_z_max'] < 0: 68 | # If max is negative, min must also be negative (or None), so both are clamped to 0 69 | metrics['world_z_max'] = 0.0 70 | if metrics.get('world_z_avg') is not None and metrics['world_z_avg'] < 0: 71 | metrics['world_z_avg'] = 0.0 72 | 73 | # Nested Function to Determine Relationships Based on Metrics 74 | def determine_camera_and_world_relationship(m1, m2): 75 | """Determines strict and lenient relationships based on pre-calculated metrics.""" 76 | 77 | # Strict Relationship Calculation 78 | def get_relationship_strict(): 79 | horizontal_relation = "overlapping" 80 | # Check visibility before comparing strict horizontal bounds 81 | if m1.get('max_px') is not None and m2.get('min_px') is not None and m1['max_px'] < m2['min_px'] - EPSILON: 82 | horizontal_relation = "left" 83 | elif m1.get('min_px') is not None and m2.get('max_px') is not None and m1['min_px'] > m2['max_px'] + EPSILON: 84 | horizontal_relation = "right" 85 | 86 | depth_relation = "overlapping" 87 | # Check visibility before comparing strict depth bounds 88 | if m1.get('max_depth') is not None and m2.get('min_depth') is not None and m1['max_depth'] < m2['min_depth'] - EPSILON: 89 | depth_relation = "in front of" 90 | elif m1.get('min_depth') is not None and m2.get('max_depth') is not None and m1['min_depth'] > m2['max_depth'] + EPSILON: 91 | depth_relation = "behind" 92 | 93 | vertical_world_relation = "overlapping" 94 | # World Z is always calculated 95 | if m1.get('world_z_max') is not None and m2.get('world_z_min') is not None and m1['world_z_max'] < m2['world_z_min'] - EPSILON: 96 | vertical_world_relation = "below" 97 | elif m1.get('world_z_min') is not None and m2.get('world_z_max') is not None and m1['world_z_min'] > m2['world_z_max'] + EPSILON: 98 | vertical_world_relation = "above" 99 | 100 | return { 101 | "left": horizontal_relation == "left", 102 | "right": horizontal_relation == "right", 103 | "infront": depth_relation == "in front of", 104 | "behind": depth_relation == "behind", 105 | "cam_overlapping": horizontal_relation == "overlapping", # Overlap based on pixel projection 106 | "above": vertical_world_relation == "above", 107 | "below": vertical_world_relation == "below", 108 | "world_overlapping": vertical_world_relation == "overlapping" # Overlap based on world Z 109 | } 110 | 111 | # Lenient Relationship Calculation 112 | def get_relationship_lenient(): 113 | # Check if centers are comparable 114 | centers_comparable = m1.get('center_px') is not None and m2.get('center_px') is not None 115 | 116 | horizontal_relation = "overlapping" 117 | if centers_comparable: 118 | # Check containment using pixel bounds 119 | box1_center_in_box2_px = (m1.get('min_px') is not None and # Check all required bounds exist 120 | m1.get('max_px') is not None and 121 | m1.get('min_py') is not None and 122 | m1.get('max_py') is not None and 123 | m2.get('min_px') is not None and 124 | 
m2.get('max_px') is not None and 125 | m2.get('min_py') is not None and 126 | m2.get('max_py') is not None and 127 | m2['min_px'] <= m1['center_px'][0] <= m2['max_px'] and 128 | m2['min_py'] <= m1['center_px'][1] <= m2['max_py']) 129 | 130 | box2_center_in_box1_px = (m1.get('min_px') is not None and # Check all required bounds exist 131 | m1.get('max_px') is not None and 132 | m1.get('min_py') is not None and 133 | m1.get('max_py') is not None and 134 | m2.get('min_px') is not None and 135 | m2.get('max_px') is not None and 136 | m2.get('min_py') is not None and 137 | m2.get('max_py') is not None and 138 | m1['min_px'] <= m2['center_px'][0] <= m1['max_px'] and 139 | m1['min_py'] <= m2['center_px'][1] <= m1['max_py']) 140 | 141 | if m1['center_px'][0] < m2['center_px'][0] - EPSILON and not box2_center_in_box1_px: 142 | horizontal_relation = "left" 143 | elif m1['center_px'][0] > m2['center_px'][0] + EPSILON and not box1_center_in_box2_px: 144 | horizontal_relation = "right" 145 | 146 | # Lenient depth check based on average visible depth 147 | depth_relation = "overlapping" 148 | avg_depths_comparable = m1.get('visible_depth_avg') is not None and m2.get('visible_depth_avg') is not None 149 | if avg_depths_comparable: 150 | # Optional: Add hybrid check using strict bounds if averages are close 151 | if abs(m1['visible_depth_avg'] - m2['visible_depth_avg']) < EPSILON: 152 | # Averages are close, fall back to strict check only if strictly separated 153 | if m1.get('max_depth') is not None and m2.get('min_depth') is not None and m1['max_depth'] < m2['min_depth'] - EPSILON: 154 | depth_relation = "in front of" 155 | elif m1.get('min_depth') is not None and m2.get('max_depth') is not None and m1['min_depth'] > m2['max_depth'] + EPSILON: 156 | depth_relation = "behind" 157 | # else: stays overlapping 158 | elif m1['visible_depth_avg'] < m2['visible_depth_avg']: 159 | depth_relation = "in front of" 160 | elif m1['visible_depth_avg'] > m2['visible_depth_avg']: 161 | depth_relation = "behind" 162 | 163 | # Lenient vertical check based on world Z, prioritizing separation 164 | vertical_world_relation = "overlapping" # Default to overlapping 165 | 166 | # Check if metrics are available for comparison 167 | m1_z_max = m1.get('world_z_max') 168 | m1_z_min = m1.get('world_z_min') 169 | m1_z_avg = m1.get('world_z_avg') 170 | m2_z_max = m2.get('world_z_max') 171 | m2_z_min = m2.get('world_z_min') 172 | m2_z_avg = m2.get('world_z_avg') 173 | 174 | all_metrics_exist = all(v is not None for v in [m1_z_max, m1_z_min, m1_z_avg, m2_z_max, m2_z_min, m2_z_avg]) 175 | 176 | if all_metrics_exist: 177 | # 1. Check strict separation first 178 | if m1_z_max < m2_z_min - EPSILON: 179 | vertical_world_relation = "below" 180 | elif m1_z_min > m2_z_max + EPSILON: 181 | vertical_world_relation = "above" 182 | # 2. 
If strictly overlapping, check lenient conditions (avg vs max/min) 183 | else: 184 | # Check if average of 1 is above max of 2 185 | if m1_z_avg > m2_z_max + EPSILON: 186 | vertical_world_relation = "above" 187 | # Check if average of 2 is above max of 1 (meaning 1 is below 2) 188 | elif m2_z_avg > m1_z_max + EPSILON: 189 | vertical_world_relation = "below" 190 | # Otherwise, they remain overlapping 191 | 192 | return { 193 | "left": horizontal_relation == "left", 194 | "right": horizontal_relation == "right", 195 | "infront": depth_relation == "in front of", 196 | "behind": depth_relation == "behind", 197 | "cam_overlapping": horizontal_relation == "overlapping", # Overlap based on pixel projection centroid logic 198 | "above": vertical_world_relation == "above", 199 | "below": vertical_world_relation == "below", 200 | "world_overlapping": vertical_world_relation == "overlapping" # Overlap based on world Z average logic 201 | } 202 | 203 | # Return both results 204 | return { 205 | "strict": get_relationship_strict(), 206 | "lenient": get_relationship_lenient() 207 | } 208 | 209 | # Calculate Camera/World Relationships 210 | # Object visibility/comparability is handled by None checks within determine_camera_and_world_relationship. 211 | cam_world_relations = determine_camera_and_world_relationship(box1_metrics, box2_metrics) 212 | 213 | # Calculate Object Centric Relationships 214 | def get_object_centric_relationship(obj1, obj2): 215 | def get_facing_direction(box): 216 | rotation_matrix = np.asarray(box.R) 217 | forward_direction = rotation_matrix[:, 0] 218 | return forward_direction 219 | 220 | def check_overlap(box1, box2): 221 | box1_points = np.asarray(box1.get_box_points()) 222 | box2_points = np.asarray(box2.get_box_points()) 223 | 224 | def project_points(points, axis): 225 | return np.dot(points, axis) 226 | 227 | def overlap_on_axis(box1_proj, box2_proj): 228 | box1_min, box1_max = np.min(box1_proj), np.max(box1_proj) 229 | box2_min, box2_max = np.min(box2_proj), np.max(box2_proj) 230 | return not (box1_max < box2_min or box2_max < box1_min) 231 | 232 | # Use OBB axes for Separating Axis Theorem (more robust than just world axes diffs) 233 | axes = [] 234 | axes.extend(box1.R.T) # Box 1 axes 235 | axes.extend(box2.R.T) # Box 2 axes 236 | # Add cross products of axes (simplified common implementation) 237 | # Calculate the 9 potential separating axes derived from cross products 238 | # of each edge direction of box1 with each edge direction of box2. 239 | # Since OBB axes are parallel to edge directions, we cross the axes vectors. 240 | for i in range(3): 241 | for j in range(3): 242 | # Cross product of box1 axis i and box2 axis j 243 | cross_product = np.cross(box1.R[:, i], box2.R[:, j]) 244 | if np.linalg.norm(cross_product) > EPSILON: # Avoid zero vectors 245 | axes.append(cross_product / np.linalg.norm(cross_product)) 246 | 247 | for axis in axes: 248 | if not overlap_on_axis(project_points(box1_points, axis), 249 | project_points(box2_points, axis)): 250 | # Separating axis found, no overlap 251 | return False 252 | 253 | # If no separating axis is found by SAT, the OBBs are considered overlapping. 
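            # Up to 15 candidate axes are tested above (3 face normals from each box plus the
            # 9 pairwise edge-direction cross products); near-zero cross products are skipped
            # because parallel edge directions cannot yield a new separating axis.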
254 | return True 255 | 256 | # Simplified overlap check based on SAT result for object-centric logic 257 | overlap_obj_centric = check_overlap(obj1["obb"], obj2["obb"]) 258 | 259 | obj2_forward = get_facing_direction(obj2["obb"]) 260 | obj1_center = np.mean(np.asarray(obj1["obb"].get_box_points()), axis=0) 261 | obj2_center = np.asarray(obj2["obb"].get_center()) 262 | 263 | relative_position = obj1_center - obj2_center 264 | dot_product = np.dot(relative_position, obj2_forward) 265 | # Use Up vector (assuming Z is up) for cross product for left/right relative to forward 266 | world_up = np.array([0, 0, 1]) 267 | obj2_right = np.cross(obj2_forward, world_up) 268 | # Ensure obj2_right is normalized if needed, though only sign matters for dot product 269 | if np.linalg.norm(obj2_right) > EPSILON: 270 | obj2_right /= np.linalg.norm(obj2_right) 271 | else: 272 | # Handle cases where forward is aligned with up (e.g. object pointing straight up/down) 273 | # Use world X or Y as a fallback 'right' ? This needs careful thought. 274 | # For now, if right vector is invalid, horizontal relation is ambiguous/overlapping 275 | obj2_right = None 276 | 277 | horizontal_dot = np.dot(relative_position, obj2_right) if obj2_right is not None else 0 278 | 279 | 280 | # Object-centric depth uses dot product with forward vector 281 | depth_relation = "overlapping" 282 | if not overlap_obj_centric: # Only assign directional if not overlapping 283 | if dot_product > EPSILON: 284 | depth_relation = "in front of" 285 | elif dot_product < -EPSILON: 286 | depth_relation = "behind" 287 | # else: stays overlapping (or on the plane) 288 | 289 | # Object-centric horizontal uses dot product with right vector 290 | horizontal_relation = "overlapping" 291 | if not overlap_obj_centric and obj2_right is not None: # Only assign if not overlapping and right vector is valid 292 | if horizontal_dot > EPSILON: # Project onto right vector: positive is "right" 293 | horizontal_relation = "right" 294 | elif horizontal_dot < -EPSILON: # Negative is "left" 295 | horizontal_relation = "left" 296 | # else: stays overlapping (or directly in front/behind) 297 | 298 | return horizontal_relation, depth_relation 299 | 300 | obj_centric_horizontal, obj_centric_depth = get_object_centric_relationship(obj1, obj2) 301 | 302 | # Select strict or lenient results based on parameter 303 | chosen_relation = cam_world_relations.get(strictness, cam_world_relations['lenient']) # Default to lenient 304 | 305 | # Assemble Final Result 306 | relationships = { 307 | "camera_centric": { 308 | "left": chosen_relation["left"], 309 | "right": chosen_relation["right"], 310 | "infront": chosen_relation["infront"], 311 | "behind": chosen_relation["behind"], 312 | # Use world vertical for camera frame above/below 313 | "above": chosen_relation["above"], 314 | "below": chosen_relation["below"], 315 | "overlapping": chosen_relation["cam_overlapping"], 316 | }, 317 | "world_centric": { 318 | # World uses same planar relationships as camera in this implementation 319 | "left": chosen_relation["left"], 320 | "right": chosen_relation["right"], 321 | "infront": chosen_relation["infront"], 322 | "behind": chosen_relation["behind"], 323 | "above": chosen_relation["above"], 324 | "below": chosen_relation["below"], 325 | "overlapping": chosen_relation["world_overlapping"] # Use Z-based overlap here 326 | }, 327 | "object_centric": { 328 | "left": obj_centric_horizontal == "left", 329 | "right": obj_centric_horizontal == "right", 330 | "infront": obj_centric_depth == "in front 
of", 331 | "behind": obj_centric_depth == "behind", 332 | # Use world vertical for object frame above/below 333 | "above": chosen_relation["above"], 334 | "below": chosen_relation["below"], 335 | # Object centric overlap combines horizontal and depth states 336 | "overlapping": obj_centric_horizontal == "overlapping" or obj_centric_depth == "overlapping" 337 | } 338 | } 339 | 340 | return relationships -------------------------------------------------------------------------------- /robospatial/spatial_analysis/configuration/configuration_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | import numpy as np 11 | 12 | from spatial_analysis.relationship_utils import get_min_max_visible_depth_for_box 13 | 14 | 15 | # --- Calculate Metrics for Both Objects --- 16 | def get_object_metrics(obj, extrinsic, intrinsic, image_size, individual_occupancy_maps): 17 | metrics = {} 18 | obb = obj.get('obb') # Use .get for safety 19 | obj_name = obj.get('name') # Get object name 20 | 21 | if obb is None or obj_name is None: # Check if retrieval failed 22 | print(f"Warning: Skipping metrics calculation due to missing 'obb' or 'name' in object: {obj}") 23 | return {'is_visible': False} # Return basic structure indicating not visible 24 | 25 | world_coords = np.asarray(obb.get_box_points()) 26 | 27 | # Pixel Occupancy & 2D Metrics 28 | # --- Retrieve pre-calculated occupancy map --- 29 | occupancy_map = individual_occupancy_maps.get(obj_name) 30 | if occupancy_map is None: 31 | # Removed fallback calculation, error is printed instead. 32 | print(f"Error: Occupancy map not found for '{obj_name}' in get_object_metrics. 
Aborting.") 33 | return {'is_visible': False} 34 | 35 | occupied_coords = np.argwhere(occupancy_map) # Shape (N, 2), cols=[row, col] 36 | 37 | # Visible Depth Range 38 | min_depth, max_depth = get_min_max_visible_depth_for_box(obb, extrinsic, intrinsic, image_size) 39 | 40 | # Determine overall visibility 41 | metrics['is_visible'] = occupied_coords.size > 0 and min_depth is not None 42 | 43 | if metrics['is_visible']: 44 | metrics['min_px'] = np.min(occupied_coords[:, 1]) # Min X pixel 45 | metrics['max_px'] = np.max(occupied_coords[:, 1]) # Max X pixel 46 | metrics['min_py'] = np.min(occupied_coords[:, 0]) # Min Y pixel 47 | metrics['max_py'] = np.max(occupied_coords[:, 0]) # Max Y pixel 48 | metrics['center_px'] = np.mean(occupied_coords[:, ::-1], axis=0) # Centroid (x, y) 49 | metrics['min_depth'] = min_depth 50 | metrics['max_depth'] = max_depth 51 | metrics['visible_depth_avg'] = (min_depth + max_depth) / 2.0 52 | else: 53 | # Set pixel/depth metrics to None if not visible 54 | metrics['min_px'] = metrics['max_px'] = metrics['min_py'] = metrics['max_py'] = None 55 | metrics['center_px'] = None 56 | metrics['min_depth'] = metrics['max_depth'] = None 57 | metrics['visible_depth_avg'] = None 58 | 59 | # World Z Metrics (calculated regardless of visibility) 60 | metrics['world_z_min'] = np.min(world_coords[:, 2]) 61 | metrics['world_z_max'] = np.max(world_coords[:, 2]) 62 | # Handle potential empty world_coords if needed, though unlikely for OBB 63 | if world_coords.size > 0: 64 | metrics['world_z_avg'] = np.mean(world_coords[:, 2]) 65 | else: 66 | metrics['world_z_avg'] = None # Should not happen with OBB 67 | 68 | return metrics -------------------------------------------------------------------------------- /robospatial/spatial_analysis/context/context_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | 12 | import numpy as np 13 | from matplotlib.path import Path 14 | 15 | 16 | 17 | 18 | 19 | 20 | def compute_distances_to_bbox(points, bbox_corners): 21 | """ 22 | Compute the shortest distance from multiple points to a bounding box (convex polygon). 23 | 24 | Parameters: 25 | - points: A NumPy array of shape (N, 2) representing N points. 26 | - bbox_corners: A NumPy array of shape (M, 2) representing the corners of the bounding box in order. 27 | 28 | Returns: 29 | - distances: A NumPy array of shape (N,) containing the shortest distances from each point to the bounding box. 
30 | """ 31 | # Create a Path object for the bounding box 32 | path = Path(bbox_corners) 33 | 34 | # Determine which points are inside the bounding box 35 | inside = path.contains_points(points) 36 | 37 | # Initialize distances array 38 | distances = np.zeros(points.shape[0]) 39 | 40 | # Points outside the polygon 41 | outside_points = points[~inside] 42 | 43 | if outside_points.size > 0: 44 | num_edges = bbox_corners.shape[0] 45 | distances_outside = np.full(outside_points.shape[0], np.inf) 46 | 47 | # Compute distances from points to each edge 48 | for i in range(num_edges): 49 | A = bbox_corners[i] 50 | B = bbox_corners[(i + 1) % num_edges] 51 | AB = B - A 52 | AB_squared = np.dot(AB, AB) 53 | 54 | if AB_squared == 0: 55 | # A and B are the same point 56 | distances_edge = np.linalg.norm(outside_points - A, axis=1) 57 | else: 58 | AP = outside_points - A 59 | t = np.dot(AP, AB) / AB_squared 60 | t = np.clip(t, 0, 1) 61 | closest = A + t[:, np.newaxis] * AB 62 | distances_edge = np.linalg.norm(outside_points - closest, axis=1) 63 | 64 | distances_outside = np.minimum(distances_outside, distances_edge) 65 | 66 | # Assign distances to the corresponding points 67 | distances[~inside] = distances_outside 68 | 69 | # Points inside have zero distance 70 | distances[inside] = 0.0 71 | 72 | return distances 73 | 74 | 75 | def project_points_to_image(points, extrinsic, intrinsic): 76 | extrinsic_w2c = np.linalg.inv(extrinsic) 77 | points = np.concatenate([points, np.ones((points.shape[0], 1))], axis=1) 78 | points_img = intrinsic @ extrinsic_w2c @ points.transpose() 79 | points_img = points_img.transpose() 80 | 81 | # Normalize homogeneous coordinates 82 | w = points_img[:, 3] 83 | points_img = points_img[:, :3] / w[:, np.newaxis] 84 | 85 | # Initialize output arrays 86 | points_pixel = points_img[:, :2] / points_img[:, 2][:, np.newaxis] 87 | points_depth = points_img[:, 2] 88 | 89 | return np.round(points_pixel).astype(int).tolist(), points_depth.tolist() -------------------------------------------------------------------------------- /robospatial/spatial_analysis/grounding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | import numpy as np 12 | 13 | def get_object_grounding(obj, occupancy_map): 14 | """ 15 | Generates 2D bounding boxes based on pixel occupancy for a given 3D object. 16 | Uses a pre-calculated occupancy map. 17 | 18 | Args: 19 | obj (dict): A single object dict with at least 'obb' and 'name'. 20 | extrinsic (np.ndarray): Extrinsic matrix. 21 | intrinsic (np.ndarray): Intrinsic matrix. 22 | image_size (tuple): (width, height). 23 | occupancy_map (np.ndarray): Pre-calculated 2D boolean occupancy map for this object. 
24 | 
25 |     Returns:
26 |         dict or None: A dictionary containing:
27 |                       - 'name': Object name (str)
28 |                       - 'clipped_bbox': Axis-aligned clipped [xmin, ymin, xmax, ymax] (list)
29 |                       - 'bbox_3d': Original 3D bounding box coordinates (list) - if available in input
30 |                       - 'obb': Original OrientedBoundingBox - needed later for occupancy map
31 |                       Returns None if the object has no OBB, the occupancy map is missing, or the object is not visible.
32 |     """
33 |     if 'obb' not in obj:
34 |         print(f"Warning: Skipping object {obj.get('name', 'Unknown')} because 'obb' is missing.")
35 |         return None
36 |     if occupancy_map is None:
37 |         print(f"Warning: Occupancy map is None for object {obj.get('name', 'Unknown')}. Cannot calculate grounding.")
38 |         return None
39 | 
40 |     # Use the provided occupancy_map
41 |     occupied_coords = np.argwhere(occupancy_map) # Shape (N, 2), cols=[row, col] -> (y, x)
42 | 
43 |     if occupied_coords.size == 0:
44 |         # Object does not project onto any pixels according to the map
45 |         return None
46 | 
47 |     # --- Calculate Clipped Axis-Aligned BBox from Occupied Pixels ---
48 |     # Remember: occupied_coords are (row, col) -> (y, x)
49 |     clipped_min_y = np.min(occupied_coords[:, 0])
50 |     clipped_max_y = np.max(occupied_coords[:, 0])
51 |     clipped_min_x = np.min(occupied_coords[:, 1])
52 |     clipped_max_x = np.max(occupied_coords[:, 1])
53 | 
54 |     clipped_coords_bbox = [clipped_min_x, clipped_min_y, clipped_max_x, clipped_max_y]
55 | 
56 |     # --- Store Info ---
57 |     info = {
58 |         'name': obj.get("name"),
59 |         'clipped_bbox': [float(c) for c in clipped_coords_bbox], # Ensure floats
60 |         'bbox_3d': obj.get("bbox_3d"),
61 |         'obb': obj['obb']
62 |     }
63 |     # Return the dictionary directly, not a list containing the dictionary
64 |     return info
--------------------------------------------------------------------------------
/robospatial/spatial_analysis/obj_properties.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # NVIDIA CORPORATION and its licensors retain all intellectual
4 | # property and proprietary rights in and to this material, related
5 | # documentation and any modifications thereto. Any use, reproduction,
6 | # disclosure or distribution of this material and related documentation
7 | # without an express license agreement from NVIDIA CORPORATION or
8 | # its affiliates is strictly prohibited.
9 | 
10 | """Defines default lists of object categories based on physical properties.
11 | 
12 | These lists categorize objects based on characteristics relevant for spatial
13 | analysis tasks, such as determining potential interactions or relationships.
14 | For example, knowing if an object has a discernible 'face' can influence
15 | contextual relationship calculations (e.g., 'in front of'), and knowing if an
16 | object is movable and placeable on a flat surface is crucial for compatibility
17 | checks like 'on top of'.
18 | 
19 | The lists provided here are defaults and are intended to be user-configurable.
20 | Users can modify these lists directly or replace them based on the specific
21 | object categories present in their datasets or required for their analysis tasks.
22 | 
23 | Defined Lists:
24 | - `items_with_face`: Objects typically having a primary interaction face.
25 | - `flat_surface_items`: Objects typically providing a flat surface for placement.
26 | - `movable_items`: Objects generally considered movable. 27 | - `movable_and_placeable_items`: A subset of movable items suitable for placing on surfaces. 28 | - `embodiedscan_objs`: A comprehensive list of object categories found in the EmbodiedScan dataset (provided for reference). 29 | """ 30 | 31 | # GPT-generated item filter list 32 | 33 | items_with_face = [ 34 | 'air conditioner', 'alarm', 'album', 'bed', 'bicycle', 'blackboard', 35 | 'book', 'camera', 'car', 'cabinet', 'calendar', 'clock', 'computer', 36 | 'copier', 'couch', 'desk', 'dishwasher', 'door', 'drawer', 'drum', 37 | 'fan', 'fireplace', 'garage door', 'guitar', 'heater', 'humidifier', 'kettle', 38 | 'keyboard', 'laptop', 'mailbox', 'microwave', 'mirror', 'monitor', 'oven', 39 | 'piano', 'picture', 'projector', 'refrigerator', 'screen', 'shelf', 'sink', 'stand', 40 | 'speaker', 'stairs', 'stove', 'tablet', 'telephone', 'tv', 41 | 'toilet', 'vacuum cleaner', 'washing machine', 'window' 42 | ] 43 | 44 | 45 | flat_surface_items = [ 46 | 'bar', 'beam', 'bed', 'bench', 'cabinet', 'cart', 47 | 'chair', 'counter', 'countertop', 'crate', 48 | 'cube', 'desk', 'dresser', 'footstool', 49 | 'kitchen island', 'mat', 'mattress', 'ottoman', 50 | 'package', 'panel', 'pedestal', 'pool table', 51 | 'rack', 'rail', 'shelf', 'stand', 'steps', 'stool', 'structure', 'support', 52 | 'table', 'tray', 'vanity', 'windowsill' 53 | ] 54 | 55 | movable_items = [ 56 | 'adhesive tape', 'air conditioner', 'alarm', 'album', 'backpack', 'bag', 'ball', 57 | 'basket', 'beanbag', 'bicycle', 'bin', 'blackboard', 'blanket', 'board', 58 | 'body loofah', 'book', 'boots', 'bottle', 'bowl', 'box', 'bread', 59 | 'broom', 'brush', 'bucket', 'camera', 'can', 'candle', 'candlestick', 60 | 'cap', 'car', 'carpet', 'cart', 'case', 'chair', 'cleanser', 61 | 'clock', 'clothes', 'clothes dryer', 'coat hanger', 'coffee maker', 'commode', 'computer', 62 | 'conducting wire', 'container', 'control', 'copier', 'cosmetics', 'crib', 'cup', 63 | 'curtain', 'cushion', 'decoration', 'desk', 'detergent', 'device', 'dish rack', 64 | 'dispenser', 'divider', 'drawer', 'dress', 'dresser', 'drum', 'dumbbell', 65 | 'dustpan', 'dvd', 'eraser', 'excercise equipment', 'fan', 'file', 'fire extinguisher', 66 | 'flowerpot', 'folder', 'food', 'footstool', 'frame', 'fruit', 'furniture', 67 | 'garbage', 'glass', 'globe', 'glove', 'guitar', 'hair dryer', 'hamper', 68 | 'hat', 'headphones', 'heater', 'helmets', 'holder', 'hook', 'humidifier', 69 | 'jacket', 'jar', 'kettle', 'keyboard', 'kitchenware', 'knife', 'label', 70 | 'ladder', 'lamp', 'laptop', 'letter', 'light', 'luggage', 'machine', 71 | 'magazine', 'mailbox', 'map', 'mask', 'mat', 'mattress', 'menu', 72 | 'microwave', 'mirror', 'mop', 'mouse', 'napkins', 'notebook', 'object', 73 | 'ottoman', 'oven', 'pack', 'package', 'pad', 'pan', 'paper', 74 | 'paper cutter', 'pen', 'pillow', 'pitcher', 'plant', 'plate', 'player', 75 | 'plug', 'plunger', 'pool', 'pool table', 'poster', 'pot', 'price tag', 76 | 'printer', 'projector', 'purse', 'rack', 'radiator', 'radio', 'remote control', 77 | 'roll', 'rope', 'sack', 'salt', 'scale', 'scissors', 'screen', 78 | 'seasoning', 'shampoo', 'sheet', 'shirt', 'shoe', 'shovel', 'sign', 79 | 'soap', 'soap dish', 'soap dispenser', 'speaker', 'sponge', 'spoon', 'stand', 80 | 'stapler', 'statue', 'stick', 'stool', 'sunglasses', 'tablet', 'teapot', 81 | 'telephone', 'tissue', 'tissue box', 'toaster', 'toiletry', 'tool', 'toothbrush', 82 | 'toothpaste', 'towel', 'toy', 'tray', 'treadmill', 'trophy', 'tube', 83 | 'tv', 
'umbrella', 'urn', 'utensil', 'vacuum cleaner', 'vase', 'wardrobe', 84 | 'washbasin', 'washing machine', 'wine', 'wrap' 85 | ] 86 | 87 | 88 | movable_and_placeable_items = [ 89 | 'adhesive tape', 90 | 'alarm', 91 | 'album', 92 | 'backpack', 93 | 'bag', 94 | 'ball', 95 | 'basket', 96 | 'beanbag', 97 | 'bicycle', 98 | 'bin', 99 | 'blanket', 100 | 'board', 101 | 'body loofah', 102 | 'book', 103 | 'boots', 104 | 'bottle', 105 | 'bowl', 106 | 'box', 107 | 'bread', 108 | 'broom', 109 | 'brush', 110 | 'bucket', 111 | 'calendar', 112 | 'camera', 113 | 'can', 114 | 'candle', 115 | 'candlestick', 116 | 'cap', 117 | 'carpet', 118 | 'cart', 119 | 'case', 120 | 'chair', 121 | 'cleanser', 122 | 'clock', 123 | 'clothes', 124 | 'coat hanger', 125 | 'coffee maker', 126 | 'coil', 127 | 'computer', 128 | 'conducting wire', 129 | 'container', 130 | 'control', 131 | 'cosmetics', 132 | 'crate', 133 | 'cube', 134 | 'cup', 135 | 'curtain', 136 | 'cushion', 137 | 'decoration', 138 | 'detergent', 139 | 'device', 140 | 'dish rack', 141 | 'dispenser', 142 | 'door knob', 143 | 'drawer', 144 | 'dress', 145 | 'drum', 146 | 'dumbbell', 147 | 'dustpan', 148 | 'dvd', 149 | 'eraser', 150 | 'fan', 151 | 'file', 152 | 'fire extinguisher', 153 | 'flowerpot', 154 | 'folder', 155 | 'food', 156 | 'footstool', 157 | 'frame', 158 | 'fruit', 159 | 'garbage', 160 | 'glass', 161 | 'globe', 162 | 'glove', 163 | 'guitar', 164 | 'hair dryer', 165 | 'hamper', 166 | 'hanger', 167 | 'hat', 168 | 'headphones', 169 | 'heater', 170 | 'helmets', 171 | 'holder', 172 | 'hook', 173 | 'humidifier', 174 | 'ironware', 175 | 'jacket', 176 | 'jar', 177 | 'kettle', 178 | 'keyboard', 179 | 'kitchenware', 180 | 'knife', 181 | 'label', 182 | 'ladder', 183 | 'lamp', 184 | 'laptop', 185 | 'letter', 186 | 'light', 187 | 'luggage', 188 | 'magazine', 189 | 'map', 190 | 'mask', 191 | 'mat', 192 | 'menu', 193 | 'microwave', 194 | 'mirror', 195 | 'monitor', 196 | 'mop', 197 | 'mouse', 198 | 'napkins', 199 | 'notebook', 200 | 'object', 201 | 'ottoman', 202 | 'pack', 203 | 'package', 204 | 'pad', 205 | 'pan', 206 | 'paper', 207 | 'paper cutter', 208 | 'pen', 209 | 'picture', 210 | 'pillow', 211 | 'pipe', 212 | 'pitcher', 213 | 'plant', 214 | 'plate', 215 | 'player', 216 | 'plug', 217 | 'plunger', 218 | 'poster', 219 | 'pot', 220 | 'price tag', 221 | 'printer', 222 | 'projector', 223 | 'purse', 224 | 'rack', 225 | 'radio', 226 | 'remote control', 227 | 'rod', 228 | 'roll', 229 | 'rope', 230 | 'sack', 231 | 'salt', 232 | 'scale', 233 | 'scissors', 234 | 'screen', 235 | 'seasoning', 236 | 'shampoo', 237 | 'sheet', 238 | 'shirt', 239 | 'shoe', 240 | 'shovel', 241 | 'sign', 242 | 'soap', 243 | 'soap dish', 244 | 'soap dispenser', 245 | 'speaker', 246 | 'sponge', 247 | 'spoon', 248 | 'stand', 249 | 'stapler', 250 | 'statue', # if small enough 251 | 'stick', 252 | 'stool', 253 | 'sunglasses', 254 | 'table', # if small enough 255 | 'tablet', 256 | 'teapot', 257 | 'telephone', 258 | 'tissue', 259 | 'tissue box', 260 | 'toaster', 261 | 'toilet paper', 262 | 'toiletry', 263 | 'tool', 264 | 'toothbrush', 265 | 'toothpaste', 266 | 'towel', 267 | 'toy', 268 | 'tray', 269 | 'trophy', 270 | 'tube', 271 | 'tv', # most modern sets can be moved by one person 272 | 'umbrella', 273 | 'urn', 274 | 'utensil', 275 | 'vacuum cleaner', 276 | 'vase', 277 | 'wine', 278 | 'wire', 279 | 'wood', 280 | 'wrap' 281 | ] 282 | 283 | 284 | 285 | embodiedscan_objs = [ 286 | 'adhesive tape', 'air conditioner', 'alarm', 'album', 'arch', 'backpack', 287 | 'bag', 'balcony', 'ball', 'banister', 'bar', 
'barricade', 'baseboard', 288 | 'basin', 'basket', 'bathtub', 'beam', 'beanbag', 'bed', 'bench', 289 | 'bicycle', 'bidet', 'bin', 'blackboard', 'blanket', 'blinds', 'board', 290 | 'body loofah', 'book', 'boots', 'bottle', 'bowl', 'box', 'bread', 291 | 'broom', 'brush', 'bucket', 'cabinet', 'calendar', 'camera', 'can', 292 | 'candle', 'candlestick', 'cap', 'car', 'carpet', 'cart', 'case', 293 | 'ceiling', 'chair', 'chandelier', 'cleanser', 'clock', 'clothes', 294 | 'clothes dryer', 'coat hanger', 'coffee maker', 'coil', 'column', 295 | 'commode', 'computer', 'conducting wire', 'container', 'control', 296 | 'copier', 'cosmetics', 'couch', 'counter', 'countertop', 'crate', 297 | 'crib', 'cube', 'cup', 'curtain', 'cushion', 'decoration', 'desk', 298 | 'detergent', 'device', 'dish rack', 'dishwasher', 'dispenser', 'divider', 299 | 'door', 'door knob', 'doorframe', 'doorway', 'drawer', 'dress', 300 | 'dresser', 'drum', 'duct', 'dumbbell', 'dustpan', 'dvd', 'eraser', 301 | 'exercise equipment', 'fan', 'faucet', 'fence', 'file', 'fire extinguisher', 302 | 'fireplace', 'floor', 'flowerpot', 'flush', 'folder', 'food', 'footstool', 303 | 'frame', 'fruit', 'furniture', 'garage door', 'garbage', 'glass', 'globe', 304 | 'glove', 'grab bar', 'grass', 'guitar', 'hair dryer', 'hamper', 'handle', 305 | 'hanger', 'hat', 'headboard', 'headphones', 'heater', 'helmets', 'holder', 306 | 'hook', 'humidifier', 'ironware', 'jacket', 'jalousie', 'jar', 'kettle', 307 | 'keyboard', 'kitchen island', 'kitchenware', 'knife', 'label', 'ladder', 308 | 'lamp', 'laptop', 'ledge', 'letter', 'light', 'luggage', 'machine', 309 | 'magazine', 'mailbox', 'map', 'mask', 'mat', 'mattress', 'menu', 310 | 'microwave', 'mirror', 'molding', 'monitor', 'mop', 'mouse', 'napkins', 311 | 'notebook', 'object', 'ottoman', 'oven', 'pack', 'package', 'pad', 312 | 'pan', 'panel', 'paper', 'paper cutter', 'partition', 'pedestal', 313 | 'pen', 'person', 'piano', 'picture', 'pillar', 'pillow', 'pipe', 314 | 'pitcher', 'plant', 'plate', 'player', 'plug', 'plunger', 'pool', 315 | 'pool table', 'poster', 'pot', 'price tag', 'printer', 'projector', 316 | 'purse', 'rack', 'radiator', 'radio', 'rail', 'range hood', 317 | 'refrigerator', 'remote control', 'ridge', 'rod', 'roll', 'roof', 318 | 'rope', 'sack', 'salt', 'scale', 'scissors', 'screen', 'seasoning', 319 | 'shampoo', 'sheet', 'shelf', 'shirt', 'shoe', 'shovel', 'shower', 320 | 'sign', 'sink', 'soap', 'soap dish', 'soap dispenser', 'socket', 321 | 'speaker', 'sponge', 'spoon', 'stairs', 'stall', 'stand', 'stapler', 322 | 'statue', 'steps', 'stick', 'stool', 'stopcock', 'stove', 'structure', 323 | 'sunglasses', 'support', 'switch', 'table', 'tablet', 'teapot', 324 | 'telephone', 'thermostat', 'tissue', 'tissue box', 'toaster', 325 | 'toilet', 'toilet paper', 'toiletry', 'tool', 'toothbrush', 326 | 'toothpaste', 'towel', 'toy', 'tray', 'treadmill', 'trophy', 'tube', 327 | 'tv', 'umbrella', 'urn', 'utensil', 'vacuum cleaner', 'vanity', 328 | 'vase', 'vent', 'ventilation', 'wall', 'wardrobe', 'washbasin', 329 | 'washing machine', 'water cooler', 'water heater', 'window', 330 | 'window frame', 'windowsill', 'wine', 'wire', 'wood', 'wrap' 331 | ] 332 | 333 | -------------------------------------------------------------------------------- /robospatial/spatial_analysis/relationship_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | # Utils for spatial relationships 11 | import numpy as np 12 | from matplotlib.path import Path 13 | import open3d as o3d 14 | from scipy.spatial import ConvexHull 15 | 16 | 17 | 18 | 19 | def get_min_max_visible_depth_for_box(box, extrinsic, intrinsic, image_size): 20 | """ 21 | Calculates the minimum and maximum depth of the visible parts of a box. 22 | 23 | Args: 24 | box (o3d.geometry.OrientedBoundingBox): The bounding box. 25 | extrinsic (np.ndarray): 4x4 extrinsic matrix (camera to world). 26 | intrinsic (np.ndarray): 4x4 camera intrinsic matrix. 27 | image_size (tuple): (width, height) of the image. 28 | 29 | Returns: 30 | tuple: (min_visible_depth, max_visible_depth) or (None, None) if the box 31 | is not visible or behind the camera. 32 | """ 33 | width, height = image_size 34 | extrinsic_w2c = np.linalg.inv(extrinsic) 35 | EPS = 1e-6 # Small epsilon for depth checks 36 | 37 | # 1. Get world corners 38 | corners_world = np.asarray(box.get_box_points()) 39 | corners_world_hom = np.hstack((corners_world, np.ones((8, 1)))) # Homogeneous coordinates 40 | 41 | # 2. Transform to camera coordinates 42 | corners_cam_hom = corners_world_hom @ extrinsic_w2c.T 43 | 44 | # Validate transformation results 45 | if not np.all(np.isfinite(corners_cam_hom)): 46 | # print("Warning: Non-finite values encountered during world to camera transformation.") 47 | return None, None 48 | 49 | # corners_cam = corners_cam_hom[:, :3] / corners_cam_hom[:, 3][:, np.newaxis] # Normalize if W is not 1 50 | corners_cam = corners_cam_hom[:, :3] # Assume w=1 from standard transformation 51 | depths = corners_cam[:, 2] # Z-coordinate is depth 52 | 53 | # 3. Filter points behind the camera 54 | valid_depth_mask = depths > EPS 55 | if not np.any(valid_depth_mask): 56 | return None, None # Box entirely behind camera 57 | 58 | valid_corners_cam = corners_cam[valid_depth_mask] 59 | valid_depths = depths[valid_depth_mask] 60 | 61 | # If no valid points in front of the camera, return None 62 | if valid_corners_cam.shape[0] == 0: 63 | return None, None 64 | 65 | # 4. 
Project *all* valid camera points (not just visible ones) to pixel coordinates to check overlap 66 | # This helps catch cases where vertices are off-screen but faces/edges are visible 67 | valid_corners_cam_hom_for_proj = np.hstack((valid_corners_cam, np.ones((valid_corners_cam.shape[0], 1)))) 68 | corners_proj = valid_corners_cam_hom_for_proj @ intrinsic.T 69 | 70 | # Validate projection results 71 | if not np.all(np.isfinite(corners_proj)): 72 | # print("Warning: Non-finite values encountered during projection.") 73 | return None, None 74 | 75 | proj_depths = corners_proj[:, 2] 76 | # Filter points where projection depth is too small (avoids division by zero) 77 | valid_proj_mask = np.abs(proj_depths) >= EPS 78 | if not np.any(valid_proj_mask): 79 | return None, None # All points projected onto image plane or behind 80 | 81 | corners_proj = corners_proj[valid_proj_mask] 82 | proj_depths = proj_depths[valid_proj_mask] 83 | corners_pixel = corners_proj[:, :2] / proj_depths[:, np.newaxis] 84 | # We also need to filter the original depths to match the filtered projected points 85 | valid_depths = valid_depths[valid_proj_mask] 86 | 87 | corners_pixel_rounded = np.round(corners_pixel).astype(int) 88 | 89 | # 5. Check visibility: At least one vertex inside image bounds? 90 | in_image_mask = (corners_pixel_rounded[:, 0] >= 0) & (corners_pixel_rounded[:, 0] < width) & \ 91 | (corners_pixel_rounded[:, 1] >= 0) & (corners_pixel_rounded[:, 1] < height) 92 | any_vertex_visible = np.any(in_image_mask) 93 | 94 | # 6. Check visibility: Projected bounding box overlaps image? 95 | min_px, min_py = np.min(corners_pixel_rounded, axis=0) 96 | max_px, max_py = np.max(corners_pixel_rounded, axis=0) 97 | bbox_overlaps_image = not (max_px < 0 or min_px >= width or max_py < 0 or min_py >= height) 98 | 99 | # 7. Determine if any part is visible 100 | is_visible = any_vertex_visible or bbox_overlaps_image 101 | 102 | # 8. Return min/max depth if visible 103 | if is_visible and valid_depths.size > 0: # Ensure there are depths to calculate min/max from 104 | min_visible_depth = np.min(valid_depths) 105 | max_visible_depth = np.max(valid_depths) 106 | return min_visible_depth, max_visible_depth 107 | else: 108 | return None, None 109 | 110 | 111 | 112 | def calculate_occupied_pixels(objects, extrinsic, intrinsic, img_shape): 113 | """Compute occupancy map for the given objects (Optimized). 114 | 115 | Also returns individual occupancy maps for each object. 116 | 117 | Args: 118 | objects (list of dict): List of object dictionaries, each must contain 119 | 'obb' (o3d.geometry.OrientedBoundingBox) and 120 | 'name' (str). 121 | extrinsic (np.ndarray): 4x4 extrinsic matrix (camera to world transformation). 122 | intrinsic (np.ndarray): 4x4 camera intrinsic matrix. 123 | img_shape (tuple): Shape of the image (width, height). 124 | 125 | Returns: 126 | tuple: (combined_occupancy_map, individual_occupancy_maps) 127 | - combined_occupancy_map (np.ndarray): 2D boolean array for all objects. 128 | - individual_occupancy_maps (dict): Dictionary where keys are object 129 | names and values are individual 130 | 2D boolean occupancy maps. 
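    A minimal usage sketch (the OBBs, camera matrices, and image size below are
    placeholders supplied by the caller; only the argument structure is taken
    from this function):

        objects = [
            {"name": "chair", "obb": chair_obb},   # o3d.geometry.OrientedBoundingBox
            {"name": "table", "obb": table_obb},
        ]
        combined_map, per_object_maps = calculate_occupied_pixels(
            objects, extrinsic, intrinsic, (width, height))
        chair_pixels = per_object_maps["chair"].sum()   # pixels covered by the projected 'chair' box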
131 | """ 132 | EPS = 1e-6 133 | extrinsic_w2c = np.linalg.inv(extrinsic) 134 | w, h = img_shape # Correctly unpack width and height 135 | combined_occupancy_map = np.zeros((h, w), dtype=bool) 136 | individual_occupancy_maps = {} # Store individual maps here 137 | 138 | faces = [ 139 | [0, 1, 2, 3], [4, 5, 6, 7], 140 | [0, 1, 5, 4], [3, 2, 6, 7], 141 | [0, 3, 7, 4], [1, 2, 6, 5] 142 | ] 143 | 144 | for obj_info in objects: 145 | box = obj_info.get('obb') 146 | obj_name = obj_info.get('name') 147 | 148 | if box is None or obj_name is None: 149 | print(f"Warning: Skipping object due to missing 'obb' or 'name'. Info: {obj_info}") 150 | continue 151 | 152 | # Initialize individual map for this object 153 | current_obj_map = np.zeros((h, w), dtype=bool) 154 | 155 | # Get the corners of the box 156 | corners = np.asarray(box.get_box_points()) 157 | # Reorder corners to match the original code 158 | corners = corners[[0, 1, 7, 2, 3, 6, 4, 5]] 159 | # Add homogeneous coordinate 160 | corners_hom = np.concatenate([corners, np.ones((corners.shape[0], 1))], axis=1) 161 | # Project corners to image plane 162 | corners_img = (intrinsic @ extrinsic_w2c @ corners_hom.T).T 163 | 164 | # Check for invalid depths early 165 | if np.any(np.abs(corners_img[:, 2]) < EPS): 166 | pass # Face-by-face check will handle points behind camera 167 | 168 | # Normalize projected points using actual z-coordinate 169 | corners_pixel = np.zeros((corners_img.shape[0], 2)) 170 | valid_proj_mask = np.abs(corners_img[:, 2]) >= EPS 171 | if np.any(valid_proj_mask): 172 | corners_pixel[valid_proj_mask] = corners_img[valid_proj_mask, :2] / corners_img[valid_proj_mask, 2][:, np.newaxis] 173 | 174 | for face in faces: 175 | # Check if all vertices of the face are behind the camera 176 | if np.any(corners_img[face, 2] < EPS): 177 | continue # Skip faces that are entirely or partially behind the camera plane 178 | 179 | pts = corners_pixel[face] 180 | 181 | # Calculate the bounding box of the projected face 182 | min_coords = np.min(pts, axis=0) 183 | max_coords = np.max(pts, axis=0) 184 | 185 | # Determine the subgrid boundaries, clamping to image dimensions 186 | min_x = max(0, int(np.floor(min_coords[0]))) 187 | min_y = max(0, int(np.floor(min_coords[1]))) 188 | max_x = min(w - 1, int(np.ceil(max_coords[0]))) 189 | max_y = min(h - 1, int(np.ceil(max_coords[1]))) 190 | 191 | # If the bounding box is outside the image or has no area, skip 192 | if max_x < min_x or max_y < min_y: 193 | continue 194 | 195 | # Create coordinate grid only for the bounding box region 196 | sub_x, sub_y = np.meshgrid(np.arange(min_x, max_x + 1), np.arange(min_y, max_y + 1)) 197 | pixel_points_sub = np.vstack((sub_x.flatten(), sub_y.flatten())).T 198 | 199 | if pixel_points_sub.size == 0: 200 | continue # Skip if subgrid is empty 201 | 202 | # Check containment using Path for the subgrid 203 | p = Path(pts) 204 | mask_sub = p.contains_points(pixel_points_sub).reshape((max_y - min_y + 1, max_x - min_x + 1)) 205 | 206 | # Update the *individual* occupancy map 207 | current_obj_map[min_y:max_y+1, min_x:max_x+1] |= mask_sub 208 | 209 | # Store the individual map 210 | individual_occupancy_maps[obj_name] = current_obj_map 211 | # Combine into the main map 212 | combined_occupancy_map |= current_obj_map 213 | 214 | return combined_occupancy_map, individual_occupancy_maps 215 | 216 | 217 | 218 | # Function to project the bounding box to the floor (2D) 219 | def project_to_floor(box): 220 | corners = np.asarray(box.get_box_points()) 221 | # corners[:, 2] = 0 # Set 
the z-coordinate to 0 to project onto the floor 222 | return corners[:, :2] # Return only the x,y coordinates 223 | 224 | 225 | 226 | # Adapted from: https://github.com/OpenRobotLab/EmbodiedScan/blob/main/embodiedscan/visualization/utils.py 227 | # License: Apache 2.0 228 | def _9dof_to_box(box, label=None, color_selector=None, color=None): 229 | """Convert 9-DoF box from array/tensor to open3d.OrientedBoundingBox. 230 | 231 | Args: 232 | box (numpy.ndarray|torch.Tensor|List[float]): 233 | 9-DoF box with shape (9,). 234 | label (int, optional): Label of the box. Defaults to None. 235 | color_selector (:obj:`ColorSelector`, optional): 236 | Color selector for boxes. Defaults to None. 237 | color (tuple[int], optional): Color of the box. 238 | You can directly specify the color. 239 | If you do, the color_selector and label will be ignored. 240 | Defaults to None. 241 | """ 242 | if isinstance(box, list): 243 | box = np.array(box) 244 | else: 245 | print("box is not a list!") 246 | print(type(box)) 247 | # if isinstance(box, Tensor): #NOTE omitted to not load in torch for just this! 248 | # box = box.cpu().numpy() 249 | center = box[:3].reshape(3, 1) 250 | scale = box[3:6].reshape(3, 1) 251 | rot = box[6:].reshape(3, 1) 252 | rot_mat = o3d.geometry.OrientedBoundingBox.get_rotation_matrix_from_zxy(rot) 253 | geo = o3d.geometry.OrientedBoundingBox(center, rot_mat, scale) 254 | 255 | if color is not None: 256 | geo.color = [x / 255.0 for x in color] 257 | return geo 258 | 259 | if label is not None and color_selector is not None: 260 | color = color_selector.get_color(label) 261 | color = [x / 255.0 for x in color] 262 | geo.color = color 263 | return geo -------------------------------------------------------------------------------- /robospatial/spatial_analysis/relationships.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | """High-level interface for spatial relationship analysis between 3D objects. 11 | 12 | This module provides wrapper functions that simplify the calculation of various 13 | spatial relationships by utilizing lower-level functions from the `context`, 14 | `configuration`, and `compatibility` submodules. 15 | 16 | Key functionalities include: 17 | - `get_spatial_context`: Determines points in space relative to a reference object 18 | (e.g., 'in front of', 'behind'). 19 | - `get_spatial_configuration`: Calculates 3D directional relationships between two 20 | objects (e.g., 'left of', 'above'). 21 | - `get_spatial_compatibility`: Assesses whether one object can physically fit 22 | relative to another (e.g., 'on top of', 'next to'). 23 | 24 | These functions are typically used by higher-level annotation generation scripts. 
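A rough usage sketch (the object dicts, camera matrices, image size, and
occupancy maps are placeholders; in practice they are prepared by the
higher-level annotation generation scripts mentioned above):

    obj1 = {"name": "cup", "obb": cup_obb}       # OrientedBoundingBox placeholders
    obj2 = {"name": "table", "obb": table_obb}
    rels = get_spatial_configuration(obj1, obj2, extrinsic, intrinsic,
                                     image_size, individual_occupancy_maps,
                                     strictness="lenient")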
25 | """ 26 | 27 | from spatial_analysis.obj_properties import items_with_face, movable_and_placeable_items, flat_surface_items 28 | from spatial_analysis.context.context import get_point_in_space_relative_to_object 29 | from spatial_analysis.configuration.configuration import check_spatial_configuration_relationships 30 | from spatial_analysis.compatibility.compatibility import can_fit_object_a_in_relation_to_b, can_fit_on_top 31 | 32 | 33 | def get_spatial_context(obj, extrinsic, intrinsic, floor_bound, obbs, image_size, image_path, 34 | individual_occupancy_maps, env_occupancy_map, 35 | threshold, grid_resolution, num_samples): 36 | """Generates points relative to an object (e.g., in front, behind) for contextual understanding.""" 37 | 38 | have_face = obj["name"] in items_with_face 39 | 40 | # Generate potential points relative to the object's spatial context. 41 | sampled_points, sampled_3d_points, visible_points_3d_all, generated_something = get_point_in_space_relative_to_object( 42 | floor_bound, obbs, 43 | ref_obj=obj, 44 | extrinsic=extrinsic, intrinsic=intrinsic, image_size=image_size, have_face=have_face, 45 | num_samples=num_samples, threshold=threshold, grid_resolution=grid_resolution, 46 | individual_occupancy_maps=individual_occupancy_maps, 47 | env_occupancy_map=env_occupancy_map, 48 | image_path=image_path, 49 | ) 50 | 51 | if generated_something: 52 | return sampled_points, sampled_3d_points, True 53 | return None, None, False 54 | 55 | 56 | def get_spatial_configuration(obj1, obj2, extrinsic, intrinsic, image_size, individual_occupancy_maps, strictness='lenient'): 57 | """Calculates spatial configuration relationships (left/right, above/below, etc.) between two objects.""" 58 | 59 | obj_configuration_relationships = check_spatial_configuration_relationships( 60 | obj1, obj2, extrinsic, intrinsic, image_size, individual_occupancy_maps, strictness 61 | ) 62 | 63 | return obj_configuration_relationships 64 | 65 | 66 | def get_spatial_compatibility(obj1, obj2, extrinsic, intrinsic, floor_bound, obbs, image_size, image_path, 67 | individual_occupancy_maps, env_occupancy_map, 68 | grid_resolution, num_samples, min_distance, buffer_ratio): 69 | """Checks if obj1 can fit in relation to obj2 (e.g., on top, next to).""" 70 | 71 | # Check if the anchor object (obj2) has a face, as this influences spatial context calculations. 72 | have_face = obj2["name"] in items_with_face 73 | 74 | # Check fitting in various spatial relations using sampling-based methods. 75 | results = can_fit_object_a_in_relation_to_b( 76 | floor_bound, obbs, 77 | obj_a=obj1, 78 | obj_b=obj2, 79 | have_face=have_face, 80 | extrinsic=extrinsic, intrinsic=intrinsic, image_size=image_size, image_path=image_path, 81 | grid_resolution=grid_resolution, 82 | min_distance=min_distance, 83 | num_samples=num_samples, 84 | individual_occupancy_maps=individual_occupancy_maps, 85 | env_occupancy_map=env_occupancy_map, 86 | buffer_ratio=buffer_ratio 87 | ) 88 | 89 | # Specifically check 'on_top' relationship using direct OBB comparison 90 | # for movable items on flat surfaces, as this is a common and simpler case. 
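    # Note: movable_and_placeable_items and flat_surface_items are the curated
    # category lists imported from spatial_analysis.obj_properties; the direct
    # OBB result computed below is merged into the sampling-based `results`
    # under the "worldcentric" key.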
91 | fits_on_top = False 92 | if obj1["name"] in movable_and_placeable_items and obj2["name"] in flat_surface_items: 93 | fits_on_top = can_fit_on_top(obj1["obb"], obj2["obb"]) 94 | 95 | if "worldcentric" not in results: 96 | results["worldcentric"] = {} 97 | results["worldcentric"]["on_top"] = fits_on_top 98 | 99 | return results -------------------------------------------------------------------------------- /robospatial/spatial_analysis/topdown_map.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from scipy.spatial import ConvexHull 14 | import matplotlib.path as mpath 15 | 16 | from spatial_analysis.relationship_utils import project_to_floor 17 | 18 | DEBUG_EMPTY_SPACE=False 19 | 20 | # Function to create the grid representing the floor 21 | def create_floor_grid(floor_bound, grid_resolution=0.1): 22 | 23 | min_bound = floor_bound[0] 24 | max_bound = floor_bound[1] 25 | 26 | x_range = np.arange(min_bound[0], max_bound[0], grid_resolution) 27 | y_range = np.arange(min_bound[1], max_bound[1], grid_resolution) 28 | 29 | return np.meshgrid(x_range, y_range) 30 | 31 | # Function to mark occupied areas on the grid 32 | def mark_occupied_areas(grid, boxes, occupied, floor=False): 33 | x_flat = grid[0].ravel() 34 | y_flat = grid[1].ravel() 35 | points_array = np.column_stack((x_flat, y_flat)) 36 | 37 | for box in boxes: 38 | projected_points = project_to_floor(box) 39 | hull = ConvexHull(projected_points) 40 | hull_vertices = projected_points[hull.vertices] 41 | path = mpath.Path(hull_vertices) 42 | 43 | # Vectorized point-in-polygon test 44 | if floor: 45 | inside = ~path.contains_points(points_array) 46 | else: 47 | inside = path.contains_points(points_array) 48 | 49 | # Update the occupied grid 50 | occupied.ravel()[inside] = True 51 | 52 | return occupied 53 | 54 | # Function to find empty areas on the grid 55 | def find_empty_areas(occupied): 56 | empty_areas = np.logical_not(occupied) 57 | return empty_areas 58 | 59 | def get_empty_space(floor_bound, boxes, grid_resolution=0.01): 60 | grid = create_floor_grid(floor_bound, grid_resolution) 61 | empty_occupied = np.zeros(grid[0].shape, dtype=bool) 62 | occupied = mark_occupied_areas(grid, boxes, empty_occupied) 63 | empty_areas = find_empty_areas(occupied) 64 | 65 | if DEBUG_EMPTY_SPACE: 66 | plt.figure(figsize=(10, 10)) 67 | 68 | # Create color-coded grid 69 | color_grid = np.zeros((*empty_areas.shape, 3), dtype=np.uint8) 70 | color_grid[empty_areas] = [0, 255, 0] # Green for empty 71 | color_grid[~empty_areas] = [255, 0, 0] # Red for occupied 72 | 73 | # Plot the grid 74 | plt.imshow(color_grid, 75 | extent=[grid[0].min(), grid[0].max(), grid[1].min(), grid[1].max()], 76 | origin='lower') 77 | 78 | # Plot boxes as outlines 79 | for box in boxes: 80 | corners = project_to_floor(box) 81 | plt.plot(corners[:, 0], corners[:, 1], 'blue', linewidth=2) 82 | plt.fill(corners[:, 0], corners[:, 1], 'blue', alpha=0.3) 83 | 84 | # If floor_box is not a list, plot it as 
an outline 85 | min_bound = floor_bound[0] 86 | max_bound = floor_bound[1] 87 | plt.plot(min_bound[0], min_bound[1], 'black', linewidth=2) 88 | plt.fill(min_bound[0], min_bound[1], 'black', alpha=0.1) 89 | plt.plot(max_bound[0], max_bound[1], 'black', linewidth=2) 90 | plt.fill(max_bound[0], max_bound[1], 'black', alpha=0.1) 91 | plt.plot(min_bound[0], max_bound[1], 'black', linewidth=2) 92 | plt.fill(min_bound[0], max_bound[1], 'black', alpha=0.1) 93 | plt.plot(max_bound[0], min_bound[1], 'black', linewidth=2) 94 | plt.fill(max_bound[0], min_bound[1], 'black', alpha=0.1) 95 | 96 | plt.title("Empty Space Grid") 97 | plt.xlabel('X') 98 | plt.ylabel('Y') 99 | plt.grid(True) 100 | # Create a legend instead of a colorbar 101 | from matplotlib.patches import Patch 102 | legend_elements = [ 103 | Patch(facecolor='red', edgecolor='black', label='Occupied'), 104 | Patch(facecolor='green', edgecolor='black', label='Empty') 105 | ] 106 | plt.legend(handles=legend_elements, loc='lower right') 107 | plt.show() 108 | 109 | return empty_areas, grid, occupied 110 | 111 | -------------------------------------------------------------------------------- /scripts/visualize_input.py: -------------------------------------------------------------------------------- 1 | # Create a new file named visualize_simple.py 2 | import os 3 | import json 4 | import cv2 5 | import argparse 6 | import numpy as np 7 | import open3d as o3d 8 | import matplotlib.path as mpath # Needed for face filling in draw_box3d 9 | 10 | # --- Utility Functions (Copied and potentially simplified) --- 11 | 12 | def _9dof_to_box(box_params, color=None): 13 | """Convert 9-DoF box from array/tensor to open3d.OrientedBoundingBox. 14 | 15 | Args: 16 | box_params (numpy.ndarray|list): 9-DoF box [cx, cy, cz, sx, sy, sz, rx, ry, rz]. 17 | color (tuple[int], optional): RGB Color of the box (0-255). Defaults to None. 18 | 19 | Returns: 20 | open3d.geometry.OrientedBoundingBox: The converted Open3D box. 21 | """ 22 | if isinstance(box_params, list): 23 | box_params = np.array(box_params) 24 | 25 | center = box_params[:3].reshape(3, 1) 26 | scale = box_params[3:6].reshape(3, 1) 27 | rot = box_params[6:].reshape(3, 1) 28 | rot_mat = o3d.geometry.OrientedBoundingBox.get_rotation_matrix_from_zxy(rot) 29 | geo = o3d.geometry.OrientedBoundingBox(center, rot_mat, scale) 30 | 31 | if color is not None: 32 | geo.color = [x / 255.0 for x in color] # O3D uses 0-1 range 33 | 34 | return geo 35 | 36 | def _inside_box(box, point): 37 | """Check if any points are inside the box. 38 | 39 | Args: 40 | box (open3d.geometry.OrientedBoundingBox): Oriented Box. 41 | point (np.ndarray): N points represented by nx3 array (x, y, z). 42 | 43 | Returns: 44 | bool: True if any point is inside, False otherwise. 45 | """ 46 | # Reference logic uses nx4, check if conversion needed 47 | if point.shape[1] == 4: 48 | point = point[:, :3] 49 | point_vec = o3d.utility.Vector3dVector(point) 50 | inside_idx = box.get_point_indices_within_bounding_box(point_vec) 51 | return len(inside_idx) > 0 52 | 53 | # Replaced with logic from visualization/img_drawer.py:draw_box3d 54 | def draw_box3d_on_image(image, box, color, label, extrinsic, intrinsic): 55 | """Draw 3D boxes on the image, exactly matching img_drawer.py logic. 56 | 57 | Args: 58 | image (np.ndarray): The image to draw on. 59 | box (open3d.geometry.OrientedBoundingBox): Box to be drawn. 60 | color (tuple): Box color. 61 | label (str): Box category label. 62 | extrinsic (np.ndarray): 4x4 extrinsic matrix (axis_align @ cam2global). 
63 | intrinsic (np.ndarray): 4x4 camera intrinsic matrix. 64 | 65 | Returns: 66 | np.ndarray: Image with the box drawn. 67 | """ 68 | EPS = 1e-4 # Epsilon from img_drawer 69 | ALPHA = 0.75 # Alpha from img_drawer (was 0.6) 70 | 71 | extrinsic_w2c = np.linalg.inv(extrinsic) 72 | h, w = image.shape[:2] 73 | x, y = np.meshgrid(np.arange(w), np.arange(h)) 74 | x, y = x.flatten(), y.flatten() 75 | pixel_points = np.vstack((x, y)).T 76 | 77 | # Fix 1: Use transpose() as in original code 78 | camera_pos_in_world = (extrinsic @ np.array([0, 0, 0, 1]).reshape(4, 1)).transpose() 79 | if _inside_box(box, camera_pos_in_world): 80 | return image 81 | 82 | corners = np.asarray(box.get_box_points()) 83 | corners = corners[[0, 1, 7, 2, 3, 6, 4, 5]] # Specific corner order from img_drawer 84 | corners = np.concatenate([corners, np.ones((corners.shape[0], 1))], axis=1) 85 | 86 | # Same projection as img_drawer: intrinsic @ extrinsic_w2c @ corners.transpose() 87 | corners_img = intrinsic @ extrinsic_w2c @ corners.transpose() 88 | corners_img = corners_img.transpose() 89 | 90 | corners_pixel = np.zeros((corners_img.shape[0], 2)) 91 | 92 | # Fix 2: Use np.abs() in division exactly as in img_drawer 93 | for i in range(corners_img.shape[0]): 94 | corners_pixel[i] = corners_img[i][:2] / np.abs(corners_img[i][2]) 95 | 96 | lines = [[0, 1], [1, 2], [2, 3], [3, 0], [4, 5], [5, 6], [6, 7], 97 | [7, 4], [0, 4], [1, 5], [2, 6], [3, 7]] 98 | faces = [[0, 1, 2, 3], [4, 5, 6, 7], [0, 1, 5, 4], [3, 2, 6, 7], 99 | [0, 3, 7, 4], [1, 2, 6, 5]] 100 | 101 | image_with_box = image.copy() 102 | 103 | # Fix 3: Use exact depth check from img_drawer for lines 104 | for line in lines: 105 | # This is the exact check from img_drawer 106 | if (corners_img[line][:, 2] < EPS).any(): 107 | continue 108 | px = corners_pixel[line[0]].astype(np.int32) 109 | py = corners_pixel[line[1]].astype(np.int32) 110 | cv2.line(image_with_box, (px[0], px[1]), (py[0], py[1]), color, 2) 111 | 112 | # Fix 4: Use exact mask/face handling from img_drawer 113 | all_mask = np.zeros((h, w), dtype=bool) 114 | for face in faces: 115 | # This is the exact check from img_drawer 116 | if (corners_img[face][:, 2] < EPS).any(): 117 | continue 118 | pts = corners_pixel[face] 119 | p = mpath.Path(pts[:, :2]) 120 | mask = p.contains_points(pixel_points).reshape((h, w)) 121 | all_mask = np.logical_or(all_mask, mask) 122 | 123 | # Apply color blend - exact formula from img_drawer 124 | image_with_box[all_mask] = image_with_box[all_mask] * ALPHA + (1 - ALPHA) * np.array(color) 125 | 126 | # Draw text label if any faces were visible 127 | if all_mask.any(): 128 | textpos = np.min(corners_pixel, axis=0).astype(np.int32) 129 | textpos[0] = np.clip(textpos[0], a_min=0, a_max=w) 130 | textpos[1] = np.clip(textpos[1], a_min=0, a_max=h) 131 | 132 | # Simple text drawing to mimic self.draw_text from img_drawer 133 | font = cv2.FONT_HERSHEY_SIMPLEX 134 | font_scale = 0.6 135 | thickness = 1 136 | text_size, _ = cv2.getTextSize(label, font, font_scale, thickness) 137 | text_w, text_h = text_size 138 | 139 | # Draw background box and text 140 | cv2.rectangle(image_with_box, 141 | (textpos[0], textpos[1]), 142 | (textpos[0] + text_w, textpos[1] + text_h), 143 | color, -1) 144 | cv2.putText(image_with_box, label, 145 | (textpos[0], textpos[1] + text_h), 146 | font, font_scale, (255, 255, 255), thickness) 147 | 148 | return image_with_box 149 | 150 | # --- Main Visualization Logic --- 151 | 152 | def visualize_single_image(image_path, annotation_data): 153 | """Loads image and draws 3D 
boxes based on annotation data.""" 154 | image = cv2.imread(image_path) 155 | if image is None: 156 | print(f"Error: Could not load image {image_path}") 157 | return 158 | 159 | # Extract camera info 160 | cam_ann = annotation_data.get("camera_annotations") 161 | if not cam_ann: 162 | print("Error: 'camera_annotations' not found in JSON.") 163 | return 164 | try: 165 | # extrinsic is Camera -> World 166 | extrinsic = np.array(cam_ann['extrinsic']) 167 | # Intrinsic matrix 168 | intrinsic = np.array(cam_ann['intrinsic']) 169 | 170 | except KeyError as e: 171 | print(f"Error: Missing camera parameter key: {e}") 172 | return 173 | # Removed LinAlgError check here as inversion happens in drawing function now 174 | except Exception as e: 175 | print(f"Error processing camera parameters: {e}") 176 | return 177 | 178 | 179 | # Extract object grounding info 180 | object_grounding = annotation_data.get("objects", []) 181 | if not object_grounding: 182 | print("Warning: 'objects' array is missing or empty.") 183 | # Display original image if no objects 184 | cv2.imshow("Image with 3D Boxes", image) 185 | cv2.waitKey(0) 186 | cv2.destroyAllWindows() 187 | return 188 | 189 | display_image = image.copy() 190 | # Ensure matplotlib is imported for colormap and path 191 | try: 192 | import matplotlib.pyplot as plt 193 | # Ensure mpath is imported here as it's needed by draw_box3d 194 | import matplotlib.path as mpath 195 | colors = plt.colormaps['tab10'] # Get distinct colors - Updated API 196 | except ImportError: 197 | print("Error: Matplotlib required for colormap and face drawing. Please install.") 198 | # Fallback to manual colors if matplotlib fails 199 | colors = lambda i: [(255,0,0), (0,255,0), (0,0,255)][i % 3] 200 | 201 | 202 | for i, obj_data in enumerate(object_grounding): 203 | obj_name = obj_data.get("Name", f"Object_{i+1}") 204 | bbox_3d_list = obj_data.get("bbox_3d") 205 | 206 | if bbox_3d_list: 207 | # Assuming the first bbox in the list is the one to draw 208 | bbox_9dof = bbox_3d_list[0] 209 | 210 | # Get color 211 | if callable(colors): # Check if it's a colormap function or fallback list 212 | color_float = colors(i)[:3] # Get RGB, discard alpha 213 | color_uint8 = tuple(int(c * 255) for c in color_float) 214 | else: # Fallback list 215 | color_uint8 = colors[i % len(colors)] # Use modulo for safety 216 | 217 | 218 | try: 219 | # Box center/extent/rotation are in Aligned World space from JSON 220 | o3d_box = _9dof_to_box(bbox_9dof, color=color_uint8) 221 | # Pass the combined extrinsic and intrinsic to the drawing function 222 | display_image = draw_box3d_on_image( 223 | display_image, 224 | o3d_box, 225 | color_uint8, 226 | obj_name, 227 | extrinsic, # Combined matrix (axis_align @ cam2global) 228 | intrinsic # Camera intrinsics (K) 229 | ) 230 | except Exception as e: 231 | print(f"Error processing/drawing box for '{obj_name}': {e}") 232 | import traceback 233 | traceback.print_exc() # More detailed error for debugging 234 | else: 235 | print(f"Warning: No 'bbox_3d' found for object '{obj_name}'.") 236 | 237 | # Display the result 238 | cv2.imshow("Image with 3D Boxes", display_image) 239 | print("Press any key to close the window.") 240 | cv2.waitKey(0) 241 | cv2.destroyAllWindows() 242 | 243 | # --- Entry Point --- 244 | 245 | if __name__ == "__main__": 246 | parser = argparse.ArgumentParser(description="Visualize 3D bounding boxes from an annotation file on an image.") 247 | parser.add_argument('--image_path', type=str, required=True, 248 | help='Direct path to the image file.') 
249 | parser.add_argument('--annotation_file', type=str, required=True, 250 | help='Path to the JSON annotation file.') 251 | args = parser.parse_args() 252 | 253 | # Load annotation data 254 | try: 255 | with open(args.annotation_file, 'r') as f: 256 | annotation_data = json.load(f) 257 | except FileNotFoundError: 258 | print(f"Error: Annotation file not found at {args.annotation_file}") 259 | exit(1) 260 | except json.JSONDecodeError: 261 | print(f"Error: Could not parse JSON from {args.annotation_file}") 262 | exit(1) 263 | except Exception as e: 264 | print(f"An unexpected error occurred loading annotations: {e}") 265 | exit(1) 266 | 267 | # Use the provided image path directly 268 | image_path = args.image_path 269 | 270 | # Check if the provided image file exists before proceeding 271 | if not os.path.isfile(image_path): 272 | print(f"Error: Image file not found at the provided path: {image_path}") 273 | exit(1) # Exit if the primary path doesn't exist 274 | 275 | # Import matplotlib here to avoid making it a hard dependency if not needed 276 | # although draw_box3d currently needs mpath 277 | try: 278 | import matplotlib.pyplot as plt 279 | # Make sure mpath is imported within the try block as well 280 | import matplotlib.path as mpath 281 | except ImportError: 282 | print("Error: Matplotlib is required by the drawing function. Please install it (`pip install matplotlib`).") 283 | exit(1) 284 | 285 | 286 | visualize_single_image(image_path, annotation_data) 287 | print("Visualization finished.") --------------------------------------------------------------------------------
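A minimal sketch of driving the visualizer programmatically rather than through
the --image_path / --annotation_file flags parsed above (the file paths and the
import path are placeholders, not part of the original script):

    import json
    from visualize_input import visualize_single_image  # assumes scripts/ is on sys.path

    with open("path/to/annotations.json", "r") as f:     # placeholder path
        annotation_data = json.load(f)
    visualize_single_image("path/to/image.jpg", annotation_data)   # placeholder path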