├── .gitignore ├── LICENSE ├── README.md ├── data └── README.md ├── example_data ├── annotations │ └── example_input.json └── images │ └── example_dataset │ └── example_scene │ └── example_image.jpg ├── requirements.txt ├── robospatial ├── README.md ├── __init__.py ├── annotation_generator.py ├── configs │ ├── embodiedscan.yaml │ ├── example_config.yaml │ └── example_dataset.yaml ├── data_loader │ ├── README.md │ ├── __init__.py │ ├── base_loader.py │ ├── embodiedscan_loader.py │ └── example_loader.py ├── run_generation.py └── spatial_analysis │ ├── __init__.py │ ├── compatibility │ ├── compatibility.py │ └── compatibility_utils.py │ ├── configuration │ ├── configuration.py │ └── configuration_utils.py │ ├── context │ ├── context.py │ └── context_utils.py │ ├── grounding.py │ ├── obj_properties.py │ ├── relationship_utils.py │ ├── relationships.py │ └── topdown_map.py └── scripts ├── visualize_input.py └── visualize_output.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | 173 | # outputs 174 | outputs/ 175 | checkpoints/ 176 | wandb/ 177 | 178 | example_data/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024-Present, NVIDIA Corporation & affiliates. All rights reserved. 2 | 3 | 4 | ======================================================================= 5 | 6 | 1. Definitions 7 | 8 | "Licensor" means any person or entity that distributes its Work. 9 | 10 | "Software" means the original work of authorship made available under 11 | this License. 12 | 13 | "Work" means the Software and any additions to or derivative works of 14 | the Software that are made available under this License. 15 | 16 | The terms "reproduce," "reproduction," "derivative works," and 17 | "distribution" have the meaning as provided under U.S. copyright law; 18 | provided, however, that for the purposes of this License, derivative 19 | works shall not include works that remain separable from, or merely 20 | link (or bind by name) to the interfaces of, the Work. 21 | 22 | Works, including the Software, are "made available" under this License 23 | by including in or with the Work either (a) a copyright notice 24 | referencing the applicability of this License to the Work, or (b) a 25 | copy of this License. 26 | 27 | 2. License Grants 28 | 29 | 2.1 Copyright Grant. Subject to the terms and conditions of this 30 | License, each Licensor grants to you a perpetual, worldwide, 31 | non-exclusive, royalty-free, copyright license to reproduce, 32 | prepare derivative works of, publicly display, publicly perform, 33 | sublicense and distribute its Work and any resulting derivative 34 | works in any form. 35 | 36 | 3. 
Limitations 37 | 38 | 3.1 Redistribution. You may reproduce or distribute the Work only 39 | if (a) you do so under this License, (b) you include a complete 40 | copy of this License with your distribution, and (c) you retain 41 | without modification any copyright, patent, trademark, or 42 | attribution notices that are present in the Work. 43 | 44 | 3.2 Derivative Works. You may specify that additional or different 45 | terms apply to the use, reproduction, and distribution of your 46 | derivative works of the Work ("Your Terms") only if (a) Your Terms 47 | provide that the use limitation in Section 3.3 applies to your 48 | derivative works, and (b) you identify the specific derivative 49 | works that are subject to Your Terms. Notwithstanding Your Terms, 50 | this License (including the redistribution requirements in Section 51 | 3.1) will continue to apply to the Work itself. 52 | 53 | 3.3 Use Limitation. The Work and any derivative works thereof only 54 | may be used or intended for use non-commercially. Notwithstanding 55 | the foregoing, NVIDIA and its affiliates may use the Work and any 56 | derivative works commercially. As used herein, "non-commercially" 57 | means for research or evaluation purposes only. 58 | 59 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim 60 | against any Licensor (including any claim, cross-claim or 61 | counterclaim in a lawsuit) to enforce any patents that you allege 62 | are infringed by any Work, then your rights under this License from 63 | such Licensor (including the grant in Section 2.1) will terminate 64 | immediately. 65 | 66 | 3.5 Trademarks. This License does not grant any rights to use any 67 | Licensor's or its affiliates' names, logos, or trademarks, except 68 | as necessary to reproduce the notices described in this License. 69 | 70 | 3.6 Termination. If you violate any term of this License, then your 71 | rights under this License (including the grant in Section 2.1) will 72 | terminate immediately. 73 | 74 | 4. Disclaimer of Warranty. 75 | 76 | THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY 77 | KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF 78 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR 79 | NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER 80 | THIS LICENSE. 81 | 82 | 5. Limitation of Liability. 83 | 84 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL 85 | THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE 86 | SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, 87 | INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF 88 | OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK 89 | (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, 90 | LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER 91 | COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF 92 | THE POSSIBILITY OF SUCH DAMAGES. 
93 | 94 | ======================================================================= -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RoboSpatial: Teaching Spatial Understanding to 2D and 3D Vision-Language Models for Robotics 2 | 3 | [**🌐 Homepage**](https://chanh.ee/RoboSpatial/) | [**📖 arXiv**](https://arxiv.org/abs/2411.16537) | [**📂 Benchmark**](https://huggingface.co/datasets/chanhee-luke/RoboSpatial-Home) | [**📊 Evaluation**](https://github.com/chanhee-luke/RoboSpatial-Eval) 4 | 5 | **✨ CVPR 2025 (Oral) ✨** 6 | 7 | Authors: [Chan Hee Song](https://chanh.ee)1, [Valts Blukis](https://research.nvidia.com/person/valts-blukis)2, [Jonathan Tremblay](https://research.nvidia.com/person/jonathan-tremblay)2, [Stephen Tyree](https://research.nvidia.com/person/stephen-tyree)2, [Yu Su](https://ysu1989.github.io/)1, [Stan Birchfield](https://sbirchfield.github.io/)2 8 | 9 | 1 The Ohio State University 2 NVIDIA 10 | 11 | --- 12 | 13 | ## 🔔News 14 | 15 | - **🔥[2025-04-24]: Released the RoboSpatial data generation pipeline, RoboSpatial-Home dataset, and evaluation script!** 16 | 17 | --- 18 | 19 | **Project Components:** 20 | 21 | This repository contains the code for **generating** the spatial annotations used in the RoboSpatial dataset. 22 | 23 | * **Benchmark Dataset:** [**📂 RoboSpatial-Home**](https://huggingface.co/datasets/chanhee-luke/RoboSpatial-Home) 24 | * **Evaluation Script:** [**📊 RoboSpatial-Eval**](https://github.com/chanhee-luke/RoboSpatial-Eval) 25 | 26 | **Coming up!** 27 | 28 | - [ ] Unified data loader supporting [BOP datasets](https://bop.felk.cvut.cz/datasets/) and [GraspNet dataset](https://graspnet.net/). (Turn object pose estimation datasets into spatial QA!) 29 | - [ ] Support for additional scan datasets like [SCRREAM](https://sites.google.com/view/scrream/about). 30 | 31 | --- 32 | 33 | # RoboSpatial Annotation Generation 34 | 35 | This codebase generates rich spatial annotations for 3D scan datasets. While initially built using the [EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan) conventions, it is designed to be extensible to other data formats through custom data loaders (see [Data Loader Documentation](#data-loader-documentation)). It extracts various spatial relationships from image data and associated 3D information, including: 36 | 37 | * **Object Grounding:** Locating objects mentioned in text within the image. 38 | * **Spatial Context:** Identifying points in empty space relative to objects (e.g., "in front of the chair"). 39 | * **Spatial Configuration:** Describing the relative arrangement of multiple objects (e.g., "the chair is next to the table"). 40 | * **Spatial Compatibility:** Determining if an object *could* fit in a specific location. 41 | 42 | The generated annotations are saved in JSON format, one file per image. 43 | 44 | ## Prerequisites 45 | 46 | 1. **Python Environment:** Ensure you have a Python (3.8+) environment set up (e.g., using `conda` or `venv`). Required packages can be installed via `pip install -r requirements.txt`. 47 | 2. **Datasets:** You need access to the 3D scan datasets you intend to process. 48 | * **Note:** For specific instructions on downloading and setting up the **EmbodiedScan** dataset, please refer to the guide in [**`data/README.md`**](data/README.md). 49 | 3. 
**Configuration:** The main configuration file (e.g., `robospatial/configs/embodiedscan.yaml`) needs to be updated with paths relevant to your chosen data loader and dataset: 50 | * `data_loading.loader_class`: Specifies the Python class for your data loader (e.g., `data_loader.embodiedscan_loader.EmbodiedScanLoader`). 51 | * Dataset-specific paths (e.g., `image_root`, format-specific annotation files like `embodiedscan_ann`). Consult the configuration file and your data loader's requirements. See [Data Loader Documentation](#data-loader-documentation) for more details on adding custom formats. 52 | * `data_generation.output_dir`: The directory where the generated `.annotations.json` files will be saved. 53 | 54 | ## Running Annotation Generation 55 | 56 | The core script for generating annotations is `robospatial/run_generation.py`. 57 | 58 | **Running with Provided Example Data (Recommended First Step):** 59 | 60 | We provide a small example scene with input annotations and images in the `example_data/` directory. This allows you to test the generation pipeline without downloading large datasets. 61 | 62 | 1. **Navigate to the `robospatial` directory:** 63 | ```bash 64 | cd robospatial 65 | ``` 66 | 2. **Run the generation script:** 67 | ```bash 68 | python run_generation.py --config configs/example_dataset.yaml 69 | ``` 70 | This will process only the example scene defined in `example_dataset.yaml` and generate the annotations in the `example_data/example_qa` folder. 71 | 72 | **Running on Full Datasets:** 73 | 74 | Once you have confirmed the example works and have downloaded your target datasets: 75 | 76 | 1. **Configure your data loader:** Ensure the `data_loading` section in your chosen configuration file (e.g., `configs/example_dataset.yaml`) correctly points to your dataset paths and uses the appropriate `loader_class`. 77 | 2. **Run the script:** 78 | ```bash 79 | cd robospatial 80 | python run_generation.py --config configs/your_chosen_config.yaml 81 | ``` 82 | 83 | This command will process all scenes found by the data loader using the settings defined in `your_chosen_config.yaml`. 84 | 85 | **Command-Line Options:** 86 | 87 | * `--config <path_to_config>`: **(Required)** Specifies the path to the YAML configuration file. 88 | * `--scene <scene_name>`: Process only a single specific scene. 89 | ```bash 90 | python run_generation.py --config configs/embodiedscan.yaml --scene "scannet/scene0191_00" 91 | ``` 92 | * `--image <image_name>`: Process only a single specific image within the specified scene (requires `--scene`). Useful for debugging. 93 | ```bash 94 | python run_generation.py --config configs/embodiedscan.yaml --scene "scannet/scene0191_00" --image "00090.jpg" 95 | ``` 96 | * `--range <start_idx> <end_idx>`: Process a specific range of scenes based on their index in the loaded list (inclusive start, inclusive end). 97 | ```bash 98 | python run_generation.py --config configs/embodiedscan.yaml --range 0 10 # Process first 11 scenes 99 | ``` 100 | * `--num_workers <num>`: Specify the number of parallel worker threads to use for processing scenes. Overrides the `num_workers` setting in the config file. Defaults to `min(os.cpu_count(), 4)` if neither is provided. 101 | ```bash 102 | python run_generation.py --config configs/embodiedscan.yaml --num_workers 8 103 | ``` 104 | * `--dry-run`: Process only the first 5 images of each scene. Useful for quickly testing the pipeline. 
105 | ```bash 106 | python run_generation.py --config configs/embodiedscan.yaml --dry-run 107 | ``` 108 | 109 | ## Visualizing Input/Outputs 110 | 111 | Two scripts are provided in the `scripts/` directory for visualizing inputs/outputs: 112 | 113 | ### 1. Visualizing Input Data (`scripts/visualize_input.py`) 114 | 115 | Use this script to check if your input annotations (e.g., 3D bounding boxes from your dataset's original format, after conversion by your data loader) are being loaded and interpreted correctly. It reads the intermediate JSON format produced by the data loader for a single image and overlays the 3D bounding boxes onto the image. 116 | 117 | **Usage:** 118 | 119 | ```bash 120 | python scripts/visualize_input.py \ 121 | --image_path <path_to_image> \ 122 | --annotation_file <path_to_input_annotation_file> 123 | ``` 124 | 125 | * Replace `<path_to_image>` with the direct path to the image file. 126 | * Replace `<path_to_input_annotation_file>` with the path to the JSON file representing the *input* annotations for that image (this file's location and naming depend on your data loader implementation). 127 | 128 | **Example using the provided example data:** 129 | ```bash 130 | python scripts/visualize_input.py \ 131 | --image_path example_data/images/example_dataset/example_scene/example_image.jpg \ 132 | --annotation_file example_data/annotations/example_input.json 133 | ``` 134 | 135 | ### 2. Visualizing Generated Output (`scripts/visualize_output.py`) 136 | 137 | Use this script to debug and inspect the spatial relationships generated by `run_generation.py`. It reads the final `.annotations.json` file for a specific image and allows you to visualize different types of generated annotations, including object grounding and spatial relationships (context, configuration, compatibility). 138 | 139 | **Usage:** 140 | 141 | ```bash 142 | python scripts/visualize_output.py \ 143 | --image_path <path_to_image> \ 144 | --annotation_file <output_dir>/<dataset_name>/<scene_name>/<image_name>.annotations.json \ 145 | --object_3d_grounding \ 146 | --context 147 | ``` 148 | 149 | * Replace `<path_to_image>` with the direct path to the image file. 150 | * Replace `<output_dir>` with the path used in your configuration's `data_generation.output_dir`. 151 | * Adjust `<dataset_name>`, `<scene_name>`, and `<image_name>` to match the specific output file you want to visualize. 152 | * Include flags like `--object_2d_grounding`, `--object_3d_grounding`, `--context`, `--configuration`, or `--compatibility` to select what to visualize. Use the `--verbose` or `-v` flag for more detailed output. Refer to the script's internal documentation (`--help`) for detailed controls and options. 153 | 154 | **Example using the provided example data (run the generation first):** 155 | ```bash 156 | python scripts/visualize_output.py \ 157 | --image_path example_data/images/example_dataset/example_scene/example_image.jpg \ 158 | --annotation_file example_data/example_qa/example_scene/example_image.jpg.annotations.json \ 159 | --object_3d_grounding \ 160 | --context 161 | ``` 162 | 163 | ## Data Loader Documentation 164 | 165 | This project supports adding custom data loaders to handle different 3D dataset formats. The configuration file (`data_loading.loader_class`) specifies which loader to use. 
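
If you want to see how that dotted `loader_class` string becomes a loader instance, the sketch below shows one minimal way to resolve it with `importlib`. The function name `build_loader` and the choice to pass the full config dictionary to the loader are illustrative assumptions rather than the exact wiring in `run_generation.py`; it also assumes you run it from the `robospatial/` directory so the `data_loader` package is importable.

```python
# Minimal sketch (not the exact run_generation.py code): resolve the
# "module.ClassName" string from the config and instantiate the loader.
import importlib

import yaml


def build_loader(config_path):
    with open(config_path) as f:
        config = yaml.safe_load(f)

    loader_path = config["data_loading"]["loader_class"]  # e.g. "data_loader.example_loader.ExampleLoader"
    module_name, class_name = loader_path.rsplit(".", 1)
    loader_cls = getattr(importlib.import_module(module_name), class_name)

    # BaseLoader subclasses are initialized with the config (see data_loader/README.md);
    # whether they expect the full dict or only the data_loading section may vary.
    return loader_cls(config)


loader = build_loader("configs/example_dataset.yaml")
```
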
166 | 167 | For detailed instructions on the expected interface for a data loader and how to implement your own, please refer to the README within the data loader directory: [**`robospatial/data_loader/README.md`**](robospatial/data_loader/README.md) 168 | 169 | ## Project Structure 170 | 171 | For a detailed explanation of the annotation generation logic and hyperparameters within the `spatial_analysis` modules, please refer to the [**`robospatial/README.md`**](robospatial/README.md). 172 | 173 | * `robospatial/`: Main source code directory. 174 | * `configs/`: Contains YAML configuration files (e.g., `example_config.yaml`). 175 | * `data_loader/`: Contains modules for loading and interfacing with different 3D datasets. Includes examples like `embodiedscan_loader.py` and can be extended with custom loaders. See the [README](robospatial/data_loader/README.md) in this directory for details. 176 | * `spatial_analysis/`: Modules performing the core spatial reasoning and annotation generation logic. 177 | * `annotation_generator.py`: Orchestrates the generation process for a single scene by calling functions from `spatial_analysis`. 178 | * `run_generation.py`: Main script to run the annotation generation across datasets/scenes based on configuration. 179 | 180 | ## Output Files 181 | 182 | * **`<output_dir>/<dataset_name>/<scene_name>/<image_name>.annotations.json`**: The primary output. Contains the generated spatial annotations for a single image. 183 | * **`generation_progress.json`**: Stores a list of scenes that have been successfully processed. This allows the script to resume if interrupted. Located in the directory where `run_generation.py` is executed. 184 | * **`generation_stats.json`**: Contains aggregated statistics about the generated annotations (e.g., counts of each annotation type) overall and per-dataset. Located in the directory where `run_generation.py` is executed. 185 | 186 | ## Acknowledgements 187 | 188 | We thank the authors of [EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan/tree/main) for providing their unified annotations for various 3D scan datasets, which served as the foundation for this project's data loading capabilities. 189 | 190 | ## Contact 191 | - Luke Song: song.1855@osu.edu 192 | - NVIDIA internal: Valts Blukis (vblukis@nvidia.com), Jonathan Tremblay (jtremblay@nvidia.com) 193 | - Or GitHub Issues! 194 | 195 | ## Citation 196 | 197 | **BibTeX:** 198 | ```bibtex 199 | @inproceedings{song2025robospatial, 200 | author = {Song, Chan Hee and Blukis, Valts and Tremblay, Jonathan and Tyree, Stephen and Su, Yu and Birchfield, Stan}, 201 | title = {{RoboSpatial}: Teaching Spatial Understanding to {2D} and {3D} Vision-Language Models for Robotics}, 202 | booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 203 | year = {2025}, 204 | note = {Oral Presentation}, 205 | } 206 | ``` 207 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Processing EmbodiedScan Data 2 | 3 | To use the EmbodiedScan dataset with this project, you first need to download and process the raw data according to the instructions provided by the original EmbodiedScan authors. 4 | 5 | ## 1. 
Download and Preprocess Raw Data 6 | 7 | Follow the steps outlined in the official EmbodiedScan data preparation guide: 8 | [https://github.com/OpenRobotLab/EmbodiedScan/tree/main/data](https://github.com/OpenRobotLab/EmbodiedScan/tree/main/data) 9 | 10 | Specifically, you need to complete steps 1 through 5 and step 7: 11 | 1. Download ScanNet v2 data. 12 | 2. Download 3RScan data. 13 | 3. Download Matterport3D data. 14 | 4. Download ARKitScenes data. 15 | 5. Download EmbodiedScan annotations (`.pkl` files). 16 | 7. Extract images for ScanNet and 3RScan using the provided scripts (`generate_image_scannet.py` and `generate_image_3rscan.py`). 17 | 18 | **Note:** You do **not** need to perform step 6 (extracting occupancy annotations) for this project. 19 | 20 | Ensure your final data directory structure matches the one specified in the EmbodiedScan README. 21 | 22 | ## 2. Update Configuration File 23 | 24 | Once the data is downloaded and processed, you need to update the configuration file to point to the correct locations on your system. 25 | 26 | Edit the `robospatial/configs/embodiedscan.yaml` file. 27 | 28 | Update the following paths under the `data_loading` section: 29 | - `image_root`: Set this to the directory where the extracted images (e.g., `scannet/posed_images`, `3rscan//sequence`) are located. The specific structure might depend on how you organized the datasets downloaded in step 1. 30 | - `embodiedscan_ann`: Update the `train`, `val`, and `test` paths to point to the downloaded `.pkl` annotation files (from step 5). 31 | 32 | Example relevant section in `robospatial/configs/embodiedscan.yaml`: 33 | 34 | ```yaml 35 | data_loading: 36 | # ... other settings ... 37 | image_root: /path/to/your/processed/image/data # <- UPDATE THIS 38 | embodiedscan_ann: 39 | train: /path/to/your/EmbodiedScan/data/embodiedscan_infos_train.pkl # <- UPDATE THIS 40 | val: /path/to/your/EmbodiedScan/data/embodiedscan_infos_val.pkl # <- UPDATE THIS 41 | test: /path/to/your/EmbodiedScan/data/embodiedscan_infos_test.pkl # <- UPDATE THIS 42 | # ... other settings ... 43 | ``` 44 | 45 | After completing these steps, you should be able to load and use the EmbodiedScan dataset with the project. 
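
As an optional sanity check before running the full pipeline, the hedged snippet below (run from the repository root) verifies that the annotation paths you entered in `embodiedscan.yaml` exist and that the `.pkl` files can be unpickled. It makes no assumptions about the internal structure of the EmbodiedScan info files beyond them being pickle-loadable; the printed top-level keys depend on the EmbodiedScan release you downloaded.

```python
# Optional sanity check: confirm the annotation .pkl paths in
# robospatial/configs/embodiedscan.yaml exist and can be unpickled.
import pickle

import yaml

with open("robospatial/configs/embodiedscan.yaml") as f:
    cfg = yaml.safe_load(f)

for split, path in cfg["data_loading"]["embodiedscan_ann"].items():
    try:
        with open(path, "rb") as pkl_file:
            ann = pickle.load(pkl_file)
    except FileNotFoundError:
        print(f"{split}: MISSING {path} (update embodiedscan.yaml)")
        continue
    summary = sorted(ann.keys()) if isinstance(ann, dict) else type(ann).__name__
    print(f"{split}: OK {path} -> {summary}")
```
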
46 | -------------------------------------------------------------------------------- /example_data/annotations/example_input.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": "example_dataset", 3 | "scene_name": "example_scene", 4 | "image_name": "example_image.jpg", 5 | "image_size": [ 6 | 1280, 7 | 800 8 | ], 9 | "visible_instance_ids": [ 10 | 0, 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 5, 16 | 6, 17 | 7 18 | ], 19 | "camera_annotations": { 20 | "extrinsic": [ 21 | [ 22 | 1.0, 23 | 0.0, 24 | 0.0, 25 | 0.0 26 | ], 27 | [ 28 | 0.0, 29 | -0.6427876096865394, 30 | 0.766044443118978, 31 | 0.0 32 | ], 33 | [ 34 | 0.0, 35 | -0.766044443118978, 36 | -0.6427876096865394, 37 | 0.0 38 | ], 39 | [ 40 | 0.0, 41 | 0.0, 42 | 0.0, 43 | 1.0 44 | ] 45 | ], 46 | "intrinsic": [ 47 | [ 48 | 669.9725341796875, 49 | 0.0, 50 | 640.0, 51 | 0.0 52 | ], 53 | [ 54 | 0.0, 55 | 669.9725341796875, 56 | 400.0, 57 | 0.0 58 | ], 59 | [ 60 | 0.0, 61 | 0.0, 62 | 1.0, 63 | 0.0 64 | ], 65 | [ 66 | 0.0, 67 | 0.0, 68 | 0.0, 69 | 1.0 70 | ] 71 | ], 72 | "axis_align_matrix": [ 73 | [ 74 | 1.0, 75 | 0.0, 76 | 0.0, 77 | 0.0 78 | ], 79 | [ 80 | 0.0, 81 | 1.0, 82 | 0.0, 83 | 0.0 84 | ], 85 | [ 86 | 0.0, 87 | 0.0, 88 | 1.0, 89 | 0.0 90 | ], 91 | [ 92 | 0.0, 93 | 0.0, 94 | 0.0, 95 | 1.0 96 | ] 97 | ] 98 | }, 99 | "objects": [ 100 | { 101 | "Name": "Brown potato.", 102 | "bbox_3d": [ 103 | [ 104 | 0.0965536352212597, 105 | 0.6457258245959925, 106 | -0.708612940810858, 107 | 0.12224379445579278, 108 | 0.14591369211531313, 109 | 0.07074560840853938, 110 | -1.11690709649976, 111 | 0.0, 112 | 0.0 113 | ] 114 | ] 115 | }, 116 | { 117 | "Name": "Yellow banana.", 118 | "bbox_3d": [ 119 | [ 120 | 0.6281575669935243, 121 | 0.8318574965411679, 122 | -0.7111925077538456, 123 | 0.11816268404379843, 124 | 0.20877171553860133, 125 | 0.06713331741369433, 126 | 3.499465406810771, 127 | 0.0, 128 | 0.0 129 | ] 130 | ] 131 | }, 132 | { 133 | "Name": "Light blue cylindrical cup.", 134 | "bbox_3d": [ 135 | [ 136 | 0.24272731261345826, 137 | 0.5509605710209379, 138 | -0.7126530522734699, 139 | 0.13114139619345888, 140 | 0.14536641809246212, 141 | 0.10336367309178618, 142 | 2.398133451820688, 143 | 0.0, 144 | 0.0 145 | ] 146 | ] 147 | }, 148 | { 149 | "Name": "Green and white bok choy.", 150 | "bbox_3d": [ 151 | [ 152 | -0.16947051050441583, 153 | 0.48852897068541257, 154 | -0.7355282463047628, 155 | 0.17038007531409313, 156 | 0.25531524115276, 157 | 0.07396988788683145, 158 | -1.0229929055844709, 159 | 0.0, 160 | 0.0 161 | ] 162 | ] 163 | }, 164 | { 165 | "Name": "White circular alarm clock.", 166 | "bbox_3d": [ 167 | [ 168 | -0.29906058568573957, 169 | 0.7773581812536778, 170 | -0.6563527562208662, 171 | 0.11525623601554463, 172 | 0.14794219955016966, 173 | 0.13351970865123064, 174 | -1.453594536933811, 175 | 0.0, 176 | 0.0 177 | ] 178 | ] 179 | }, 180 | { 181 | "Name": "Orange juice carton with orange and white colors, \"ORANGE JUICE\" text.", 182 | "bbox_3d": [ 183 | [ 184 | -0.11910513774912884, 185 | 0.6559001853803491, 186 | -0.6265974918806398, 187 | 0.16297827806663062, 188 | 0.15375061767211012, 189 | 0.2198072386901998, 190 | -1.3278610167121037, 191 | 0.0, 192 | 0.0 193 | ] 194 | ] 195 | }, 196 | { 197 | "Name": "Heart-shaped pink eraser.", 198 | "bbox_3d": [ 199 | [ 200 | -0.11139781936853714, 201 | 0.6552562452466136, 202 | -0.572456284198834, 203 | 0.05915619125851447, 204 | 0.11470179230647526, 205 | 0.08474071881173639, 206 | -1.1605898554992926, 207 | 0.0, 208 | 0.0 209 | ] 210 | ] 211 
| }, 212 | { 213 | "Name": "Wooden pallet crate", 214 | "bbox_3d": [ 215 | [ 216 | 0.5996147851768805, 217 | 0.9048717598769047, 218 | -0.5165727925513247, 219 | 0.3457252111932344, 220 | 0.3813317410243554, 221 | 0.32189707409624985, 222 | -1.538087589994403, 223 | 0.0, 224 | 0.0 225 | ] 226 | ] 227 | } 228 | ] 229 | } 230 | -------------------------------------------------------------------------------- /example_data/images/example_dataset/example_scene/example_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RoboSpatial/59b091d7694a724d3a46bb2b636d1bc49b899eb9/example_data/images/example_dataset/example_scene/example_image.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML 2 | numpy 3 | tqdm 4 | opencv-python 5 | matplotlib 6 | open3d 7 | -------------------------------------------------------------------------------- /robospatial/README.md: -------------------------------------------------------------------------------- 1 | # RoboSpatial Annotation Generation Logic 2 | 3 | For those who wants to hack the codebase! 4 | 5 | ## Annotation Generation Details 6 | 7 | This section provides a more detailed overview of the logic used to generate each type of spatial annotation and highlights the key configuration parameters found in `configs/example_config.yaml` (and other configuration files) that control this process. 8 | 9 | ### 1. Object Grounding (`spatial_analysis/grounding.py`) 10 | 11 | * **Purpose:** Generates a tight 2D axis-aligned bounding box (`clipped_bbox`) encompassing all visible pixels of an object in the image. 12 | * **Logic:** 13 | * Relies on a pre-calculated 2D boolean `occupancy_map` for each object, which indicates the precise pixels covered by the object's 3D model when projected onto the image. 14 | * It finds the minimum and maximum `x` (column) and `y` (row) coordinates within this occupancy map. 15 | * These bounds directly define the `[xmin, ymin, xmax, ymax]` coordinates of the 2D bounding box. 16 | * **Key Parameters:** None directly in the configuration for this step; it depends on the accuracy of the input 3D models and the camera parameters used to generate the `occupancy_map`. 17 | 18 | ### 2. Spatial Context (`spatial_analysis/context/context.py`) 19 | 20 | * **Purpose:** Samples points in the empty space surrounding a *reference object* and categorizes them based on their spatial relationship (infront, behind, left, right) in three different frames: `objectcentric`, `cameracentric`, and `worldcentric`. 21 | * **Logic:** 22 | 1. Calculates empty space on the floor using a top-down 2D grid based on environment geometry. 23 | 2. Identifies empty grid points within a specific distance (`threshold`) from the reference object's 2D footprint. 24 | 3. Projects these candidate 3D points (at the object's base height) onto the image, filtering those outside the view or behind the camera. 25 | 4. Checks if the projected 2D points are occluded by *other* objects using a pre-computed environment occupancy map. 26 | 5. Categorizes the non-occluded points based on their position relative to the reference object in the three frames (using object orientation for object-centric, and pixel/depth coordinates for camera/world-centric). 27 | 6. Randomly samples up to `num_samples` non-occluded points for each valid category (frame + direction). 
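
To make step 5 above concrete, the sketch below shows a stripped-down version of the camera-centric test only: a candidate point is labeled left/right from its projected pixel column and infront/behind from its camera-frame depth, both relative to the reference object's center. The function name, the assumption that `extrinsic` maps world to camera coordinates, and the sign conventions are illustrative; the actual implementation in `spatial_analysis/context/context.py` additionally handles the object-centric and world-centric frames, occlusion filtering, and sampling.

```python
# Simplified sketch of the camera-centric part of step 5. Assumes `extrinsic`
# is a 4x4 world-to-camera matrix and `intrinsic` is a 3x3 or 4x4 pinhole matrix.
import numpy as np


def camera_centric_direction(point_world, ref_center_world, extrinsic, intrinsic):
    extrinsic = np.asarray(extrinsic, dtype=float)
    K = np.asarray(intrinsic, dtype=float)[:3, :3]

    def project(p_world):
        p_cam = extrinsic @ np.append(np.asarray(p_world, dtype=float), 1.0)  # world -> camera
        uvw = K @ p_cam[:3]                                                   # camera -> image plane
        return uvw[0] / uvw[2], p_cam[2]                                      # pixel column u, depth

    u_pt, depth_pt = project(point_world)
    u_ref, depth_ref = project(ref_center_world)

    horizontal = "left" if u_pt < u_ref else "right"            # smaller column -> further left in the image
    frontal = "infront" if depth_pt < depth_ref else "behind"   # smaller depth -> closer to the camera
    return horizontal, frontal
```
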
28 | * **Key Parameters (`configs/example_config.yaml` -> `data_generation.generation_options`):** 29 | * `context_threshold`: Maximum distance (in world units, e.g., meters) from the reference object's footprint to consider sampling points. 30 | * `context_grid_resolution`: The size of each cell in the temporary top-down 2D grid used for finding nearby empty space. Smaller values are more precise but computationally more expensive. 31 | * `context_num_samples`: The maximum number of points to sample and output for each valid category (e.g., max 10 points for 'camera_centric' 'left'). 32 | 33 | ### 3. Spatial Configuration (`spatial_analysis/configuration/configuration.py`) 34 | 35 | * **Purpose:** Determines the pairwise spatial relationship between two objects (`obj1` relative to `obj2`) across the three reference frames (camera, world, object). Relationships include left/right, infront/behind, above/below, and overlapping. 36 | * **Logic:** 37 | 1. Calculates various geometric metrics for both objects (projected 2D bounds, average visible depth, world Z bounds, etc.) using their individual pre-computed `occupancy_map`s. 38 | 2. **Camera/World-centric:** Compares these metrics. The `strictness` parameter controls the comparison method: 39 | * `'strict'`: Uses the absolute min/max bounds. Requires clear separation; considers objects overlapping if their projected bounds intersect at all. Sensitive to partial occlusions. 40 | * `'lenient'`: Uses projected centers, average visible depths, and average Z coordinates. More robust to partial occlusion but might misclassify tightly packed objects. 41 | 3. **Object-centric:** Uses the Separating Axis Theorem (SAT) on the 3D OBBs to check for overlap. If not overlapping, it determines the direction based on the relative position of `obj1`'s center projected onto `obj2`'s local forward and right axes. Above/below still uses world Z coordinates. 42 | * **Key Parameters (`configs/example_config.yaml` -> `data_generation.generation_options`):** 43 | * `spatial_configuration_strictness`: (`'strict'` or `'lenient'`) Selects the comparison logic for camera-centric and world-centric frames. Default is `'lenient'`. 44 | * `pairwise_relationship_mode`: (`'unique_categories_only'` or `'all_visible_objects'`) Determines which pairs of objects are considered for configuration analysis. `'unique_categories_only'` only considers pairs where each object is the only instance of its category visible, while `'all_visible_objects'` considers all permutations of visible objects. 45 | 46 | ### 4. Spatial Compatibility (`spatial_analysis/compatibility/compatibility.py`) 47 | 48 | * **Purpose:** Assesses whether one object (`obj_a`) *could* be placed in the empty space relative to another (`obj_b`) without collision. It checks directions like left, right, in front, behind, and specifically `on_top`. 49 | * **Logic:** 50 | 1. Samples potential placement points around `obj_b` using the Spatial Context logic (`get_point_in_space_relative_to_object`), using a dynamic threshold based on the sizes of `obj_a` and `obj_b`. 51 | 2. For each sampled point, it simulates placing `obj_a` horizontally centered at that point's 2D location. 52 | 3. It checks for collisions between the placed `obj_a` (potentially with a `buffer_ratio`) and: 53 | * The static environment (using a 2D occupancy grid). 54 | * The reference object `obj_b` (maintaining a `min_distance`). 55 | 4. 
A relationship (e.g., 'left') is considered compatible (`True`) if *any* sampled point corresponding to that relationship allows `obj_a` to fit. 56 | 5. A separate, simpler check (`can_fit_on_top`) determines the 'on_top' relationship by comparing the horizontal dimensions of `obj_a` and `obj_b`, but only if `obj_a` is placeable and `obj_b` has a flat surface. 57 | * **Key Parameters (`configs/example_config.yaml` -> `data_generation.generation_options`):** 58 | * `compatibility_grid_resolution`: Resolution of the 2D grid used for collision checking against the environment. 59 | * `compatibility_num_samples`: How many potential placement points to sample around `obj_b`. 60 | * `compatibility_min_distance`: The minimum required distance (in world units) between the placed `obj_a` and the reference `obj_b`. 61 | * `compatibility_buffer_ratio`: A ratio applied to `obj_a`'s dimensions during collision checks, effectively adding a safety margin. 0 means no buffer, 0.1 means 10% buffer. 62 | * `context_threshold`: The *base* threshold used for sampling points (dynamically increased based on object sizes). 63 | 64 | --- 65 | 66 | ## Project Structure 67 | 68 | * `configs/`: Contains YAML configuration files (e.g., `example_config.yaml`). 69 | * `data_loader/`: Modules for loading and interfacing with different 3D datasets. Includes `embodiedscan_loader.py` and a [README](data_loader/README.md) explaining how to add custom loaders. 70 | * `spatial_analysis/`: Modules performing the core spatial reasoning and annotation generation logic. 71 | * `context/`: Logic for spatial context (points relative to an object). 72 | * `configuration/`: Logic for spatial configuration (relative position between objects). 73 | * `compatibility/`: Logic for spatial compatibility (fitting assessment). 74 | * `grounding.py`: Logic for 2D object grounding. 75 | * `relationships.py`: High-level wrappers for spatial analysis functions. 76 | * `relationship_utils.py`: Utility functions for geometry and projections. 77 | * `topdown_map.py`: Functions for creating 2D top-down occupancy grids. 78 | * `obj_properties.py`: Lists defining object properties (e.g., `items_with_face`). 79 | * `annotation_generator.py`: Orchestrates the generation process for a single scene. 80 | * `run_generation.py`: Main script to run annotation generation across datasets/scenes. 81 | 82 | ## Output Files 83 | 84 | * **`<output_dir>/<dataset_name>/<scene_name>/<image_name>.annotations.json`**: The primary output. Contains the generated spatial annotations for a single image, structured by type (grounding, unary relations, pairwise relations). 85 | * **`generation_progress.json`**: Stores a map of datasets to lists of scene names that have been successfully processed. Allows the script to resume if interrupted. Located in the directory where `run_generation.py` is executed. 86 | * **`generation_stats.json`**: Contains aggregated statistics about the generated annotations (e.g., counts of each annotation type) overall and per-dataset. Located in the directory where `run_generation.py` is executed. 87 | -------------------------------------------------------------------------------- /robospatial/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /robospatial/annotation_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | """Core annotation generation logic for a single scene. 11 | 12 | This module defines the `generate_and_save_annotations` function, which is responsible 13 | for processing all images within a given scene. It calculates various spatial 14 | relationships and grounding information based on object data (OBBs, categories) 15 | and camera parameters. 16 | 17 | It utilizes functions from the `spatial_analysis` package to compute: 18 | - Object Grounding: Bounding boxes in 2D. 19 | - Spatial Context: Points relative to an object (e.g., in front, behind). 20 | - Spatial Compatibility: Fit assessment (e.g., can A fit on B). 21 | - Spatial Configuration: Relative positioning (e.g., left/right, above/below). 22 | 23 | The generated annotations are saved as JSON files, one per processed image. 24 | This module is typically called by a higher-level script (e.g. `run_generation.py`) 25 | that handles dataset iteration and overall workflow management. 26 | """ 27 | 28 | 29 | import itertools 30 | import os 31 | import json 32 | import cv2 33 | from collections import defaultdict 34 | import numpy as np 35 | from tqdm import tqdm 36 | 37 | from spatial_analysis.grounding import get_object_grounding 38 | from spatial_analysis.relationships import get_spatial_configuration, get_spatial_compatibility, get_spatial_context 39 | from spatial_analysis.relationship_utils import calculate_occupied_pixels 40 | 41 | 42 | 43 | # --- Main Annotation Generation Function --- 44 | 45 | def generate_and_save_annotations(loader, dataset_name, scene_name, images_ann_dict, config, num_workers): 46 | """ 47 | Generates and saves annotations for a scene based on the configuration. 48 | Handles multiple annotation types: localization, compatibility, point_grounding, bbox_grounding. 
49 | """ 50 | 51 | # --- Statistics Initialization --- 52 | stats = defaultdict(int) 53 | stats['num_total_images'] = 0 54 | 55 | # --- Read Compatibility Check Configs --- 56 | comp_grid_res = config["data_generation"]["generation_options"]["compatibility_grid_resolution"] 57 | comp_min_distance = config["data_generation"]["generation_options"]["compatibility_min_distance"] 58 | comp_buffer_ratio = config["data_generation"]["generation_options"]["compatibility_buffer_ratio"] 59 | comp_num_samples = config["data_generation"]["generation_options"]["compatibility_num_samples"] 60 | 61 | # --- Read Spatial Context Configs --- 62 | context_threshold = config["data_generation"]["generation_options"]["context_threshold"] 63 | context_grid_res = config["data_generation"]["generation_options"]["context_grid_resolution"] 64 | context_num_samples = config["data_generation"]["generation_options"]["context_num_samples"] 65 | 66 | # --- Read Spatial Configuration Strictness --- 67 | spatial_config_strictness = config["data_generation"]["generation_options"]["spatial_configuration_strictness"] 68 | 69 | # --- Read Pairwise Relationship Mode --- 70 | pairwise_mode = config["data_generation"]["generation_options"]["pairwise_relationship_mode"] 71 | 72 | 73 | # --- Generate Annotations --- 74 | # Determine the iterator based on whether tqdm should be used 75 | image_iterator = images_ann_dict.items() 76 | if num_workers <= 1: 77 | # Wrap with tqdm only if single-threaded 78 | image_iterator = tqdm(image_iterator, desc=f"Processing images in {scene_name}", leave=False) 79 | 80 | for image_name, image_ann in image_iterator: 81 | 82 | # Initial setup 83 | relationships_to_generate = config["data_generation"]["generation_options"]["spatial_relationship_types"] 84 | extrinsic = image_ann['extrinsic'] 85 | intrinsic = image_ann['intrinsic'] 86 | 87 | # --- Image Setup --- 88 | image_path = os.path.join(config["data_loading"]["image_root"], image_ann["img_path"]) # Use path as identifier 89 | image_file = cv2.imread(image_path) 90 | 91 | if image_file is None: 92 | print(f"Warning: Could not read image {image_path}. Skipping.") 93 | continue 94 | h, w, _ = image_file.shape 95 | image_size = (w, h) 96 | 97 | # Process visible objects in the image 98 | vis_objs, unique_vis_categories, multi_vis_categories, floor_bound, all_objs = loader.list_objects(dataset_name, scene_name, image_ann) 99 | 100 | if len(all_objs) == 0: 101 | print(f"Warning: No objects detected in image {image_name}. Skipping image.") 102 | continue 103 | 104 | # Get all OBBs 105 | obbs = {obj["name"]: obj["obb"] for obj in all_objs.values() if "name" in obj and "obb" in obj} 106 | 107 | # --- Precompute Environment Occupancy Maps (Combined and Individual) --- 108 | # Pass list of object dictionaries to calculate_occupied_pixels 109 | objects_for_occupancy = [obj for obj in all_objs.values() if 'obb' in obj and 'name' in obj] 110 | if not objects_for_occupancy: 111 | print(f"Warning: No objects with OBB and name found for occupancy calculation in {image_name}. 
Skipping image.") 112 | continue # Or handle appropriately 113 | 114 | env_occupancy_map, individual_occupancy_maps = calculate_occupied_pixels( 115 | objects_for_occupancy, extrinsic, intrinsic, image_size 116 | ) 117 | 118 | # --- Annotation Generation --- 119 | spatial_relationships = { 120 | "unary_relations": [], 121 | "pairwise_relations": [] 122 | } 123 | # Initialize defaultdict to create a dictionary with empty lists for grounding keys 124 | object_grounding = [] 125 | generated_something_for_image = False 126 | 127 | # 1. Generate Grounding Annotations (per object type) 128 | if 'spatial_context' in relationships_to_generate or 'object_grounding' in relationships_to_generate: 129 | 130 | for obj_name, obj in vis_objs.items(): 131 | 132 | category = obj["category"] 133 | category = obj.get("category") 134 | obj_map = individual_occupancy_maps.get(obj_name) # Get precomputed map 135 | 136 | if category is None or obj_map is None: 137 | print(f"Warning: Skipping object {obj_name} due to missing category or precomputed occupancy map.") 138 | continue 139 | 140 | # No need to differentiate unique/multi here as we iterate through vis_objs directly 141 | 142 | #NOTE object grounding handles both single and multi instance objects 143 | if 'object_grounding' in relationships_to_generate: 144 | grounding_info = get_object_grounding(obj, obj_map) 145 | 146 | if grounding_info: 147 | results = { 148 | "name": obj_name, 149 | "category": category, 150 | "bbox": grounding_info["clipped_bbox"], 151 | "bbox_3d": obj["bbox_3d"], 152 | } 153 | object_grounding.append(results) 154 | generated_something_for_image = True 155 | stats['num_object_grounding_generated'] += 1 156 | 157 | #NOTE spatial context handles single instance objects 158 | if 'spatial_context' in relationships_to_generate and category in unique_vis_categories: 159 | # Filter obbs to exclude the current object 160 | context_obbs = [obb for name, obb in obbs.items() if name != obj_name] 161 | # Pass precomputed maps to get_spatial_context 162 | points_2d, points_3d, generated = get_spatial_context( 163 | obj, extrinsic, intrinsic, floor_bound, context_obbs, image_size, image_path, 164 | individual_occupancy_maps=individual_occupancy_maps, # Pass individual maps 165 | env_occupancy_map=env_occupancy_map, # Pass combined env map 166 | threshold=context_threshold, # Pass configured threshold 167 | grid_resolution=context_grid_res, # Pass configured grid resolution 168 | num_samples=context_num_samples, # Pass configured num samples 169 | ) 170 | if generated: 171 | results = { 172 | "name": obj_name, 173 | "category": category, 174 | "point_space_2d": points_2d, 175 | "point_space_3d": points_3d, 176 | } 177 | spatial_relationships["unary_relations"].append(results) 178 | generated_something_for_image = True 179 | stats['num_spatial_context_generated'] += 1 180 | 181 | # 2. 
Generate Relationship Annotations (per object pair) 182 | generate_pairwise = 'spatial_configuration' in relationships_to_generate or 'spatial_compatibility' in relationships_to_generate 183 | objects_available_for_pairwise = (pairwise_mode == 'unique_categories_only' and len(unique_vis_categories) >= 2) or \ 184 | (pairwise_mode == 'all_visible_objects' and len(vis_objs) >= 2) 185 | 186 | 187 | if generate_pairwise and objects_available_for_pairwise: 188 | # Determine the iterator based on the mode 189 | if pairwise_mode == 'unique_categories_only': 190 | iterator = itertools.permutations(unique_vis_categories, 2) 191 | get_obj = lambda cat: vis_objs[cat] # Function to get object by category 192 | # Use unique_vis_categories which are typically names/keys in vis_objs for unique items 193 | iterator = itertools.permutations(unique_vis_categories, 2) 194 | # Need to handle potential KeyError if a category name isn't directly a key in vis_objs 195 | # Assuming unique_vis_categories contains keys that *are* in vis_objs 196 | get_obj = lambda cat_key: vis_objs.get(cat_key) 197 | else: # pairwise_mode == 'all_visible_objects' 198 | iterator = itertools.permutations(vis_objs.keys(), 2) 199 | get_obj = lambda key: vis_objs[key] # Function to get object by key 200 | get_obj = lambda key: vis_objs.get(key) 201 | 202 | 203 | for item1_key, item2_key in iterator: 204 | obj1 = get_obj(item1_key) 205 | obj2 = get_obj(item2_key) 206 | 207 | # Skip if objects couldn't be retrieved (e.g., bad key from unique_vis_categories) 208 | if obj1 is None or obj2 is None: 209 | print(f"Warning: Could not retrieve objects for pair ({item1_key}, {item2_key}). Skipping.") 210 | continue 211 | 212 | # Get object names 213 | obj1_name = obj1["name"] 214 | obj2_name = obj2["name"] 215 | # Get object names and categories safely 216 | obj1_name = obj1.get("name") 217 | obj2_name = obj2.get("name") 218 | obj1_cat = obj1.get("category") 219 | obj2_cat = obj2.get("category") 220 | 221 | if not all([obj1_name, obj2_name, obj1_cat, obj2_cat]): 222 | print(f"Warning: Missing name or category for objects in pair ({item1_key}, {item2_key}). 
Skipping.") 223 | continue 224 | 225 | pair_result = { 226 | "pair": (obj1_name, obj2_name), 227 | "pair_category": (obj1["category"], obj2["category"]), 228 | "pair_category": (obj1_cat, obj2_cat), 229 | } 230 | 231 | 232 | if 'spatial_configuration' in relationships_to_generate: 233 | # Pass individual maps to get_spatial_configuration 234 | config_rels = get_spatial_configuration( 235 | obj1, obj2, extrinsic, intrinsic, image_size, individual_occupancy_maps,spatial_config_strictness) 236 | pair_result["spatial_configuration"] = config_rels 237 | generated_something_for_image = True 238 | stats['num_spatial_configuration_pairs'] += 1 239 | 240 | if 'spatial_compatibility' in relationships_to_generate: 241 | # Filter obbs to exclude obj1 242 | compatibility_obbs = [obb for name, obb in obbs.items() if name != obj1_name] 243 | # Pass individual and combined maps to get_spatial_compatibility 244 | comp_rels = get_spatial_compatibility( 245 | obj1, obj2, extrinsic, intrinsic, floor_bound, compatibility_obbs, image_size, image_path, 246 | individual_occupancy_maps=individual_occupancy_maps, # Pass individual maps 247 | env_occupancy_map=env_occupancy_map, # Pass combined env map 248 | grid_resolution=comp_grid_res, # Pass configured grid resolution 249 | num_samples=comp_num_samples, # Pass configured num samples 250 | min_distance=comp_min_distance, # Pass configured min distance 251 | buffer_ratio=comp_buffer_ratio # Pass configured buffer ratio 252 | ) 253 | pair_result["spatial_compatibility"] = comp_rels 254 | generated_something_for_image = True 255 | stats['num_spatial_compatibility_pairs'] += 1 256 | 257 | if len(pair_result) > 2: # Check if more than just pair info was added 258 | spatial_relationships["pairwise_relations"].append(pair_result) 259 | 260 | # --- Save Results --- 261 | if generated_something_for_image: 262 | stats['num_total_images'] += 1 263 | 264 | image_results = { 265 | "dataset": dataset_name, 266 | "scene_name": scene_name, 267 | "image_identifier": image_name, 268 | "image_path": image_path, 269 | "image_size": image_size, 270 | "depth_path": image_ann.get("depth_path", ""), 271 | "visible_instance_ids": image_ann.get('visible_instance_ids', []), 272 | } 273 | 274 | cam_ann = {} 275 | for key in ['extrinsic', 'intrinsic']: 276 | if key in image_ann and image_ann[key] is not None: 277 | cam_ann[key] = image_ann[key].tolist() 278 | image_results["camera_annotations"] = cam_ann 279 | 280 | if object_grounding: 281 | image_results["object_grounding"] = object_grounding 282 | if spatial_relationships: 283 | image_results["spatial_relationships"] = spatial_relationships 284 | 285 | folder_path = os.path.join(config["data_generation"]["output_dir"], scene_name) 286 | os.makedirs(folder_path, exist_ok=True) 287 | 288 | output_suffix = config.get("output_suffix", ".annotations.json") 289 | file_name = f"{image_ann['image_basename']}{output_suffix}" 290 | file_path = os.path.join(folder_path, file_name) 291 | 292 | with open(file_path, 'w') as json_file: 293 | json.dump(image_results, json_file, indent=4) 294 | 295 | # --- Return Scene Statistics --- 296 | scene_ann_stats = { 297 | 'dataset_name': dataset_name, 298 | 'scene_name': scene_name, 299 | 'num_processed_images': stats['num_total_images'], 300 | 'num_spatial_configuration_pairs': stats['num_spatial_configuration_pairs'], 301 | 'num_spatial_compatibility_pairs': stats['num_spatial_compatibility_pairs'], 302 | 'num_object_grounding_generated': stats['num_object_grounding_generated'], 303 | 
'num_spatial_context_generated': stats['num_spatial_context_generated'], 304 | } 305 | 306 | return scene_ann_stats -------------------------------------------------------------------------------- /robospatial/configs/embodiedscan.yaml: -------------------------------------------------------------------------------- 1 | data_generation: 2 | # Number of parallel worker threads for processing scenes. 3 | # Defaults to min(os.cpu_count(), 4) if not specified or overridden by CLI --num_workers. 4 | num_workers: 8 5 | progress_file: generation_progress.json 6 | stats_file: generation_stats.json 7 | output_suffix: .annotations.json 8 | 9 | generation_options: 10 | spatial_relationship_types: 11 | - object_grounding 12 | - spatial_context 13 | - spatial_configuration 14 | - spatial_compatibility 15 | 16 | # Settings for spatial context point generation 17 | context_threshold: 0.5 # Distance threshold from reference object 18 | context_grid_resolution: 0.1 # Grid resolution for empty space check during context generation 19 | context_num_samples: 50 # Number of points to sample for spatial context 20 | 21 | # Settings for spatial compatibility checks 22 | compatibility_grid_resolution: 0.1 # Grid resolution for can_fit checks 23 | compatibility_min_distance: 0.2 # Minimum distance between objects in can_fit checks 24 | compatibility_buffer_ratio: 0.5 # Buffer ratio for can_fit checks 25 | compatibility_num_samples: 50 # Number of samples for point space in can_fit checks 26 | 27 | # Strictness level for spatial configuration checks ('strict' or 'lenient') 28 | spatial_configuration_strictness: 'strict' 29 | 30 | # Mode for calculating pairwise relationships ('unique_categories_only' or 'all_visible_objects') 31 | pairwise_relationship_mode: 'unique_categories_only' # Default to current behavior 32 | 33 | output_dir: /path/to/output/dir/ 34 | 35 | # Data Loader Settings 36 | data_loading: 37 | verbose: True 38 | datasets: 39 | - scannet 40 | - matterport3d 41 | - 3rscan 42 | - arkitscenes 43 | loader_class: data_loader.embodiedscan_loader.EmbodiedScanLoader 44 | annotation_key: embodiedscan_ann 45 | split: # Which splits to use, for EmbodiedScan we use train and val 46 | - train 47 | - val 48 | 49 | image_root: /path/to/your/processed/image/data 50 | embodiedscan_ann: 51 | train: /path/to/embodiedscan_infos_train.pkl 52 | val: /path/to/embodiedscan_infos_val.pkl 53 | test: /path/to/embodiedscan_infos_test.pkl 54 | -------------------------------------------------------------------------------- /robospatial/configs/example_config.yaml: -------------------------------------------------------------------------------- 1 | # Example configuration with minimal required fields for using a custom loader 2 | 3 | # Top-level structure often separates different concerns like loading and generation 4 | 5 | data_loading: 6 | # --- Required Fields --- 7 | 8 | # Specifies the Python class for your custom data loader. (Required) 9 | # Format: "module_name.ClassName" 10 | loader_class: "data_loader.example_loader.ExampleLoader" 11 | 12 | # List of dataset identifiers that this configuration applies to. (Required) 13 | # These identifiers must match keys under the data_loading section 14 | # where dataset-specific parameters are defined. 15 | datasets: 16 | - example_dataset 17 | 18 | # --- Dataset Specific Configuration --- 19 | # Parameters required by the specific loader ('ExampleLoader' in this case). 20 | # These fields are essential *for this loader*. 
21 | annotation_dir: "/path/to/your/dataset/annotations/" 22 | image_root: "/path/to/your/dataset/images/" 23 | 24 | # Specifies which data split(s) to process (e.g., train, validation, test). 25 | # If omitted, the loader might default to a specific split or load all. 26 | # split: 27 | # - "train" 28 | # - "val" 29 | 30 | # Enable verbose logging output during data loading. (Optional, defaults may vary) 31 | # verbose: True 32 | 33 | # Add any *other* parameters required only by your specific CustomDatasetLoader here. 34 | 35 | 36 | data_generation: 37 | # Base directory where all generated QA data will be saved. 38 | output_dir: "/path/to/your/output/qa_data" 39 | 40 | # Number of parallel worker threads for processing scenes. 41 | # If omitted or null, defaults to os.cpu_count() (or 4 if count is unavailable). 42 | # Can be overridden by the --num_workers command-line argument. 43 | num_workers: 1 # Example: Set to 1 to disable parallel processing, or e.g., 4 to use 4 threads. 44 | 45 | # Suffix to append to output annotation filenames. 46 | output_suffix: ".annotations.json" 47 | 48 | # Path to the file used for saving and resuming generation progress. 49 | progress_file: "generation_progress_example.json" 50 | 51 | # Path to the file where generation statistics will be saved. 52 | stats_file: "generation_stats_example.json" 53 | 54 | # --- QA Generation Options --- 55 | # These options control the types and specifics of the Question-Answering pairs generated. 56 | generation_options: 57 | # List of spatial relationship/QA types to generate. 58 | # Common types might include: 59 | # - object_grounding: Questions about the location/existence of specific objects. 60 | # - spatial_context: Questions about objects relative to empty space or general areas. 61 | # - spatial_configuration: Questions about the arrangement of multiple objects. 62 | # - spatial_compatibility: Questions about whether objects *could* fit somewhere. 63 | spatial_relationship_types: 64 | - "object_grounding" 65 | - "spatial_context" 66 | - "spatial_configuration" 67 | - "spatial_compatibility" 68 | # Add any custom QA types your system supports 69 | 70 | # --- Settings for Specific QA Types --- 71 | 72 | # Threshold distance (in meters) from a reference object when generating 73 | # points for "spatial_context" questions. Points further than this are ignored. 74 | context_threshold: 0.5 75 | 76 | # Grid resolution (in meters) used for checking empty space when sampling 77 | # points for "spatial_context" questions. 78 | context_grid_resolution: 0.1 79 | 80 | # Number of points to sample around reference objects for "spatial_context" questions. 81 | context_num_samples: 50 82 | 83 | # Grid resolution (in meters) used for collision checking (e.g., "can this object fit here?") 84 | # in "spatial_compatibility" questions. 85 | compatibility_grid_resolution: 0.1 86 | 87 | # Minimum distance (in meters) between objects when checking for "spatial_compatibility" questions. 88 | compatibility_min_distance: 0 89 | 90 | # Buffer ratio for "spatial_compatibility" checks. 91 | # This is the ratio of how much of the buffer zone can be occupied by other objects. 92 | # For example, if the buffer ratio is 0.6, then the buffer zone can be occupied by other objects 93 | # up to 60% of the time. 94 | compatibility_buffer_ratio: 0.6 95 | 96 | # Number of points to sample on object surfaces or within volumes for 97 | # collision/fitting checks in "spatial_compatibility" questions. 
98 | compatibility_num_samples: 50 99 | 100 | # Strictness level for "spatial_configuration" checks. 101 | # 'strict': Requires precise matching of object arrangements. 102 | # 'lenient': Allows for some tolerance in positions/orientations. 103 | spatial_configuration_strictness: 'lenient' 104 | 105 | # Mode for calculating pairwise relationships between objects (used in several QA types). 106 | # 'unique_categories_only': Considers relationships only between objects of different categories. 107 | # 'all_visible_objects': Considers relationships between all pairs of visible objects. 108 | pairwise_relationship_mode: 'unique_categories_only' -------------------------------------------------------------------------------- /robospatial/configs/example_dataset.yaml: -------------------------------------------------------------------------------- 1 | data_generation: 2 | num_workers: 1 3 | progress_file: generation_progress_example.json # Use a different progress file 4 | stats_file: generation_stats_example.json # Use a different stats file 5 | output_suffix: .annotations.json 6 | 7 | generation_options: 8 | spatial_relationship_types: 9 | - object_grounding 10 | - spatial_context 11 | - spatial_configuration 12 | - spatial_compatibility 13 | 14 | # Settings for spatial context point generation 15 | context_threshold: 0.5 16 | context_grid_resolution: 0.1 17 | context_num_samples: 50 18 | 19 | # Settings for spatial compatibility checks 20 | compatibility_grid_resolution: 0.1 21 | compatibility_min_distance: 0 22 | compatibility_buffer_ratio: 0.6 23 | compatibility_num_samples: 50 24 | 25 | # Strictness level for spatial configuration checks 26 | spatial_configuration_strictness: lenient 27 | 28 | # Mode for calculating pairwise relationships 29 | pairwise_relationship_mode: unique_categories_only 30 | 31 | # Adjust output directory as needed 32 | output_dir: ../example_data/example_qa 33 | 34 | # Data Loader Settings for Example Dataset JSON Annotations 35 | data_loading: 36 | verbose: True 37 | # Define the dataset name(s) you want to load. 38 | # The loader will look for this key in the 'dataset' field within the JSON files. 39 | datasets: 40 | - example_dataset 41 | 42 | # Specify the modified ExampleLoader class 43 | loader_class: data_loader.example_loader.ExampleLoader 44 | 45 | # Specify the directory containing the JSON annotation files 46 | annotation_dir: ../example_data/annotations/ 47 | 48 | # Specify the directory containing the images 49 | image_root: ../example_data/images/ 50 | -------------------------------------------------------------------------------- /robospatial/data_loader/README.md: -------------------------------------------------------------------------------- 1 | # Implementing a Custom Data Loader 2 | 3 | This document outlines the steps and requirements for implementing a custom data loader compatible with the RoboSpatial annotation generation pipeline. 4 | 5 | ## Overview 6 | 7 | The data loader is responsible for interfacing with your specific dataset format and providing the necessary information (scenes, images, object instances, metadata) to the generation pipeline. To ensure compatibility, your custom loader must inherit from the `BaseLoader` abstract base class (`robospatial.data_loader.base_loader.BaseLoader`) and implement its required methods. 8 | 9 | ## BaseLoader Interface 10 | 11 | Your custom loader class must implement the following methods: 12 | 13 | ### `__init__(self, config)` 14 | 15 | * **Purpose:** Initializes the data loader. 
This typically involves loading annotations, setting up paths, and potentially pre-processing metadata.
16 | * **Args:**
17 | * `config (dict)`: A dictionary containing the `data_loading` section from the configuration file (e.g., `configs/embodiedscan.yaml`). This allows access to dataset paths, annotation file locations, selected datasets, splits, and other relevant parameters.
18 | * **Implementation Notes:**
19 | * Use the `config` dictionary to locate and load your dataset's annotation files.
20 | * Store necessary metadata, such as class labels and mappings, as instance variables.
21 | * Organize the loaded data in a way that facilitates efficient retrieval by the other methods (e.g., nested dictionaries keyed by dataset and scene name, as seen in `EmbodiedScanLoader`).
22 | 
23 | ### `list_scenes(self, dataset_list)`
24 | 
25 | * **Purpose:** Provides a generator that yields information about each scene within the specified datasets.
26 | * **Args:**
27 | * `dataset_list (list)`: A list of dataset names (strings) requested by the pipeline (e.g., `['scannet', '3rscan']`).
28 | * **Returns:**
29 | * `generator`: Yields tuples of `(dataset_name, scene_idx, scene_name)`.
30 | * `dataset_name (str)`: The name of the dataset the scene belongs to.
31 | * `scene_idx (int)`: A unique index for the scene within its dataset (can be a simple counter).
32 | * `scene_name (str)`: A unique identifier for the scene, often including the dataset prefix (e.g., `'scannet/scene0000_00'`). This name is used in subsequent calls.
33 | 
34 | ### `list_images(self, dataset_name, scene_name)`
35 | 
36 | * **Purpose:** Lists all images (or viewpoints) associated with a specific scene.
37 | * **Args:**
38 | * `dataset_name (str)`: The name of the dataset.
39 | * `scene_name (str)`: The unique identifier of the scene (obtained from `list_scenes`).
40 | * **Returns:**
41 | * `dict`: A dictionary where keys are unique image identifiers (e.g., `'<scene_name>/<image_basename>'`) and values are dictionaries containing image-specific annotations. Each image annotation dictionary **must** include:
42 | * `extrinsic` (or equivalent): 4x4 Transformation matrix (e.g., NumPy array or list of lists) from camera coordinates to the global/world coordinate system of the scene.
43 | ```python
44 | # Example:
45 | [[ -0.9897, 0.1085, 0.0927, 1.2120],
46 | [ -0.0330, 0.4577, -0.8884, 0.3075],
47 | [ -0.1388, -0.8824, -0.4494, 1.4804],
48 | [ 0. , 0. , 0. , 1. ]]
49 | ```
50 | * `intrinsic`: 4x4 Camera intrinsics matrix (e.g., NumPy array or list of lists).
51 | ```python
52 | # Example:
53 | [[ 1170.18, 0. , 647.75, 0. ],
54 | [ 0. , 1170.18, 483.75, 0. ],
55 | [ 0. , 0. , 1. , 0. ],
56 | [ 0. , 0. , 0. , 1. ]]
57 | ```
58 | * `img_path`: Absolute or relative path to the image file. `img_path` is joined with the `image_root` path from the config file.
59 | * Any other metadata required by `list_objects` (e.g., `visible_instance_ids` in `EmbodiedScanLoader`).
60 | 
61 | ### `list_objects(self, dataset_name, scene_name, image_ann)`
62 | 
63 | * **Purpose:** Identifies and processes object instances visible from a specific viewpoint (image). It organizes objects based on visibility and category, handles duplicate categories, and calculates scene bounds.
64 | * **Args:**
65 | * `dataset_name (str)`: The name of the dataset.
66 | * `scene_name (str)`: The unique identifier of the scene.
67 | * `image_ann (dict)`: The annotation dictionary for a single image, obtained from the output of `list_images`.
68 | * **Returns:** 69 | * `tuple`: A 5-element tuple containing: 70 | 1. `vis_objs (dict)`: Dictionary of *visible*, *non-environmental* objects. 71 | * Keys: Object category name. If multiple instances of the same category are visible, append an index (e.g., `'chair_0'`, `'chair_1'`). Environmental objects like 'wall', 'floor', 'ceiling', and generic 'object' categories should be excluded. 72 | * Values: Instance annotation dictionaries. Each dictionary should contain at least: 73 | * `category (str)`: The original object category label. 74 | * `name (str)`: The potentially indexed name used as the key in `vis_objs`. 75 | * `bbox_3d` (or equivalent, optional but recommended): The original 3D bounding box representation from your dataset (e.g., 9 DoF parameters: center, size, orientation). While the pipeline primarily uses the `obb` for calculations, this original `bbox_3d` is saved in the final annotations if provided. 76 | * `obb`: The Open3D `OrientedBoundingBox` representation (`open3d.geometry.OrientedBoundingBox`). **This is crucial for spatial relationship calculations.** Your `list_objects` implementation is responsible for creating this, often by converting from `bbox_3d` (see `EmbodiedScanLoader` line ~241 for an example using `_9dof_to_box`) or by generating it directly if your dataset provides OBBs. 77 | 2. `unique_vis_categories (set)`: A set of category names (strings) for objects that appear *exactly once* in the `vis_objs` dictionary (excluding environmental/generic categories). 78 | 3. `multi_vis_categories (set)`: A set of category names (strings) for objects that appear *multiple times* in the `vis_objs` dictionary (excluding environmental/generic categories). 79 | 4. `floor_bound (list)`: A list containing two `numpy.ndarray`s representing the minimum and maximum coordinates `[min_bound, max_bound]` that encompass the floor and all non-environmental objects. This is often derived from the combined OBBs of relevant objects. 80 | 5. `all_objs (dict)`: Dictionary of *all* non-environmental objects associated with the *scene* (not just the current view), keyed by their potentially indexed name (e.g., 'chair_0'). 81 | Used for occupancy map generation or other downstream tasks. The structure mirrors `vis_objs` but includes objects not necessarily visible in the current `image_ann`. 82 | Each object dictionary must contain at least `category`, `name`, and `obb`. Including `bbox_3d` is recommended if available. 83 | *Note: Depending on your dataset structure, you might populate this similarly to `vis_objs` based on `visible_instance_ids` or load all scene objects separately.* 84 | 85 | ## Configuration 86 | 87 | To use your custom data loader, update the `data_loading` section in your configuration file (e.g., `configs/example_config.yaml`): 88 | 89 | ```yaml 90 | data_loading: 91 | # ... other settings ... 92 | loader_class: path.to.your.module.YourCustomLoaderClassName # Update this line 93 | # Provide any custom keys your loader's __init__ needs 94 | your_custom_annotation_path: 95 | train: /path/to/your/train_annotations.pkl 96 | val: /path/to/your/val_annotations.pkl 97 | # ... other dataset-specific paths or parameters ... 98 | ``` 99 | 100 | * Set `loader_class` to the fully qualified Python path of your custom loader class. 101 | * Ensure any necessary configuration parameters (like annotation file paths) needed by your loader's `__init__` method are present in the `data_loading` section. 
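
For orientation, the sketch below shows the overall shape of a loader that satisfies this interface. It is a minimal, hypothetical skeleton, not part of the pipeline: the assumed on-disk layout (one JSON file per image under `annotation_dir`), the field names (`camera_annotations`, `objects`, `center`, `size`), the class name `MyDatasetLoader`, and the axis-aligned OBB construction are placeholder assumptions you would replace with your dataset's actual format (e.g., by converting a 9DoF `bbox_3d` with `_9dof_to_box`).

```python
import glob
import json
import os
from collections import defaultdict

import numpy as np
import open3d as o3d

from data_loader.base_loader import BaseLoader


class MyDatasetLoader(BaseLoader):
    """Hypothetical loader: one JSON annotation file per image."""

    def __init__(self, config):
        self.verbose = config.get("verbose", False)
        # {dataset_name: {scene_name: {image_id: raw_annotation_dict}}}
        self.data = defaultdict(lambda: defaultdict(dict))
        pattern = os.path.join(config["annotation_dir"], "**", "*.json")
        for path in glob.glob(pattern, recursive=True):
            with open(path, "r") as f:
                ann = json.load(f)
            self.data[ann["dataset"]][ann["scene_name"]][ann["image_name"]] = ann

    def list_scenes(self, dataset_list):
        for dataset_name in dataset_list:
            for scene_idx, scene_name in enumerate(self.data.get(dataset_name, {})):
                yield dataset_name, scene_idx, scene_name

    def list_images(self, dataset_name, scene_name):
        images = {}
        for image_id, ann in self.data[dataset_name][scene_name].items():
            images[image_id] = {
                "image_basename": os.path.basename(image_id),
                "img_path": os.path.join(scene_name, image_id),  # joined with image_root by the pipeline
                "extrinsic": np.array(ann["camera_annotations"]["extrinsic"]),  # camera -> world (4x4)
                "intrinsic": np.array(ann["camera_annotations"]["intrinsic"]),  # camera -> image (4x4)
                "objects": ann["objects"],  # carried along so list_objects can use it
            }
        return images

    def list_objects(self, dataset_name, scene_name, image_ann):
        vis_objs, all_obbs = {}, []
        counts = defaultdict(int)
        for obj in image_ann["objects"]:
            category = obj["category"]
            if category in ("wall", "floor", "ceiling", "object"):
                continue  # environmental / generic objects are excluded from vis_objs
            # Placeholder OBB: axis-aligned box built from center + size; replace with
            # your dataset's real orientation handling (e.g., _9dof_to_box for 9DoF boxes).
            obb = o3d.geometry.OrientedBoundingBox(
                np.array(obj["center"]), np.eye(3), np.array(obj["size"]))
            name = f"{category}_{counts[category]}"
            counts[category] += 1
            vis_objs[name] = {"category": category, "name": name, "obb": obb}
            all_obbs.append(obb)

        unique_vis_categories = {c for c, n in counts.items() if n == 1}
        multi_vis_categories = {c for c, n in counts.items() if n > 1}
        # Singleton categories are keyed by their bare category name, as the pipeline expects.
        for c in unique_vis_categories:
            inst = vis_objs.pop(f"{c}_0")
            inst["name"] = c
            vis_objs[c] = inst

        floor_bound = None
        if all_obbs:
            pts = np.vstack([np.asarray(b.get_box_points()) for b in all_obbs])
            floor_bound = [pts.min(axis=0), pts.max(axis=0)]

        # Annotations here are per-image, so all_objs simply mirrors vis_objs.
        return vis_objs, unique_vis_categories, multi_vis_categories, floor_bound, dict(vis_objs)
```

Note how duplicate categories receive `_0`, `_1`, ... suffixes while singletons keep their bare category name, matching the `vis_objs` contract described above.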
102 | 103 | ## Example 104 | 105 | Refer to `data_loader.embodiedscan_loader.EmbodiedScanLoader` for a concrete implementation example using datasets like ScanNet, Matterport3D, and 3RScan. 106 | 107 | Additionally, refer to `data_loader.example_loader.py` for a simpler implementation tailored specifically to the JSON annotation format found in the `example_data/` directory. This loader demonstrates how to handle the example annotations provided for testing the pipeline. 108 | 109 | ## Visualizing Your Loader Output 110 | 111 | To verify that your custom data loader is producing the correct outputs (specifically the object instances with their 3D bounding boxes and camera parameters), you can use the provided visualization script: `scripts/visualize_input.py`. 112 | 113 | **Purpose:** 114 | 115 | This script takes an image file and a corresponding intermediate annotation JSON file (similar to those in `example_data/annotations/`, representing the data your loader would prepare for a single image) as input. It reads the camera parameters (`extrinsic`, `intrinsic`) and the object information (specifically `bbox_3d`) from the JSON. It then projects the 3D bounding boxes onto the 2D image and displays the result. 116 | 117 | This helps you visually confirm: 118 | 119 | * Camera parameters (`extrinsic`, `intrinsic`) are correct. 120 | * Oriented object bounding boxes (derived from `bbox_3d`) align with the objects in the image. 121 | * The data format your loader prepares is being interpreted correctly before passing it to the main pipeline. 122 | 123 | **Important Note:** 124 | 125 | The provided visualization script, `scripts/visualize_input.py`, is designed to help debug your custom loader's output *before* running the full generation pipeline. It reads an intermediate JSON file (like those in `example_data/annotations/`) which represents the data your loader passes for a single image. 126 | 127 | Currently, this script expects the JSON to contain an `objects` array. For each object in this array, it specifically looks for a `bbox_3d` field containing a list with 9 DoF parameters (center, size, rotation) as its first element. It uses these parameters to generate an Open3D `OrientedBoundingBox` (`obb`) via the `_9dof_to_box` function for visualization. 128 | 129 | * **If your custom loader generates an intermediate JSON where the 3D bounding box information is stored differently (e.g., different format within `bbox_3d`, different field name, or only providing a pre-computed `obb`),** you will need to modify the `visualize_single_image` function in `scripts/visualize_input.py` (around line 195) to correctly parse your data and create the `o3d_box` for drawing. 130 | -------------------------------------------------------------------------------- /robospatial/data_loader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /robospatial/data_loader/base_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. 
Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | from abc import ABC, abstractmethod 12 | 13 | class BaseLoader(ABC): 14 | """ 15 | Abstract base class for dataset loaders. 16 | 17 | This class defines the interface that dataset loaders must implement 18 | to be compatible with the annotation generation pipeline. 19 | """ 20 | 21 | @abstractmethod 22 | def __init__(self, config): 23 | """ 24 | Initialize the dataset loader. 25 | 26 | Args: 27 | config (dict): Configuration dictionary containing dataset parameters. 28 | """ 29 | pass 30 | 31 | @abstractmethod 32 | def list_scenes(self, dataset_list): 33 | """ 34 | List all scenes available in the specified datasets. 35 | 36 | Args: 37 | dataset_list (list): List of dataset names to query. 38 | 39 | Returns: 40 | generator: Yields tuples of (dataset_name, scene_idx, scene_name). 41 | """ 42 | pass 43 | 44 | @abstractmethod 45 | def list_images(self, dataset_name, scene_name): 46 | """ 47 | List all images available in the specified scene. 48 | 49 | Args: 50 | dataset_name (str): Name of the dataset. 51 | scene_name (str): Name of the scene (e.g., 'scannet/scene00191_00'). 52 | 53 | Returns: 54 | dict: Dictionary of image annotations keyed by image name (e.g., '/'), 55 | each containing at minimum: 56 | - extrinsic: Camera to global transformation matrix. 57 | - intrinsic: Camera to image transformation matrix. 58 | - img_path: Path to the image file. 59 | - (If needed) axis_align_matrix: Matrix to align to world coordinates. 60 | """ 61 | pass 62 | 63 | @abstractmethod 64 | def list_objects(self, dataset_name, scene_name, image_ann): 65 | """ 66 | List all object instances visible in an image in the specified scene. 67 | 68 | Processes visible objects in an image and organizes them. 69 | 70 | Args: 71 | dataset_name (str): Name of the dataset. 72 | scene_name (str): Name of the scene. 73 | image_ann (dict): Image annotation dictionary from list_images. 74 | 75 | Returns: 76 | tuple: A 5-element tuple containing: 77 | - vis_objs (dict): Dictionary of visible, non-environmental objects. 78 | Keys are categories (indexed if duplicates exist, e.g., 'chair_0'). 79 | Values are instance dictionaries. 80 | - unique_vis_categories (set): Set of categories for objects appearing only once 81 | (excluding environmental/generic object categories). 82 | - multi_vis_categories (set): Set of categories for objects appearing multiple times 83 | (excluding environmental/generic object categories). 84 | - floor_bound (list): Min and max floor boundaries derived from object OBBs, 85 | as [min_bound, max_bound]. 86 | - all_objs (dict): Dictionary of all non-environmental objects (floor, wall, ceiling excluded), 87 | keyed by their potentially indexed name (e.g., 'chair_0'). 88 | Used for occupancy map calculation or other downstream tasks. 89 | """ 90 | pass -------------------------------------------------------------------------------- /robospatial/data_loader/embodiedscan_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. 
Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | # 10 | # This code is partially adapted from 11 | # https://github.com/OpenRobotLab/EmbodiedScan/blob/main/embodiedscan/explorer.py 12 | # under the Apache 2.0 license. 13 | 14 | 15 | import os 16 | import pickle 17 | from collections import defaultdict 18 | 19 | import numpy as np 20 | from data_loader.base_loader import BaseLoader 21 | 22 | from spatial_analysis.relationship_utils import _9dof_to_box 23 | 24 | 25 | class EmbodiedScanLoader(BaseLoader): 26 | """ 27 | Loader for EmbodiedScan datasets (3RScan, ScanNet, Matterport3D). 28 | 29 | Inherits from BaseLoader and implements its interface methods. 30 | """ 31 | 32 | def __init__(self, config): 33 | """ 34 | Initialize the EmbodiedScan loader. 35 | 36 | Args: 37 | config (dict): Data loader configuration dictionary. 38 | """ 39 | self.verbose = config["verbose"] 40 | 41 | if self.verbose: 42 | print('Loading EmbodiedScan...') 43 | 44 | # Get annotation key name from config 45 | annotation_key = config.get("annotation_key", "embodiedscan_ann") 46 | 47 | # Get splits from config 48 | splits = config.get("split") 49 | 50 | # Load annotation files based on splits 51 | ann_files = [] 52 | if annotation_key in config: 53 | for split in splits: 54 | if split in config[annotation_key]: 55 | ann_files.append(config[annotation_key][split]) 56 | 57 | self.ann_files = ann_files 58 | 59 | 60 | self.metainfo = None 61 | ## Load embodiedscan annotated scan datasets (scannet, matterport3d, 3rscan, arkitscenes) 62 | data_list = [] 63 | for file in self.ann_files: 64 | with open(file, 'rb') as f: 65 | data = pickle.load(f) 66 | 67 | if self.metainfo is None: 68 | self.metainfo = data['metainfo'] 69 | else: 70 | assert self.metainfo == data['metainfo'] 71 | 72 | data_list += data['data_list'] 73 | 74 | 75 | if isinstance(self.metainfo['categories'], list): 76 | self.classes = self.metainfo['categories'] 77 | self.id_to_index = {i: i for i in range(len(self.classes))} 78 | elif isinstance(self.metainfo['categories'], dict): 79 | self.classes = list(self.metainfo['categories'].keys()) 80 | self.id_to_index = { 81 | i: self.classes.index(classes) 82 | for classes, i in self.metainfo['categories'].items() 83 | } 84 | 85 | # Check if certain scan exists 86 | self.data = defaultdict(dict) 87 | for data in data_list: 88 | 89 | splits = data['sample_idx'].split('/') # sample_idx is scene name 90 | dataset_name = splits[0] 91 | 92 | data['dataset'] = dataset_name 93 | if dataset_name == 'scannet': 94 | region = splits[1] 95 | dirpath = os.path.join(config['image_root'], dataset_name, 'posed_images', 96 | region) 97 | elif dataset_name == '3rscan': 98 | region = splits[1] 99 | dirpath = os.path.join(config['image_root'], dataset_name, region) 100 | elif dataset_name == 'matterport3d': 101 | building, region = splits[1], splits[2] 102 | dirpath = os.path.join(config['image_root'], dataset_name, 103 | building) 104 | else: 105 | region = splits[1] 106 | dirpath = os.path.join(self.data_root[dataset_name], region) 107 | if os.path.exists(dirpath): 108 | # scene_name is the scene name in the dataset with dataset name prepended if it is not already present 109 | scene_name = data['sample_idx'] 110 | if not data['sample_idx'].startswith(dataset_name): 111 | scene_name = f"{dataset_name}/{data['sample_idx']}" 112 | self.data[dataset_name][scene_name] = 
data 113 | # self.dataset_stats = {} 114 | # for dataset, data in self.data.items(): 115 | # self.dataset_stats[dataset] = len(data) 116 | 117 | if self.verbose: 118 | for dataset_name, data in self.data.items(): 119 | print(f"Loaded {len(data)} scenes from {dataset_name}") 120 | print('Loading complete') 121 | 122 | def list_scenes(self, dataset_list): 123 | """ 124 | Implementation of BaseLoader.list_scenes for EmbodiedScan datasets. 125 | 126 | Args: 127 | dataset_list (list): List of dataset names to query. 128 | 129 | Returns: 130 | generator: Yields tuples of (dataset_name, scene_idx, scene_name). 131 | #NOTE scene_name is / 132 | """ 133 | for dataset_name in dataset_list: 134 | if dataset_name in self.data: 135 | for scene_idx, scene_name in enumerate(self.data[dataset_name]): 136 | yield dataset_name, scene_idx, scene_name 137 | 138 | def list_images(self, dataset_name, scene_name): 139 | """ 140 | Implementation of BaseLoader.list_images for EmbodiedScan datasets. 141 | 142 | Args: 143 | dataset_name (str): Name of the dataset. 144 | scene_name (str): Name of the scene. Example: scannet/scene00191_00 145 | 146 | Returns: 147 | list: List of image annotations. 148 | """ 149 | if scene_name not in self.data[dataset_name]: 150 | if self.verbose: 151 | print(f"Warning: Scene {scene_name} not found in annotations") 152 | return [] 153 | 154 | # Extract scene-wide annotations 155 | axis_align_matrix = np.array(self.data[dataset_name][scene_name]['axis_align_matrix']) # scene wide 156 | if "cam2img" in self.data[dataset_name][scene_name]: 157 | cam2img = np.array(self.data[dataset_name][scene_name]['cam2img']) # scene wide 158 | else: 159 | cam2img = np.array(self.data[dataset_name][scene_name]['images'][0]['cam2img']) # Some scenes have cam2img in images 160 | if "depth_cam2img" in self.data[dataset_name][scene_name]: 161 | depth_cam2img = np.array(self.data[dataset_name][scene_name]['depth_cam2img']) # scene wide 162 | else: 163 | depth_cam2img = [] 164 | 165 | # Add scene-wide annotations to each image annotation 166 | image_annotations = {} 167 | for image_ann in self.data[dataset_name][scene_name]['images']: 168 | # Add dataset and scene information to image annotation 169 | image_ann['dataset'] = dataset_name 170 | image_ann['scene'] = scene_name 171 | image_ann['image_basename'] = os.path.basename(image_ann["img_path"]) #NOTE Actual image filename 172 | image_ann['extrinsic'] = axis_align_matrix @ image_ann['cam2global'] # Camera to world 173 | image_ann['intrinsic'] = cam2img # Camera to image 174 | image_ann['cam2img'] = cam2img 175 | image_ann['axis_align_matrix'] = axis_align_matrix 176 | image_ann['cam2global'] = image_ann['cam2global'] 177 | image_ann['depth_cam2img'] = depth_cam2img 178 | image_ann['depth_path'] = image_ann['depth_path'] 179 | image_ann['visible_instance_ids'] = image_ann['visible_instance_ids'] 180 | image_name = scene_name + "/" + image_ann['image_basename'] 181 | image_annotations[image_name] = image_ann #NOTE Image name is / 182 | 183 | return image_annotations 184 | 185 | def list_objects(self, dataset_name, scene_name, image_ann): 186 | """ 187 | Implementation of BaseLoader.list_objects for EmbodiedScan datasets. 188 | 189 | Processes visible objects in an image and organizes them into multiple categories: 190 | - unique_vis_categories: Objects that appear exactly once (dictionary keyed by category) 191 | - multi_vis_categories: Objects that appear multiple times (dictionary keyed by category_0, category_1, etc.) 
192 | - vis_objs: All visible objects 193 | - all_objs: All non-environmental objects 194 | 195 | Also calculates the floor boundaries for the scene. 196 | 197 | Args: 198 | dataset_name (str): Name of the dataset. 199 | scene_name (str): Name of the scene. 200 | image_ann (dict): Image annotation dictionary. 201 | 202 | Returns: 203 | tuple: A 5-element tuple containing: 204 | - vis_objs (dict): Dictionary of visible, non-environmental objects. Keys are categories (indexed if duplicates exist, e.g., 'chair_0', 'chair_1'). Values are instance dictionaries. 205 | - unique_vis_categories (set): Set of categories for objects appearing only once (excluding environmental/object categories). 206 | - multi_vis_categories (set): Set of categories for objects appearing multiple times (excluding environmental/object categories). 207 | - floor_bound (list): Min and max floor boundaries as [min_bound, max_bound]. 208 | - all_objs (dict): Dictionary of all non-environmental objects (floor, wall, ceiling excluded), keyed by their potentially indexed name (e.g., 'chair_0'). Used for occupancy map calculation. 209 | """ 210 | # Get visible instance ids from image annotation 211 | #NOTE you can use different ways to get this. 212 | visible_instance_ids = image_ann['visible_instance_ids'] 213 | 214 | if scene_name not in self.data[dataset_name]: 215 | if self.verbose: 216 | print(f"Warning: Scene {scene_name} not found in annotations") 217 | return {}, set(), set(), [], {} # Return empty structures matching the new return type 218 | 219 | # First pass to count occurrences of each non-environmental category 220 | category_total_counts = defaultdict(int) 221 | for i in visible_instance_ids: 222 | instance = self.data[dataset_name][scene_name]['instances'][i] 223 | category = self.classes[self.id_to_index[instance['bbox_label_3d']]] 224 | # Exclude environmental or generic object categories from indexed naming 225 | if category not in ["wall", "ceiling", "floor", "object"]: 226 | category_total_counts[category] += 1 227 | 228 | # Process instances to create the unified vis_objs dictionary 229 | vis_objs = {} 230 | unique_vis_categories = set() 231 | multi_vis_categories = set() 232 | category_indices = defaultdict(int) # To track current index for duplicate categories 233 | env_objs = {} 234 | all_objs = {} # Still needed for floor bounding box calculation 235 | 236 | for i in visible_instance_ids: 237 | instance = self.data[dataset_name][scene_name]['instances'][i] 238 | category = self.classes[self.id_to_index[instance['bbox_label_3d']]] 239 | instance["category"] = category # Keep original label name in instance dict 240 | instance["obb"] = _9dof_to_box(instance["bbox_3d"]) # We use Open3D obb for all spatial relationships 241 | 242 | # Handle environmental objects (for floor calculation) 243 | if category in ["floor", "wall", "ceiling"]: 244 | env_objs[category] = instance 245 | 246 | # Parse categories to handle duplicates, assume there is only one floor, wall, and ceiling 247 | total_count = category_total_counts[category] 248 | if total_count == 1: 249 | obj_key = category 250 | instance["name"] = obj_key 251 | unique_vis_categories.add(category) 252 | else: 253 | current_index = category_indices[category] 254 | obj_key = f"{category}_{current_index}" 255 | instance["name"] = obj_key 256 | multi_vis_categories.add(category) 257 | category_indices[category] += 1 258 | 259 | # Add to vis_objs if it is not an environmental object 260 | if category not in ["wall", "ceiling", "floor", "object"]: 261 | 
vis_objs[obj_key] = instance 262 | 263 | # Get all objects for occupancy map calculation 264 | if category not in ["floor", "wall", "ceiling"]: 265 | all_objs[obj_key] = instance 266 | 267 | # Track all non-floor/wall/ceiling objects for OBB calculation - This part is now handled above 268 | # if category not in ["floor", "wall", "ceiling"]: 269 | # all_objs[i] = instance # Use original instance id as key 270 | 271 | all_obbs = [obj["obb"] for obj in all_objs.values()] 272 | 273 | # Create floor box representation automatically 274 | # Ensure floor object exists before accessing it 275 | if "floor" in env_objs: 276 | floor_obj = env_objs["floor"] 277 | floor_obb = _9dof_to_box(floor_obj["bbox_3d"]) 278 | min_bound = np.min([box.get_min_bound() for box in all_obbs + [floor_obb]], axis=0) 279 | max_bound = np.max([box.get_max_bound() for box in all_obbs + [floor_obb]], axis=0) 280 | floor_bound = [min_bound, max_bound] 281 | else: 282 | # Handle cases where there might not be a floor object detected/annotated 283 | if len(all_obbs) > 0: 284 | min_bound = np.min([box.get_min_bound() for box in all_obbs], axis=0) 285 | max_bound = np.max([box.get_max_bound() for box in all_obbs], axis=0) 286 | floor_bound = [min_bound, max_bound] # Use bounds from other objects if floor is missing 287 | else: 288 | floor_bound = None 289 | 290 | return vis_objs, unique_vis_categories, multi_vis_categories, floor_bound, all_objs 291 | -------------------------------------------------------------------------------- /robospatial/data_loader/example_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | import os 12 | import json 13 | from collections import defaultdict 14 | import glob 15 | 16 | import numpy as np 17 | from data_loader.base_loader import BaseLoader 18 | 19 | from spatial_analysis.relationship_utils import _9dof_to_box 20 | 21 | # Top-level function for defaultdict factory (picklable) 22 | def nested_dict_factory(): 23 | return defaultdict(dict) 24 | 25 | class ExampleLoader(BaseLoader): 26 | """ 27 | Loader for example data from JSON annotation files. 28 | 29 | Inherits from BaseLoader and implements its interface methods. 30 | """ 31 | 32 | def __init__(self, config): 33 | """ 34 | Initialize the Example loader. 35 | 36 | Args: 37 | config (dict): Data loader configuration dictionary. 38 | Expected keys: 39 | - annotation_dir (str): Path to the directory containing JSON annotation files. 40 | - verbose (bool): Optional verbosity flag. 
41 | """ 42 | self.verbose = config.get("verbose", False) 43 | self.config = config 44 | 45 | if self.verbose: 46 | print('Loading Example Dataset JSON annotations...') 47 | annotation_dir = config.get("annotation_dir") 48 | if not annotation_dir or not os.path.isdir(annotation_dir): 49 | raise ValueError("Config must contain a valid 'annotation_dir' pointing to the JSON annotations.") 50 | 51 | self.data = defaultdict(nested_dict_factory) # Use the named function 52 | # Recursively find all json files in the annotation_dir 53 | json_files = glob.glob(os.path.join(annotation_dir, '**', '*.json'), recursive=True) 54 | 55 | 56 | if not json_files: 57 | print(f"Warning: No JSON files found in {annotation_dir}") 58 | return 59 | 60 | for file_path in json_files: 61 | with open(file_path, 'r') as f: 62 | image_data = json.load(f) 63 | 64 | # Validate basic structure (can be expanded) 65 | required_keys = ['dataset', 'scene_name', 'image_name', 'objects', 'camera_annotations'] 66 | if not all(k in image_data for k in required_keys): 67 | if self.verbose: 68 | print(f"Warning: Skipping file {file_path} due to missing one or more required keys: {required_keys}.") 69 | continue 70 | if not all(k in image_data['camera_annotations'] for k in ['extrinsic', 'intrinsic']): 71 | if self.verbose: 72 | print(f"Warning: Skipping file {file_path} due to missing extrinsic/intrinsic in camera_annotations.") 73 | continue 74 | 75 | dataset_name = image_data['dataset'] 76 | # Use the scene name provided in the JSON 77 | scene_name = image_data['scene_name'] 78 | image_name = image_data['image_name'] 79 | 80 | # Store the loaded data, grouping by dataset and scene 81 | # self.data[dataset_name][scene_name] will hold a dict of {image_identifier: image_data} 82 | self.data[dataset_name][scene_name][image_name] = image_data 83 | # except Exception as e: 84 | # if self.verbose: 85 | # print(f"Warning: Skipping file {file_path} due to error: {e}") 86 | 87 | if self.verbose: 88 | total_scenes = 0 89 | total_images = 0 90 | for dataset_name, scenes in self.data.items(): 91 | print(f"Loaded {len(scenes)} scenes from {dataset_name}") 92 | total_scenes += len(scenes) 93 | for scene_name, images in scenes.items(): 94 | total_images += len(images) 95 | print(f'Loading complete. Total scenes: {total_scenes}, Total images: {total_images}') 96 | 97 | 98 | def list_scenes(self, dataset_list): 99 | """ 100 | Implementation of BaseLoader.list_scenes for Example Dataset dataset loaded from JSON. 101 | 102 | Args: 103 | dataset_list (list): List of dataset names to query. 104 | 105 | Returns: 106 | generator: Yields tuples of (dataset_name, scene_idx, scene_name). 107 | """ 108 | 109 | for dataset_name in dataset_list: 110 | if dataset_name in self.data: 111 | # Ensure consistent scene indexing if needed, otherwise enumerate keys 112 | # Using enumerate(self.data[dataset_name]) provides a simple index 113 | # The scene_name now directly comes from the JSON data structure keys 114 | for scene_idx, scene_name in enumerate(self.data[dataset_name]): 115 | yield dataset_name, scene_idx, scene_name 116 | 117 | def list_images(self, dataset_name, scene_name): 118 | """ 119 | Implementation of BaseLoader.list_images for Example Dataset loaded from JSON. 120 | 121 | Args: 122 | dataset_name (str): Name of the dataset. 123 | scene_name (str): Name of the scene. Example: gr00t/deduped_data_normal 124 | 125 | Returns: 126 | dict: Dictionary of image annotations, keyed by image_identifier. 
127 | """ 128 | if dataset_name not in self.data or scene_name not in self.data[dataset_name]: 129 | if self.verbose: 130 | print(f"Warning: Scene {scene_name} not found in dataset {dataset_name}") 131 | return {} 132 | 133 | scene_images_data = self.data[dataset_name][scene_name] 134 | image_annotations = {} 135 | 136 | for image_identifier, image_data in scene_images_data.items(): 137 | image_ann = {} 138 | # Basic info from the loaded JSON data 139 | image_ann['dataset'] = dataset_name 140 | image_ann['scene'] = scene_name 141 | image_ann['image_identifier'] = image_identifier 142 | image_ann['img_path'] = dataset_name + "/" + scene_name + "/" + image_identifier #NOTE image_path is combined with image_root from config to create the absolute image path 143 | image_ann['image_basename'] = os.path.basename(image_data['image_name']) 144 | image_ann['image_size'] = image_data.get('image_size') # Optional 145 | 146 | # Camera parameters from the loaded JSON data 147 | cam_ann = image_data.get('camera_annotations', {}) # Presence checked in __init__ 148 | image_ann['extrinsic'] = np.array(cam_ann.get('extrinsic')) 149 | image_ann['intrinsic'] = np.array(cam_ann.get('intrinsic')) 150 | image_ann['objects'] = image_data.get('objects') 151 | 152 | # visible_instance_ids is no longer the primary way to get objects for list_objects 153 | # image_ann['visible_instance_ids'] = image_data.get('visible_instance_ids', []) 154 | 155 | # Use image_identifier as the key 156 | image_annotations[image_identifier] = image_ann 157 | 158 | 159 | 160 | return image_annotations 161 | 162 | def list_objects(self, dataset_name, scene_name, image_ann): 163 | """ 164 | Implementation of BaseLoader.list_objects for Example Dataset from JSON. 165 | 166 | Processes objects listed in the 'object_grounding' field of the image annotation. 167 | 168 | Args: 169 | dataset_name (str): Name of the dataset (provides context). 170 | scene_name (str): Name of the scene (provides context). 171 | image_ann (dict): Image annotation dictionary (from list_images). 172 | 173 | Returns: 174 | tuple: A 5-element tuple containing: 175 | - vis_objs (dict): Dictionary of visible, non-environmental objects. Keys are categories (indexed if duplicates exist, e.g., 'chair_0', 'chair_1'). Values are instance dictionaries with 'obb' and 'name'. 176 | - unique_vis_categories (set): Set of original category names for objects appearing only once (excluding environmental/object categories). 177 | - multi_vis_categories (set): Set of original category names for objects appearing multiple times (excluding environmental/object categories). 178 | - floor_bound (list): Min and max floor boundaries [min_bound, max_bound] calculated from *this image's* non-environmental objects and floor (if present). Can be None. 179 | - all_objs (dict): Dictionary of all non-environmental objects in this image, keyed by their potentially indexed name (same as vis_objs in this implementation). 
180 | """ 181 | image_identifier = image_ann.get('image_identifier', 'unknown_image') # For logging 182 | objects = image_ann.get('objects', []) 183 | 184 | if not objects: 185 | # Return empty structures matching the expected return type 186 | return {}, set(), set(), None, {} 187 | 188 | # First pass to count occurrences of each non-environmental category in this image 189 | category_total_counts = defaultdict(int) 190 | parsed_objects = [] # Store parsed objects temporarily 191 | for obj_data in objects: 192 | category = obj_data.get("Name") #NOTE Name is the category name for gr00t 193 | bbox_3d_list = obj_data.get("bbox_3d") 194 | 195 | # Basic validation 196 | if not category: 197 | if self.verbose: print(f"Warning: Skipping object with missing Name in image {image_identifier}") 198 | continue 199 | if not bbox_3d_list or not isinstance(bbox_3d_list, list) or not bbox_3d_list: 200 | if self.verbose: print(f"Warning: Skipping object '{category}' with missing or invalid bbox_3d in image {image_identifier}") 201 | continue 202 | 203 | 204 | # Assuming bbox_3d is 9 DOF 205 | bbox_3d_params = bbox_3d_list[0] 206 | # Validate that we indeed have 9 parameters 207 | if not isinstance(bbox_3d_params, list) or len(bbox_3d_params) != 9: 208 | if self.verbose: print(f"Warning: Skipping object '{category}' due to invalid bbox_3d params (expected 9DOF, got {len(bbox_3d_params)}) in image {image_identifier}. Params: {bbox_3d_params}") 209 | continue 210 | 211 | # Removed padding logic as bbox_3d is guaranteed 9DOF. 212 | 213 | instance = { 214 | "name": category, #NOTE name == category for gr00t 215 | "category": category, # Original name from JSON 216 | "bbox_3d": bbox_3d_params, # Store the 9DoF params 217 | } 218 | 219 | try: 220 | # Calculate OBB immediately 221 | instance["obb"] = _9dof_to_box(bbox_3d_params) 222 | parsed_objects.append(instance) # Add to list for further processing 223 | # Count non-environmental categories (case-insensitive check) 224 | if category.lower() not in ["wall", "ceiling", "floor", "object"]: 225 | category_total_counts[category] += 1 # Count using original name 226 | except ValueError as e: 227 | # Catch potential errors from _9dof_to_box if params are still invalid 228 | if self.verbose: 229 | print(f"Error converting bbox for object '{category}' in image {image_identifier}: {e}. 
Params: {bbox_3d_params}") 230 | except Exception as e: # Catch other potential exceptions 231 | if self.verbose: 232 | print(f"Unexpected error processing object '{category}' in image {image_identifier}: {e}") 233 | 234 | 235 | # Process parsed instances to create the final dictionaries 236 | vis_objs = {} 237 | unique_vis_categories = set() 238 | multi_vis_categories = set() 239 | category_indices = defaultdict(int) # To track current index for duplicate categories 240 | env_objs = {} 241 | all_objs_for_bounds = [] # Collect OBBs for floor calculation 242 | 243 | for instance in parsed_objects: 244 | category = instance["category"] # Original name 245 | 246 | # Handle environmental objects (for floor calculation) 247 | # Use lower case for comparison to identify type 248 | cat_lower = category.lower() 249 | if cat_lower in ["floor", "wall", "ceiling"]: 250 | # Assuming only one of each environmental object per image annotation 251 | # Store with lowercase key for easy lookup 252 | env_objs[cat_lower] = instance 253 | if cat_lower == "floor": 254 | # Add floor OBB for bound calculation if it exists 255 | if "obb" in instance: 256 | all_objs_for_bounds.append(instance["obb"]) 257 | continue # Skip adding env objects to vis_objs/multi/unique sets 258 | 259 | # Process non-environmental objects (already counted) 260 | if category in category_total_counts: 261 | total_count = category_total_counts[category] 262 | if total_count == 1: 263 | obj_key = category # Use original name as key 264 | instance["name"] = obj_key # Store potentially indexed name 265 | unique_vis_categories.add(category) # Store original category name 266 | else: 267 | # Use original category name for indexing 268 | current_index = category_indices[category] 269 | obj_key = f"{category}_{current_index}" 270 | instance["name"] = obj_key # Store potentially indexed name 271 | multi_vis_categories.add(category) # Store original category name 272 | category_indices[category] += 1 273 | 274 | vis_objs[obj_key] = instance 275 | # Add OBB for floor calculation if it exists 276 | if "obb" in instance: 277 | all_objs_for_bounds.append(instance["obb"]) 278 | 279 | 280 | # Calculate floor bounds based on OBBs from this image (non-env + floor if present) 281 | floor_bound = None 282 | if all_objs_for_bounds: 283 | try: 284 | # Combine points from all relevant OBBs 285 | all_points = np.vstack([box.get_box_points() for box in all_objs_for_bounds]) 286 | min_bound = np.min(all_points, axis=0) 287 | max_bound = np.max(all_points, axis=0) 288 | 289 | # The bounds derived this way represent the extent of the objects considered. 290 | floor_bound = [min_bound.tolist(), max_bound.tolist()] 291 | except Exception as e: 292 | if self.verbose: 293 | print(f"Error calculating floor bounds for image {image_identifier}: {e}") 294 | floor_bound = None # Indicate failure 295 | 296 | 297 | # `all_objs` in the original return signature was intended for occupancy map calculation, 298 | # usually containing all non-environmental objects in the scene. 299 | # In this JSON-based, image-level loading, it effectively becomes the same as `vis_objs` 300 | # as we only process objects visible/annotated in the current image JSON. 
301 | all_objs = vis_objs.copy() 302 | 303 | return vis_objs, unique_vis_categories, multi_vis_categories, floor_bound, all_objs 304 | -------------------------------------------------------------------------------- /robospatial/run_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | """Main entry script for generating spatial annotations from 3D scan datasets. 11 | 12 | This script orchestrates the annotation generation process based on a YAML 13 | configuration file. It handles: 14 | - Parsing command-line arguments for configuration path, scene filtering (range, specific scene, specific image), and dry runs. 15 | - Loading the dataset configuration and initializing the appropriate data loader. 16 | - Iterating through specified datasets and scenes. 17 | - Calling the `generate_and_save_annotations` function from `annotation_generator.py` 18 | to perform the core annotation generation for each scene. 19 | - Tracking generation progress across scenes and saving it periodically. 20 | - Aggregating and saving final statistics (overall and per-dataset). 21 | 22 | Supported annotation types (configured via YAML) include: 23 | - Object Grounding 24 | - Spatial Context 25 | - Spatial Configuration 26 | - Spatial Compatibility 27 | 28 | Usage: 29 | python robospatial/run_generation.py --config path/to/your/config.yaml [options] 30 | """ 31 | 32 | # run_generation.py 33 | # Main entry script to generate annotations from 3D scan datasets 34 | # Supports flexible configuration of annotation types (object grounding, spatial context, spatial configuration, spatial compatibility) 35 | 36 | import argparse 37 | import yaml 38 | import os 39 | import json 40 | import importlib 41 | import concurrent.futures 42 | from tqdm import tqdm 43 | from collections import defaultdict 44 | 45 | # Import the new generator function 46 | from annotation_generator import generate_and_save_annotations 47 | 48 | def parse_args(): 49 | parser = argparse.ArgumentParser(description="Parse configuration file for annotation generation.") 50 | parser.add_argument('--config', type=str, default="configs/base_local.yaml", help='Path to the configuration YAML file.') 51 | parser.add_argument('--range', type=int, nargs=2, help='Range of scene indices to process (inclusive start, exclusive end).') 52 | parser.add_argument('--scene', type=str, help='Specific scene name to process (e.g., "scannet/scene0190_00").') # New argument for specific scene 53 | parser.add_argument('--image', type=str, help='Specific image basename (e.g., "rgb_00010") to process within the specified scene for debugging.') # New argument for specific image 54 | parser.add_argument('--num_workers', type=int, help='Number of worker threads to use for processing scenes.') 55 | parser.add_argument('--dry_run', action='store_true', help='Enable dry run mode (processes only the first 5 images per scene).') 56 | args = parser.parse_args() 57 | 58 | if args.range: 59 | start, end = args.range 60 | # Make range inclusive by adding 1 to end for 
Python range behavior 61 | args.range = range(start, end + 1) # Store as a range object 62 | if start > end: 63 | parser.error("Start of range must not be greater than end.") 64 | 65 | return args 66 | 67 | def load_config(config_file): 68 | with open(config_file, 'r') as file: 69 | config = yaml.safe_load(file) 70 | # Add default output suffix if not present 71 | if "output_suffix" not in config.get("data_generation", {}): 72 | if "data_generation" not in config: 73 | config["data_generation"] = {} 74 | config["data_generation"]["output_suffix"] = ".annotations.json" 75 | return config 76 | 77 | def create_loader(config): 78 | """Create a loader instance based on configuration.""" 79 | # Default to EmbodiedScanLoader if not specified 80 | loader_class_path = config.get("loader_class") 81 | if loader_class_path is None: 82 | raise ValueError("loader_class not specified in config[data_loading]") 83 | 84 | module_name, class_name = loader_class_path.rsplit('.', 1) 85 | module = importlib.import_module(module_name) 86 | loader_class = getattr(module, class_name) 87 | 88 | # Create the loader instance with config data 89 | loader = loader_class(config) 90 | 91 | return loader 92 | 93 | # Define the scene processing function outside run or nested inside run 94 | def process_scene(args_tuple): 95 | loader, dataset_name, scene_idx, scene_name, config, specific_image, dry_run, num_workers = args_tuple # Unpack num_workers 96 | 97 | # Only print if not multi-threaded 98 | if num_workers <= 1: 99 | tqdm.write(f"\nProcessing {dataset_name} scene: {scene_name} ({scene_idx+1})") # Note: total count isn't readily available here 100 | 101 | try: # Add try/except block for robustness in threads 102 | images_ann_dict_full = loader.list_images(dataset_name, scene_name) 103 | 104 | # Filter images if a specific image name is provided 105 | if specific_image: 106 | if specific_image in images_ann_dict_full: 107 | images_ann_dict = {specific_image: images_ann_dict_full[specific_image]} 108 | if num_workers <= 1: 109 | tqdm.write(f" - Specific image requested: Processing only '{specific_image}'.") 110 | else: 111 | if num_workers <= 1: 112 | tqdm.write(f" - Warning: Specific image '{specific_image}' not found in scene '{scene_name}'. Skipping scene.") 113 | return dataset_name, scene_name, None # Return None for stats if skipped 114 | 115 | # Limit images if dry_run is enabled AND no specific image was requested 116 | elif dry_run and len(images_ann_dict_full) > 5: 117 | images_ann_dict = dict(list(images_ann_dict_full.items())[:5]) 118 | if num_workers <= 1: 119 | tqdm.write(f" - Dry run enabled: Processing only the first 5 images out of {len(images_ann_dict_full)}.") 120 | else: 121 | images_ann_dict = images_ann_dict_full 122 | 123 | # Print total only if not dry run or specific image, and not multi-threaded 124 | if num_workers <= 1: 125 | if not dry_run and not specific_image: 126 | tqdm.write(f" - Listed {len(images_ann_dict_full)} total images") 127 | elif dry_run and not specific_image: # Also print total if dry run 128 | tqdm.write(f" - Listed {len(images_ann_dict_full)} total images") 129 | 130 | if not images_ann_dict: 131 | if num_workers <= 1: 132 | tqdm.write(f"Warning: No images found for scene {scene_name}. 
Skipping.") 133 | return dataset_name, scene_name, None # Return None for stats if skipped 134 | 135 | scene_stats = generate_and_save_annotations( 136 | loader, 137 | dataset_name, 138 | scene_name, 139 | images_ann_dict, 140 | config, 141 | num_workers 142 | ) 143 | if num_workers <= 1: 144 | tqdm.write(f"Finished scene {scene_name}. Stats: {dict(scene_stats)}") 145 | return dataset_name, scene_name, scene_stats 146 | except Exception as e: 147 | # Always write errors, regardless of num_workers 148 | tqdm.write(f"Error processing scene {scene_name}: {e}") 149 | # Optionally re-raise or log the full traceback 150 | import traceback 151 | # Use tqdm.write for traceback as well 152 | tqdm.write(f"Traceback for error in scene {scene_name}:\n{traceback.format_exc()}") 153 | return dataset_name, scene_name, None # Indicate failure 154 | 155 | 156 | def run(config, specific_scene=None, dry_run=False, specific_image=None, num_workers_arg=None): # Added num_workers_arg 157 | # Normal execution path 158 | print("Starting annotation generation with configuration:") 159 | print(yaml.dump(config, indent=2)) 160 | 161 | # --- Determine Number of Workers --- 162 | num_workers = num_workers_arg # CLI argument takes precedence 163 | if num_workers is None: 164 | num_workers = config.get("data_generation", {}).get("num_workers") 165 | if num_workers is None: 166 | num_workers = 1 # Default to 1 167 | print(f"Number of workers not specified, defaulting to {num_workers}") 168 | print(f"Using {num_workers} worker threads.") 169 | 170 | 171 | # --- Dataset Loading --- 172 | dataset_list = config["data_loading"]["datasets"] 173 | 174 | if not dataset_list: 175 | print("Error: No valid datasets specified. Please include valid datasets in the config.") 176 | return 177 | 178 | # Create the loader instance 179 | loader = create_loader(config["data_loading"]) 180 | print(f"Loader initialized.") 181 | 182 | 183 | # --- Statistics Initialization --- 184 | total_stats = defaultdict(lambda: defaultdict(int)) 185 | overall_stats = defaultdict(int) 186 | generated_something = False 187 | progress_file_path = config["data_generation"].get("progress_file", "generation_progress.json") 188 | completed_scenes_map = defaultdict(list) 189 | 190 | # Load progress if file exists 191 | if os.path.exists(progress_file_path): 192 | with open(progress_file_path, 'r') as f: 193 | loaded_progress = json.load(f) 194 | if isinstance(loaded_progress, dict): 195 | completed_scenes_map.update(loaded_progress) 196 | else: 197 | print(f"Warning: Progress file {progress_file_path} has unexpected format. 
Starting fresh.") 198 | 199 | # --- Prepare Scene List --- 200 | print("\n--- Preparing Scene List ---") 201 | scene_list_all = list(loader.list_scenes(dataset_list)) 202 | print(f"Found {len(scene_list_all)} total scenes across specified datasets.") 203 | 204 | scenes_to_process_info = [] 205 | skipped_count = 0 206 | for idx, (dataset_name, scene_idx, scene_name) in enumerate(scene_list_all): 207 | # Apply filters 208 | if specific_scene and scene_name != specific_scene: 209 | skipped_count += 1 210 | continue 211 | if config.get("range") and idx not in config["range"]: 212 | skipped_count += 1 213 | continue 214 | if scene_name in completed_scenes_map.get(dataset_name, []): 215 | skipped_count += 1 216 | continue 217 | 218 | # If not skipped, add to list 219 | scenes_to_process_info.append((loader, dataset_name, idx, scene_name, config, specific_image, dry_run, num_workers)) 220 | 221 | if skipped_count > 0: 222 | print(f"Skipped {skipped_count} scenes (due to filters: specific_scene, range, or already completed).") 223 | print(f"Processing {len(scenes_to_process_info)} scenes.") 224 | 225 | if not scenes_to_process_info: 226 | print("No scenes left to process based on filters.") 227 | # Skip the rest if nothing to process 228 | print("\n--- Generation Complete ---") 229 | print("No new annotations were generated.") 230 | return 231 | 232 | 233 | # --- Generation Loop (Parallelized) --- 234 | print("\n--- Processing Scenes ---") 235 | generated_something = False # Reset here, check results later 236 | 237 | with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor: 238 | # Submit tasks and store Future objects 239 | futures = [executor.submit(process_scene, args) for args in scenes_to_process_info] 240 | 241 | # Process results as they complete 242 | for future in tqdm(concurrent.futures.as_completed(futures), total=len(scenes_to_process_info), desc="Processing Scenes"): 243 | try: 244 | result = future.result() # Get the result from the completed future 245 | # --- Aggregation logic using 'result' --- 246 | if result: # Check if result is not None (i.e., processing didn't fail or skip) 247 | dataset_name, scene_name, scene_stats = result 248 | if scene_stats is not None: # Check if stats were successfully generated 249 | # Aggregate stats 250 | for key, value in scene_stats.items(): 251 | if isinstance(value, (int, float)): 252 | total_stats[dataset_name][key] += value 253 | overall_stats[key] += value 254 | generated_something = True 255 | completed_scenes_map[dataset_name].append(scene_name) 256 | # else: scene processing might have skipped internally or failed 257 | # --- End of aggregation logic --- 258 | except Exception as exc: 259 | # Handle exceptions raised within the process_scene function 260 | # Find the arguments that caused the exception for better logging (optional, requires mapping futures back to args) 261 | tqdm.write(f'\nError: A scene generation task generated an exception: {exc}') 262 | # Optionally log the full traceback 263 | # import traceback 264 | # tqdm.write(f"Traceback:\n{traceback.format_exc()}") 265 | 266 | # --- Final Statistics and Cleanup --- 267 | print("\n--- Generation Complete ---") 268 | if generated_something: 269 | print("\n--- Overall Statistics ---") 270 | print(json.dumps(overall_stats, indent=4)) 271 | 272 | print("\n--- Per-Dataset Statistics ---") 273 | print(json.dumps(total_stats, indent=4)) 274 | 275 | with open(progress_file_path, 'w') as f: 276 | json.dump(completed_scenes_map, f, indent=4) 277 | print(f"Final 
progress saved to {progress_file_path}") 278 | 279 | stats_file_path = config["data_generation"].get("stats_file", "generation_stats.json") 280 | final_stats_data = { 281 | "overall_stats": overall_stats, 282 | "per_dataset_stats": total_stats 283 | } 284 | with open(stats_file_path, 'w') as f: 285 | json.dump(final_stats_data, f, indent=4) 286 | print(f"Final statistics saved to {stats_file_path}") 287 | else: 288 | print("No new annotations were generated.") 289 | 290 | return 291 | 292 | if __name__ == "__main__": 293 | args = parse_args() 294 | config = load_config(args.config) 295 | if args.range: 296 | config["range"] = args.range 297 | # Pass num_workers from args 298 | run(config, specific_scene=args.scene, dry_run=args.dry_run, specific_image=args.image, num_workers_arg=args.num_workers) -------------------------------------------------------------------------------- /robospatial/spatial_analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RoboSpatial/59b091d7694a724d3a46bb2b636d1bc49b899eb9/robospatial/spatial_analysis/__init__.py -------------------------------------------------------------------------------- /robospatial/spatial_analysis/compatibility/compatibility.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | import numpy as np 11 | 12 | from spatial_analysis.context.context import get_point_in_space_relative_to_object 13 | from spatial_analysis.topdown_map import get_empty_space 14 | from spatial_analysis.compatibility.compatibility_utils import can_fit_at_point 15 | 16 | DEBUG_FIT=False 17 | 18 | def can_fit_object_a_in_relation_to_b( 19 | floor_bound, 20 | environment_boxes, 21 | obj_a, # Dictionary of the object being placed (contains name, obb, etc.) 22 | obj_b, # Dictionary of the reference object (contains name, obb, etc.) 23 | have_face, 24 | extrinsic, 25 | intrinsic, 26 | image_size, 27 | image_path, 28 | grid_resolution, 29 | num_samples, 30 | individual_occupancy_maps, 31 | env_occupancy_map, 32 | threshold=0.5, 33 | min_distance=0.2, 34 | buffer_ratio=0.3 35 | ): 36 | """Checks if object A (the target object) can be placed in empty space relative to object B (the reference object). 37 | 38 | The function operates in several steps: 39 | 1. Calculates the available empty space in the environment using `get_empty_space`, generating a 2D grid representation. 40 | 2. Determines a dynamic threshold for empty space sampling based on the sizes of object A and B. 41 | 3. Samples a set of potential placement points (`num_samples`) around object B in various directions 42 | (infront, behind, left, right) across different reference frames (object-centric, camera-centric, world-centric) 43 | using `get_point_in_space_relative_to_object`. This sampling considers precomputed occupancy maps. 44 | 4. 
For each sampled 3D point corresponding to a specific frame and direction, it checks if object A's 45 | oriented bounding box (OBB) can be placed at that point without colliding with the environment or object B using `can_fit_at_point`. This check utilizes the 2D occupancy grid. 46 | 5. Aggregates the results, indicating whether *at least one* valid placement position was found for 47 | each frame/direction combination. 48 | 49 | Args: 50 | floor_bound (list): A list defining the bounding box of the walkable floor area, used for empty space calculation. 51 | environment_boxes (list): A list of open3d.geometry.OrientedBoundingBox objects representing static obstacles 52 | in the environment. 53 | obj_a (dict): Dictionary representing object A (the object being placed), must contain 'obb' (open3d OBB) 54 | and 'name' (str). 55 | obj_b (dict): Dictionary representing the reference object B, must contain 'obb' (open3d OBB) and 'name' (str). 56 | have_face (bool): Indicates if object B has a defined 'face' or primary orientation, affecting object-centric sampling. 57 | extrinsic (np.ndarray): 4x4 camera extrinsic matrix (camera-to-world transformation). 58 | intrinsic (np.ndarray): 3x3 or 4x4 camera intrinsic matrix (only the top-left 3x3 portion is used if 4x4). 59 | image_size (tuple): Size of the image (width, height), used for camera-centric calculations. 60 | image_path (str): Path to the associated scene image, primarily used for debugging visualizations within called functions. 61 | grid_resolution (float): The resolution (e.g., meters per grid cell) of the 2D occupancy grid used for collision checks. 62 | num_samples (int): The number of candidate points to sample around object B for potential placement checks. 63 | individual_occupancy_maps (dict): Precomputed 2D occupancy numpy arrays for each individual dynamic object (including A and B). 64 | Keys are object names, values are the occupancy map arrays. 65 | env_occupancy_map (np.ndarray): Precomputed combined 2D occupancy numpy array representing the static environment. 66 | threshold (float, optional): Base distance threshold used in empty space calculation. This is dynamically adjusted based on 67 | object sizes. Defaults to 0.5. 68 | 69 | Returns: 70 | dict: A nested dictionary indicating whether a valid placement was found for object A relative to object B 71 | for each combination of reference frame and direction. 72 | Example: `{'objectcentric': {'infront': True, 'behind': False, ...}, 'cameracentric': {...}, ...}` 73 | `True` means at least one valid point was found for that relative position. 74 | """ 75 | empty_areas, grid, occupied = get_empty_space(floor_bound, environment_boxes, grid_resolution) 76 | 77 | box_a = obj_a['obb'] # Extract OBB from obj_a dictionary 78 | obj_a_name = obj_a['name'] # Extract name from obj_a dictionary 79 | box_b = obj_b['obb'] # Extract OBB from obj_b dictionary 80 | obj_b_name = obj_b['name'] # Extract name from obj_b dictionary 81 | 82 | # Adjust the sampling distance threshold based on the average horizontal size of the two objects. 83 | max_extent_a = np.max(box_a.extent[:2]) # Max extent in world x-y plane 84 | max_extent_b = np.max(box_b.extent[:2]) # Max extent in world x-y plane 85 | dynamic_threshold = threshold + (max_extent_a + max_extent_b) / 2 86 | 87 | # Sample potential placement points around obj_b using the precomputed occupancy information. 
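    # `get_point_in_space_relative_to_object` (defined in context.py) is expected to return,
    # as its third value, a nested dict keyed first by reference frame ('objectcentric',
    # 'cameracentric', 'worldcentric') and then by direction ('infront', 'behind', 'left',
    # 'right'), where each leaf is a list of candidate 3D points; the fitting loop further
    # below iterates over exactly this structure.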
88 | _, _, visible_points_3d_all, _ = get_point_in_space_relative_to_object( 89 | floor_bound, environment_boxes, 90 | ref_obj=obj_b, 91 | extrinsic=extrinsic, intrinsic=intrinsic, image_size=image_size, 92 | have_face=have_face, num_samples=num_samples, 93 | individual_occupancy_maps=individual_occupancy_maps, 94 | env_occupancy_map=env_occupancy_map, 95 | threshold=dynamic_threshold, grid_resolution=grid_resolution, 96 | image_path=image_path, 97 | empty_areas=empty_areas, grid=grid, occupied=occupied, 98 | ) 99 | 100 | results = {} 101 | 102 | # Check placement possibility for each defined reference frame and direction 103 | frames_to_check = ['objectcentric', 'cameracentric', 'worldcentric'] 104 | directions_to_check = ['infront', 'behind', 'left', 'right'] 105 | 106 | for frame in frames_to_check: 107 | results[frame] = {} 108 | for direction in directions_to_check: 109 | # Retrieve the list of 3D candidate points sampled for this specific frame/direction 110 | points_in_direction_3d = visible_points_3d_all.get(frame, {}).get(direction, []) 111 | 112 | if not points_in_direction_3d: 113 | results[frame][direction] = False # No candidate points found for this relative position 114 | continue 115 | 116 | # Check if obj_a fits at any of the sampled points without collision 117 | can_fit = False 118 | for point_3d in points_in_direction_3d: 119 | # Check collision using the 2D grid, environment OBBs, and the reference object B's OBB 120 | if can_fit_at_point(grid, box_a, occupied, point_3d[:2], environment_boxes, box_b, min_distance=min_distance, buffer_ratio=buffer_ratio, box_name=obj_a_name, box_b_name=obj_b_name, frame=frame, direction=direction, DEBUG_FIT=DEBUG_FIT): 121 | can_fit = True 122 | break # Found a valid spot for this direction 123 | 124 | results[frame][direction] = can_fit 125 | 126 | if DEBUG_FIT: 127 | # Print final fitting results if debug flag is enabled 128 | print(f"Can fit results for {obj_a_name} relative to {obj_b_name}:") 129 | print(results) 130 | 131 | return results 132 | 133 | 134 | def can_fit_on_top(top_box, base_box): 135 | """Determines if the top OrientedBoundingBox can fit horizontally on top of the base OrientedBoundingBox. 136 | 137 | Args: 138 | top_box (o3d.geometry.OrientedBoundingBox): The bounding box of the object to be placed on top. 139 | base_box (o3d.geometry.OrientedBoundingBox): The bounding box of the base object. 140 | 141 | Returns: 142 | bool: True if the top box's x and y extents are less than or equal to the base box's, 143 | False otherwise. 144 | """ 145 | base_extent = base_box.extent 146 | top_extent = top_box.extent 147 | 148 | # Simple check: Top object's horizontal dimensions must be <= base object's dimensions. 149 | # Assumes alignment of the boxes' principal axes with the world axes for this check. 150 | if (top_extent[0] <= base_extent[0] and 151 | top_extent[1] <= base_extent[1]): 152 | result = True 153 | else: 154 | result = False 155 | return result -------------------------------------------------------------------------------- /robospatial/spatial_analysis/compatibility/compatibility_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. 
Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | import numpy as np 12 | from scipy.spatial import ConvexHull 13 | import matplotlib.path as mpath 14 | import open3d as o3d 15 | 16 | from spatial_analysis.relationship_utils import project_to_floor 17 | 18 | 19 | 20 | def is_hull_within_bounds(hull_vertices, bounds): 21 | """Check if a convex hull is completely within given bounds. 22 | 23 | Args: 24 | hull_vertices (np.ndarray): Vertices of the convex hull 25 | bounds (tuple): (x_min, x_max, y_min, y_max) 26 | 27 | Returns: 28 | bool: True if hull is within bounds 29 | """ 30 | x_min, x_max, y_min, y_max = bounds 31 | hull_min = np.min(hull_vertices, axis=0) 32 | hull_max = np.max(hull_vertices, axis=0) 33 | return (hull_min[0] >= x_min and hull_max[0] <= x_max and 34 | hull_min[1] >= y_min and hull_max[1] <= y_max) 35 | 36 | 37 | 38 | 39 | 40 | def can_fit_at_point(grid, box, occupied, point, environment_boxes, box_b, min_distance=0.02, buffer_ratio=0.3, box_name="Object A", box_b_name="Object B", frame=None, direction=None, DEBUG_FIT=False): 41 | """Check if an object can fit at a given point, trying different rotations. 42 | 43 | Args: 44 | grid: The floor grid (meshgrid output: grid[0]=X, grid[1]=Y) 45 | box: The object's bounding box 46 | occupied: Occupancy grid (boolean, same shape as grid[0]) 47 | point: The point (x, y) to try placing at 48 | environment_boxes: List of all environment boxes 49 | box_b: The reference object box 50 | min_distance: Minimum distance required between objects (in meters) 51 | buffer_ratio: Ratio of buffer zone to object size 52 | box_name: Name of the object being placed 53 | box_b_name: Name of the reference object 54 | frame (str, optional): The reference frame for visualization context. 55 | direction (str, optional): The direction for visualization context. 56 | """ 57 | x_coords, y_coords = grid 58 | h, w = x_coords.shape 59 | x_min, y_min = x_coords.min(), y_coords.min() 60 | x_max, y_max = x_coords.max(), y_coords.max() 61 | bounds = (x_min, x_max, y_min, y_max) 62 | 63 | grid_res_x = 0 64 | grid_res_y = 0 65 | if w > 1: 66 | grid_res_x = x_coords[0, 1] - x_coords[0, 0] 67 | if h > 1: 68 | grid_res_y = y_coords[1, 0] - y_coords[0, 0] 69 | 70 | if grid_res_x > 1e-6 and grid_res_y > 1e-6: 71 | grid_resolution = min(grid_res_x, grid_res_y) 72 | elif grid_res_x > 1e-6: 73 | grid_resolution = grid_res_x 74 | elif grid_res_y > 1e-6: 75 | grid_resolution = grid_res_y 76 | else: 77 | grid_resolution = 0 # Indicate failure to determine resolution 78 | print(f"Warning: Could not determine valid grid resolution for buffer calculation.") 79 | 80 | buffer_size_in_cells = 0 # Default buffer size in grid cells 81 | if grid_resolution > 1e-6: # Check if grid resolution is valid 82 | buffer_size_in_cells = int(np.ceil(min_distance / grid_resolution)) 83 | else: 84 | print(f"Warning: Invalid grid resolution ({grid_resolution}). 
Setting buffer_size_in_cells to 0.") 85 | 86 | try: 87 | original_projected_points = project_to_floor(box) 88 | # Calculate the 2D center of the projected points 89 | box_center_2d = np.mean(original_projected_points, axis=0) 90 | # Calculate the initial hull 91 | initial_hull = ConvexHull(original_projected_points) 92 | # Store hull vertices relative to the projected center 93 | relative_hull_vertices = original_projected_points[initial_hull.vertices] - box_center_2d 94 | except Exception as e: 95 | return False # Cannot proceed if base hull fails 96 | 97 | rotations = [0, np.pi/4, np.pi/2] 98 | 99 | for rotation_idx, rotation in enumerate(rotations): 100 | # --- Transform the precomputed hull --- 101 | cos_theta = np.cos(rotation) 102 | sin_theta = np.sin(rotation) 103 | rotation_matrix_2d = np.array([ 104 | [cos_theta, -sin_theta], 105 | [sin_theta, cos_theta] 106 | ]) 107 | # Apply rotation to relative vertices 108 | rotated_vertices = relative_hull_vertices @ rotation_matrix_2d.T # Note the transpose for point rotation 109 | # Apply translation (move center to the target 'point') 110 | hull_vertices = rotated_vertices + point[:2] # Use only x,y from point 111 | 112 | # --- Check 1: Is hull within grid bounds? --- 113 | if not is_hull_within_bounds(hull_vertices, bounds): 114 | if DEBUG_FIT: 115 | # Need a 3D rotation matrix for visualization 116 | rotation_matrix_3d = np.array([ 117 | [cos_theta, -sin_theta, 0], 118 | [sin_theta, cos_theta, 0], 119 | [0, 0, 1] 120 | ]) 121 | visualize_placement(grid, box, occupied, point, environment_boxes, box_b, 122 | rotation_matrix_3d, f"Rotation {np.degrees(rotation):.0f}° - Out of Bounds", 123 | bounds, box_name, box_b_name, frame, direction) 124 | continue 125 | 126 | # --- Check 2: Hull Occupancy (Vectorized) --- 127 | path = mpath.Path(hull_vertices) 128 | 129 | # Find the bounding box of the hull to minimize grid points checked 130 | hull_min_x, hull_min_y = np.min(hull_vertices, axis=0) 131 | hull_max_x, hull_max_y = np.max(hull_vertices, axis=0) 132 | 133 | # Convert hull bounds to grid indices (clamp to grid dimensions) 134 | min_ix = np.clip(int(np.floor((hull_min_x - x_min) / grid_res_x)) if grid_res_x > 1e-6 else 0, 0, w - 1) 135 | max_ix = np.clip(int(np.ceil((hull_max_x - x_min) / grid_res_x)) if grid_res_x > 1e-6 else w - 1, 0, w - 1) 136 | min_iy = np.clip(int(np.floor((hull_min_y - y_min) / grid_res_y)) if grid_res_y > 1e-6 else 0, 0, h - 1) 137 | max_iy = np.clip(int(np.ceil((hull_max_y - y_min) / grid_res_y)) if grid_res_y > 1e-6 else h - 1, 0, h - 1) 138 | 139 | # Create subset of grid points and indices within the hull's bounding box 140 | sub_x, sub_y = np.meshgrid(np.arange(min_ix, max_ix + 1), np.arange(min_iy, max_iy + 1)) 141 | sub_points_x = x_coords[sub_y.ravel(), sub_x.ravel()] 142 | sub_points_y = y_coords[sub_y.ravel(), sub_x.ravel()] 143 | sub_grid_points = np.vstack((sub_points_x, sub_points_y)).T 144 | sub_grid_indices = (sub_y.ravel(), sub_x.ravel()) # Indices into the original 'occupied' grid 145 | 146 | if sub_grid_points.size == 0: 147 | if DEBUG_FIT: 148 | # Need a 3D rotation matrix for visualization 149 | rotation_matrix_3d = np.array([ 150 | [cos_theta, -sin_theta, 0], 151 | [sin_theta, cos_theta, 0], 152 | [0, 0, 1] 153 | ]) 154 | visualize_placement(grid, box, occupied, point, environment_boxes, box_b, 155 | rotation_matrix_3d, f"Rotation {np.degrees(rotation):.0f}° - No Grid Points", 156 | bounds, box_name, box_b_name, frame, direction) 157 | continue # Hull is likely too small or outside grid center 
area 158 | 159 | # Check which subset points are inside the actual hull polygon 160 | inside_hull_mask_flat = path.contains_points(sub_grid_points) 161 | 162 | # Get the indices of the grid cells that are inside the hull 163 | hull_indices_flat = tuple(idx[inside_hull_mask_flat] for idx in sub_grid_indices) 164 | 165 | # Check occupancy for points *inside* the hull 166 | occupied_inside_hull = occupied[hull_indices_flat] 167 | num_occupied_inside = np.sum(occupied_inside_hull) 168 | total_cells_inside = len(occupied_inside_hull) 169 | 170 | # --- Check 3: Buffer Zone Occupancy --- 171 | buffer_occupied_cells = 0 172 | checked_buffer_indices = set() # Keep track of checked indices to avoid double counting 173 | 174 | # Only check buffer if hull itself is not significantly occupied 175 | if total_cells_inside == 0 or num_occupied_inside / total_cells_inside <= 0.0: # Do not allow overlap 176 | # Iterate through the grid cells *inside* the hull 177 | for iy_hull, ix_hull in zip(*hull_indices_flat): 178 | # Check the neighborhood (buffer) around this hull cell 179 | for dy in range(-buffer_size_in_cells, buffer_size_in_cells + 1): 180 | for dx in range(-buffer_size_in_cells, buffer_size_in_cells + 1): 181 | if dx == 0 and dy == 0: 182 | continue # Skip the cell itself 183 | 184 | ix_buffer = ix_hull + dx 185 | iy_buffer = iy_hull + dy 186 | 187 | # Check if the buffer cell index is valid and hasn't been checked 188 | if 0 <= ix_buffer < w and 0 <= iy_buffer < h: 189 | buffer_idx_tuple = (iy_buffer, ix_buffer) 190 | if buffer_idx_tuple not in checked_buffer_indices: 191 | # Check if this buffer cell is outside the hull but occupied 192 | buffer_point = (x_coords[iy_buffer, ix_buffer], y_coords[iy_buffer, ix_buffer]) 193 | if not path.contains_point(buffer_point) and occupied[iy_buffer, ix_buffer]: 194 | buffer_occupied_cells += 1 195 | checked_buffer_indices.add(buffer_idx_tuple) 196 | 197 | # --- Final Decision for this rotation --- 198 | # Conditions: 199 | # 1. Hull must have some cells under it. 200 | # 2. Significant overlap inside the hull is not allowed. 201 | # 3. Significant occupation in the buffer zone is not allowed. 202 | fit_this_rotation = False 203 | overlap_ratio = 0 204 | current_buffer_ratio = 0 205 | 206 | if total_cells_inside > 0: 207 | overlap_ratio = num_occupied_inside / total_cells_inside 208 | current_buffer_ratio = buffer_occupied_cells / total_cells_inside # Compare buffer count to hull size 209 | 210 | # Do not allow overlap and limited buffer occupation (e.g. 
< 50%) 211 | # These thresholds might need tuning based on grid resolution and object sizes 212 | if overlap_ratio <= 0.0 and current_buffer_ratio < buffer_ratio: 213 | fit_this_rotation = True 214 | 215 | # Debug visualization 216 | if DEBUG_FIT: 217 | result_text = "SUCCESS" if fit_this_rotation else "FAILED" 218 | # Need a 3D rotation matrix for visualization 219 | rotation_matrix_3d = np.array([ 220 | [cos_theta, -sin_theta, 0], 221 | [sin_theta, cos_theta, 0], 222 | [0, 0, 1] 223 | ]) 224 | visualize_placement(grid, box, occupied, point, environment_boxes, box_b, 225 | rotation_matrix_3d, f"Rotation {np.degrees(rotation):.0f}° - {result_text} (Overlap: {overlap_ratio:.2f}, Buffer: {current_buffer_ratio:.2f})", 226 | bounds, box_name, box_b_name, frame, direction) 227 | 228 | if fit_this_rotation: 229 | return True # Found a valid rotation 230 | 231 | return False 232 | 233 | 234 | 235 | def visualize_placement(grid, box, occupied, point, environment_boxes, box_b, rotation=None, step_name="", bounds=None, box_name="Object A", box_b_name="Object B", frame=None, direction=None): 236 | """Visualize the placement attempt in a top-down view. 237 | 238 | Args: 239 | grid: The floor grid 240 | box: The object's bounding box 241 | occupied: Occupancy grid 242 | point: The point to try placing at 243 | environment_boxes: List of all environment boxes 244 | box_b: The reference object box 245 | rotation: Current rotation being tried 246 | step_name: Name of the current step 247 | bounds: Grid bounds (x_min, x_max, y_min, y_max) 248 | box_name: Name of the object being placed 249 | box_b_name: Name of the reference object 250 | frame (str, optional): The reference frame (e.g., 'objectcentric'). 251 | direction (str, optional): The direction within the frame (e.g., 'infront'). 
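        Note:
            This helper is only invoked when the DEBUG_FIT flag (defined in compatibility.py and
            passed through can_fit_at_point) is enabled. It imports matplotlib locally and opens a
            blocking window via plt.show(), so it is intended for interactive debugging rather than
            batch generation runs.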
252 | """ 253 | import matplotlib.pyplot as plt 254 | from matplotlib.patches import Rectangle, Polygon 255 | 256 | # Create figure 257 | plt.figure(figsize=(10, 10)) 258 | 259 | # Create color-coded grid 260 | color_grid = np.zeros((*occupied.shape, 3), dtype=np.uint8) 261 | color_grid[occupied] = [255, 0, 0] # Red for occupied 262 | color_grid[~occupied] = [0, 255, 0] # Green for empty 263 | 264 | # Plot the grid 265 | plt.imshow(color_grid, 266 | extent=[grid[0].min(), grid[0].max(), grid[1].min(), grid[1].max()], 267 | origin='lower') 268 | 269 | # Plot environment boxes 270 | for env_box in environment_boxes: 271 | if env_box == box_b: 272 | color = 'magenta' # Reference box 273 | label = box_b_name 274 | else: 275 | color = 'red' 276 | label = 'Environment' 277 | corners = project_to_floor(env_box) 278 | plt.fill(corners[:, 0], corners[:, 1], color, alpha=0.3, label=label) 279 | plt.plot(corners[:, 0], corners[:, 1], color) 280 | 281 | # Plot the current placement attempt 282 | if rotation is not None: 283 | translated_box = o3d.geometry.OrientedBoundingBox() 284 | translated_box.center = np.array([point[0], point[1], box.extent[2] / 2]) 285 | translated_box.R = rotation 286 | translated_box.extent = box.extent 287 | 288 | corners = project_to_floor(translated_box) 289 | plt.fill(corners[:, 0], corners[:, 1], 'blue', alpha=0.3, label=box_name) 290 | plt.plot(corners[:, 0], corners[:, 1], 'blue') 291 | 292 | # Plot the point 293 | plt.scatter(point[0], point[1], c='yellow', s=100, marker='*', label='Target Point') 294 | 295 | # Add title and labels 296 | title = f"Placement Attempt: {step_name}" 297 | if frame and direction: 298 | title += f" ({frame} - {direction} relative to {box_b_name})" 299 | else: 300 | title += f" (relative to {box_b_name})" 301 | plt.title(title) 302 | plt.xlabel('X') 303 | plt.ylabel('Y') 304 | 305 | # Show grid bounds if provided 306 | if bounds: 307 | x_min, x_max, y_min, y_max = bounds 308 | plt.axvline(x=x_min, color='k', linestyle='--') 309 | plt.axvline(x=x_max, color='k', linestyle='--') 310 | plt.axhline(y=y_min, color='k', linestyle='--') 311 | plt.axhline(y=y_max, color='k', linestyle='--') 312 | 313 | plt.grid(True) 314 | plt.legend() 315 | plt.show() -------------------------------------------------------------------------------- /robospatial/spatial_analysis/configuration/configuration.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | import numpy as np 12 | 13 | from spatial_analysis.configuration.configuration_utils import get_object_metrics 14 | 15 | 16 | def check_spatial_configuration_relationships(obj1, obj2, extrinsic, intrinsic, image_size, individual_occupancy_maps, strictness='lenient'): 17 | """Calculates the spatial relationship between two 3D objects (obj1 relative to obj2) from multiple perspectives. 18 | 19 | This function determines relationships like 'left', 'right', 'in front of', 'behind', 'above', 20 | 'below', and 'overlapping' based on the objects' oriented bounding boxes (OBBs). 
It leverages 21 | pre-calculated object metrics (pixel projections, depth ranges, world coordinates) obtained 22 | via `get_object_metrics`. 23 | 24 | Relationships are assessed in three reference frames: 25 | 1. **camera_centric**: Based on the objects' 2D projections onto the image plane (for left/right) 26 | and their depth relative to the camera (for in front/behind). Vertical relationships (above/below) 27 | in this frame are determined using world Z coordinates. 28 | 2. **world_centric**: Uses the same logic as camera_centric for horizontal and depth relationships 29 | in this implementation, but explicitly defines overlap based on world Z-axis separation. 30 | 3. **object_centric**: Determines relationships based on the relative position of obj1's center 31 | with respect to obj2's orientation (forward and right vectors). It uses the Separating Axis 32 | Theorem (SAT) to check for OBB overlap in 3D, influencing directional judgments. Vertical 33 | relationships (above/below) use the world Z coordinates. 34 | 35 | The `strictness` parameter controls the calculation logic for camera_centric and world_centric: 36 | - **'strict'**: Requires clear separation based on the minimum and maximum bounds of the objects' 37 | metrics (pixel coordinates, depth values, world Z). Objects are considered overlapping if their 38 | bounds intersect, even slightly. This mode is sensitive to full visibility. 39 | - **'lenient'**: Uses object centers (for pixel projection), average visible depth, and a combination 40 | of average/min/max world Z coordinates. It's more robust to partial occlusions or near overlaps. It may still use strict bounds checks in ambiguous cases (e.g., very close average depths). 41 | 42 | Args: 43 | obj1 (dict): First object, containing at least 'name' (str) and 'obb' (open3d.geometry.OrientedBoundingBox). 44 | obj2 (dict): Second object, with the same structure as obj1. 45 | extrinsic (np.ndarray): 4x4 extrinsic matrix representing the camera-to-world transformation. 46 | intrinsic (np.ndarray): 3x3 or 4x4 camera intrinsic matrix. Only the top-left 3x3 portion is used. 47 | image_size (tuple): A tuple representing the image size as (width, height). 48 | individual_occupancy_maps (dict): A dictionary containing precomputed individual occupancy maps for objects, 49 | used by `get_object_metrics`. Keys should match object names. 50 | strictness (str, optional): The mode for relationship checks ('strict' or 'lenient'). Defaults to 'lenient'. 51 | 52 | Returns: 53 | dict: A dictionary containing boolean spatial relationships for each reference frame ('camera_centric', 54 | 'world_centric', 'object_centric'). Each frame contains keys: 'left', 'right', 'infront', 55 | 'behind', 'above', 'below', 'overlapping'. 
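        For reference, the returned dictionary has the shape sketched below (the boolean values
        are illustrative only; actual results depend on the scene geometry and the chosen
        strictness):

            {
                "camera_centric": {"left": True, "right": False, "infront": False,
                                   "behind": True, "above": False, "below": False,
                                   "overlapping": False},
                "world_centric": {"left": True, "right": False, "infront": False,
                                  "behind": True, "above": False, "below": False,
                                  "overlapping": False},
                "object_centric": {"left": False, "right": True, "infront": False,
                                   "behind": False, "above": False, "below": False,
                                   "overlapping": False},
            }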
56 | """ 57 | EPSILON = 1e-6 # Tolerance for floating point comparisons 58 | 59 | 60 | box1_metrics = get_object_metrics(obj1, extrinsic, intrinsic, image_size, individual_occupancy_maps) 61 | box2_metrics = get_object_metrics(obj2, extrinsic, intrinsic, image_size, individual_occupancy_maps) 62 | 63 | # Clamp negative world Z values to 0 based on the assumption that nothing is below ground 64 | for metrics in [box1_metrics, box2_metrics]: 65 | if metrics.get('world_z_min') is not None and metrics['world_z_min'] < 0: 66 | metrics['world_z_min'] = 0.0 67 | if metrics.get('world_z_max') is not None and metrics['world_z_max'] < 0: 68 | # If max is negative, min must also be negative (or None), so both are clamped to 0 69 | metrics['world_z_max'] = 0.0 70 | if metrics.get('world_z_avg') is not None and metrics['world_z_avg'] < 0: 71 | metrics['world_z_avg'] = 0.0 72 | 73 | # Nested Function to Determine Relationships Based on Metrics 74 | def determine_camera_and_world_relationship(m1, m2): 75 | """Determines strict and lenient relationships based on pre-calculated metrics.""" 76 | 77 | # Strict Relationship Calculation 78 | def get_relationship_strict(): 79 | horizontal_relation = "overlapping" 80 | # Check visibility before comparing strict horizontal bounds 81 | if m1.get('max_px') is not None and m2.get('min_px') is not None and m1['max_px'] < m2['min_px'] - EPSILON: 82 | horizontal_relation = "left" 83 | elif m1.get('min_px') is not None and m2.get('max_px') is not None and m1['min_px'] > m2['max_px'] + EPSILON: 84 | horizontal_relation = "right" 85 | 86 | depth_relation = "overlapping" 87 | # Check visibility before comparing strict depth bounds 88 | if m1.get('max_depth') is not None and m2.get('min_depth') is not None and m1['max_depth'] < m2['min_depth'] - EPSILON: 89 | depth_relation = "in front of" 90 | elif m1.get('min_depth') is not None and m2.get('max_depth') is not None and m1['min_depth'] > m2['max_depth'] + EPSILON: 91 | depth_relation = "behind" 92 | 93 | vertical_world_relation = "overlapping" 94 | # World Z is always calculated 95 | if m1.get('world_z_max') is not None and m2.get('world_z_min') is not None and m1['world_z_max'] < m2['world_z_min'] - EPSILON: 96 | vertical_world_relation = "below" 97 | elif m1.get('world_z_min') is not None and m2.get('world_z_max') is not None and m1['world_z_min'] > m2['world_z_max'] + EPSILON: 98 | vertical_world_relation = "above" 99 | 100 | return { 101 | "left": horizontal_relation == "left", 102 | "right": horizontal_relation == "right", 103 | "infront": depth_relation == "in front of", 104 | "behind": depth_relation == "behind", 105 | "cam_overlapping": horizontal_relation == "overlapping", # Overlap based on pixel projection 106 | "above": vertical_world_relation == "above", 107 | "below": vertical_world_relation == "below", 108 | "world_overlapping": vertical_world_relation == "overlapping" # Overlap based on world Z 109 | } 110 | 111 | # Lenient Relationship Calculation 112 | def get_relationship_lenient(): 113 | # Check if centers are comparable 114 | centers_comparable = m1.get('center_px') is not None and m2.get('center_px') is not None 115 | 116 | horizontal_relation = "overlapping" 117 | if centers_comparable: 118 | # Check containment using pixel bounds 119 | box1_center_in_box2_px = (m1.get('min_px') is not None and # Check all required bounds exist 120 | m1.get('max_px') is not None and 121 | m1.get('min_py') is not None and 122 | m1.get('max_py') is not None and 123 | m2.get('min_px') is not None and 124 | 
m2.get('max_px') is not None and 125 | m2.get('min_py') is not None and 126 | m2.get('max_py') is not None and 127 | m2['min_px'] <= m1['center_px'][0] <= m2['max_px'] and 128 | m2['min_py'] <= m1['center_px'][1] <= m2['max_py']) 129 | 130 | box2_center_in_box1_px = (m1.get('min_px') is not None and # Check all required bounds exist 131 | m1.get('max_px') is not None and 132 | m1.get('min_py') is not None and 133 | m1.get('max_py') is not None and 134 | m2.get('min_px') is not None and 135 | m2.get('max_px') is not None and 136 | m2.get('min_py') is not None and 137 | m2.get('max_py') is not None and 138 | m1['min_px'] <= m2['center_px'][0] <= m1['max_px'] and 139 | m1['min_py'] <= m2['center_px'][1] <= m1['max_py']) 140 | 141 | if m1['center_px'][0] < m2['center_px'][0] - EPSILON and not box2_center_in_box1_px: 142 | horizontal_relation = "left" 143 | elif m1['center_px'][0] > m2['center_px'][0] + EPSILON and not box1_center_in_box2_px: 144 | horizontal_relation = "right" 145 | 146 | # Lenient depth check based on average visible depth 147 | depth_relation = "overlapping" 148 | avg_depths_comparable = m1.get('visible_depth_avg') is not None and m2.get('visible_depth_avg') is not None 149 | if avg_depths_comparable: 150 | # Optional: Add hybrid check using strict bounds if averages are close 151 | if abs(m1['visible_depth_avg'] - m2['visible_depth_avg']) < EPSILON: 152 | # Averages are close, fall back to strict check only if strictly separated 153 | if m1.get('max_depth') is not None and m2.get('min_depth') is not None and m1['max_depth'] < m2['min_depth'] - EPSILON: 154 | depth_relation = "in front of" 155 | elif m1.get('min_depth') is not None and m2.get('max_depth') is not None and m1['min_depth'] > m2['max_depth'] + EPSILON: 156 | depth_relation = "behind" 157 | # else: stays overlapping 158 | elif m1['visible_depth_avg'] < m2['visible_depth_avg']: 159 | depth_relation = "in front of" 160 | elif m1['visible_depth_avg'] > m2['visible_depth_avg']: 161 | depth_relation = "behind" 162 | 163 | # Lenient vertical check based on world Z, prioritizing separation 164 | vertical_world_relation = "overlapping" # Default to overlapping 165 | 166 | # Check if metrics are available for comparison 167 | m1_z_max = m1.get('world_z_max') 168 | m1_z_min = m1.get('world_z_min') 169 | m1_z_avg = m1.get('world_z_avg') 170 | m2_z_max = m2.get('world_z_max') 171 | m2_z_min = m2.get('world_z_min') 172 | m2_z_avg = m2.get('world_z_avg') 173 | 174 | all_metrics_exist = all(v is not None for v in [m1_z_max, m1_z_min, m1_z_avg, m2_z_max, m2_z_min, m2_z_avg]) 175 | 176 | if all_metrics_exist: 177 | # 1. Check strict separation first 178 | if m1_z_max < m2_z_min - EPSILON: 179 | vertical_world_relation = "below" 180 | elif m1_z_min > m2_z_max + EPSILON: 181 | vertical_world_relation = "above" 182 | # 2. 
If strictly overlapping, check lenient conditions (avg vs max/min) 183 | else: 184 | # Check if average of 1 is above max of 2 185 | if m1_z_avg > m2_z_max + EPSILON: 186 | vertical_world_relation = "above" 187 | # Check if average of 2 is above max of 1 (meaning 1 is below 2) 188 | elif m2_z_avg > m1_z_max + EPSILON: 189 | vertical_world_relation = "below" 190 | # Otherwise, they remain overlapping 191 | 192 | return { 193 | "left": horizontal_relation == "left", 194 | "right": horizontal_relation == "right", 195 | "infront": depth_relation == "in front of", 196 | "behind": depth_relation == "behind", 197 | "cam_overlapping": horizontal_relation == "overlapping", # Overlap based on pixel projection centroid logic 198 | "above": vertical_world_relation == "above", 199 | "below": vertical_world_relation == "below", 200 | "world_overlapping": vertical_world_relation == "overlapping" # Overlap based on world Z average logic 201 | } 202 | 203 | # Return both results 204 | return { 205 | "strict": get_relationship_strict(), 206 | "lenient": get_relationship_lenient() 207 | } 208 | 209 | # Calculate Camera/World Relationships 210 | # Object visibility/comparability is handled by None checks within determine_camera_and_world_relationship. 211 | cam_world_relations = determine_camera_and_world_relationship(box1_metrics, box2_metrics) 212 | 213 | # Calculate Object Centric Relationships 214 | def get_object_centric_relationship(obj1, obj2): 215 | def get_facing_direction(box): 216 | rotation_matrix = np.asarray(box.R) 217 | forward_direction = rotation_matrix[:, 0] 218 | return forward_direction 219 | 220 | def check_overlap(box1, box2): 221 | box1_points = np.asarray(box1.get_box_points()) 222 | box2_points = np.asarray(box2.get_box_points()) 223 | 224 | def project_points(points, axis): 225 | return np.dot(points, axis) 226 | 227 | def overlap_on_axis(box1_proj, box2_proj): 228 | box1_min, box1_max = np.min(box1_proj), np.max(box1_proj) 229 | box2_min, box2_max = np.min(box2_proj), np.max(box2_proj) 230 | return not (box1_max < box2_min or box2_max < box1_min) 231 | 232 | # Use OBB axes for Separating Axis Theorem (more robust than just world axes diffs) 233 | axes = [] 234 | axes.extend(box1.R.T) # Box 1 axes 235 | axes.extend(box2.R.T) # Box 2 axes 236 | # Add cross products of axes (simplified common implementation) 237 | # Calculate the 9 potential separating axes derived from cross products 238 | # of each edge direction of box1 with each edge direction of box2. 239 | # Since OBB axes are parallel to edge directions, we cross the axes vectors. 240 | for i in range(3): 241 | for j in range(3): 242 | # Cross product of box1 axis i and box2 axis j 243 | cross_product = np.cross(box1.R[:, i], box2.R[:, j]) 244 | if np.linalg.norm(cross_product) > EPSILON: # Avoid zero vectors 245 | axes.append(cross_product / np.linalg.norm(cross_product)) 246 | 247 | for axis in axes: 248 | if not overlap_on_axis(project_points(box1_points, axis), 249 | project_points(box2_points, axis)): 250 | # Separating axis found, no overlap 251 | return False 252 | 253 | # If no separating axis is found by SAT, the OBBs are considered overlapping. 
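            # Up to 15 candidate axes are tested above (3 face normals from each box plus the
            # 9 pairwise edge-direction cross products); near-zero cross products are skipped
            # because parallel edge directions cannot yield a new separating axis.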
254 | return True 255 | 256 | # Simplified overlap check based on SAT result for object-centric logic 257 | overlap_obj_centric = check_overlap(obj1["obb"], obj2["obb"]) 258 | 259 | obj2_forward = get_facing_direction(obj2["obb"]) 260 | obj1_center = np.mean(np.asarray(obj1["obb"].get_box_points()), axis=0) 261 | obj2_center = np.asarray(obj2["obb"].get_center()) 262 | 263 | relative_position = obj1_center - obj2_center 264 | dot_product = np.dot(relative_position, obj2_forward) 265 | # Use Up vector (assuming Z is up) for cross product for left/right relative to forward 266 | world_up = np.array([0, 0, 1]) 267 | obj2_right = np.cross(obj2_forward, world_up) 268 | # Ensure obj2_right is normalized if needed, though only sign matters for dot product 269 | if np.linalg.norm(obj2_right) > EPSILON: 270 | obj2_right /= np.linalg.norm(obj2_right) 271 | else: 272 | # Handle cases where forward is aligned with up (e.g. object pointing straight up/down) 273 | # Use world X or Y as a fallback 'right' ? This needs careful thought. 274 | # For now, if right vector is invalid, horizontal relation is ambiguous/overlapping 275 | obj2_right = None 276 | 277 | horizontal_dot = np.dot(relative_position, obj2_right) if obj2_right is not None else 0 278 | 279 | 280 | # Object-centric depth uses dot product with forward vector 281 | depth_relation = "overlapping" 282 | if not overlap_obj_centric: # Only assign directional if not overlapping 283 | if dot_product > EPSILON: 284 | depth_relation = "in front of" 285 | elif dot_product < -EPSILON: 286 | depth_relation = "behind" 287 | # else: stays overlapping (or on the plane) 288 | 289 | # Object-centric horizontal uses dot product with right vector 290 | horizontal_relation = "overlapping" 291 | if not overlap_obj_centric and obj2_right is not None: # Only assign if not overlapping and right vector is valid 292 | if horizontal_dot > EPSILON: # Project onto right vector: positive is "right" 293 | horizontal_relation = "right" 294 | elif horizontal_dot < -EPSILON: # Negative is "left" 295 | horizontal_relation = "left" 296 | # else: stays overlapping (or directly in front/behind) 297 | 298 | return horizontal_relation, depth_relation 299 | 300 | obj_centric_horizontal, obj_centric_depth = get_object_centric_relationship(obj1, obj2) 301 | 302 | # Select strict or lenient results based on parameter 303 | chosen_relation = cam_world_relations.get(strictness, cam_world_relations['lenient']) # Default to lenient 304 | 305 | # Assemble Final Result 306 | relationships = { 307 | "camera_centric": { 308 | "left": chosen_relation["left"], 309 | "right": chosen_relation["right"], 310 | "infront": chosen_relation["infront"], 311 | "behind": chosen_relation["behind"], 312 | # Use world vertical for camera frame above/below 313 | "above": chosen_relation["above"], 314 | "below": chosen_relation["below"], 315 | "overlapping": chosen_relation["cam_overlapping"], 316 | }, 317 | "world_centric": { 318 | # World uses same planar relationships as camera in this implementation 319 | "left": chosen_relation["left"], 320 | "right": chosen_relation["right"], 321 | "infront": chosen_relation["infront"], 322 | "behind": chosen_relation["behind"], 323 | "above": chosen_relation["above"], 324 | "below": chosen_relation["below"], 325 | "overlapping": chosen_relation["world_overlapping"] # Use Z-based overlap here 326 | }, 327 | "object_centric": { 328 | "left": obj_centric_horizontal == "left", 329 | "right": obj_centric_horizontal == "right", 330 | "infront": obj_centric_depth == "in front 
of", 331 | "behind": obj_centric_depth == "behind", 332 | # Use world vertical for object frame above/below 333 | "above": chosen_relation["above"], 334 | "below": chosen_relation["below"], 335 | # Object centric overlap combines horizontal and depth states 336 | "overlapping": obj_centric_horizontal == "overlapping" or obj_centric_depth == "overlapping" 337 | } 338 | } 339 | 340 | return relationships -------------------------------------------------------------------------------- /robospatial/spatial_analysis/configuration/configuration_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | import numpy as np 11 | 12 | from spatial_analysis.relationship_utils import get_min_max_visible_depth_for_box 13 | 14 | 15 | # --- Calculate Metrics for Both Objects --- 16 | def get_object_metrics(obj, extrinsic, intrinsic, image_size, individual_occupancy_maps): 17 | metrics = {} 18 | obb = obj.get('obb') # Use .get for safety 19 | obj_name = obj.get('name') # Get object name 20 | 21 | if obb is None or obj_name is None: # Check if retrieval failed 22 | print(f"Warning: Skipping metrics calculation due to missing 'obb' or 'name' in object: {obj}") 23 | return {'is_visible': False} # Return basic structure indicating not visible 24 | 25 | world_coords = np.asarray(obb.get_box_points()) 26 | 27 | # Pixel Occupancy & 2D Metrics 28 | # --- Retrieve pre-calculated occupancy map --- 29 | occupancy_map = individual_occupancy_maps.get(obj_name) 30 | if occupancy_map is None: 31 | # Removed fallback calculation, error is printed instead. 32 | print(f"Error: Occupancy map not found for '{obj_name}' in get_object_metrics. 
Aborting.") 33 | return {'is_visible': False} 34 | 35 | occupied_coords = np.argwhere(occupancy_map) # Shape (N, 2), cols=[row, col] 36 | 37 | # Visible Depth Range 38 | min_depth, max_depth = get_min_max_visible_depth_for_box(obb, extrinsic, intrinsic, image_size) 39 | 40 | # Determine overall visibility 41 | metrics['is_visible'] = occupied_coords.size > 0 and min_depth is not None 42 | 43 | if metrics['is_visible']: 44 | metrics['min_px'] = np.min(occupied_coords[:, 1]) # Min X pixel 45 | metrics['max_px'] = np.max(occupied_coords[:, 1]) # Max X pixel 46 | metrics['min_py'] = np.min(occupied_coords[:, 0]) # Min Y pixel 47 | metrics['max_py'] = np.max(occupied_coords[:, 0]) # Max Y pixel 48 | metrics['center_px'] = np.mean(occupied_coords[:, ::-1], axis=0) # Centroid (x, y) 49 | metrics['min_depth'] = min_depth 50 | metrics['max_depth'] = max_depth 51 | metrics['visible_depth_avg'] = (min_depth + max_depth) / 2.0 52 | else: 53 | # Set pixel/depth metrics to None if not visible 54 | metrics['min_px'] = metrics['max_px'] = metrics['min_py'] = metrics['max_py'] = None 55 | metrics['center_px'] = None 56 | metrics['min_depth'] = metrics['max_depth'] = None 57 | metrics['visible_depth_avg'] = None 58 | 59 | # World Z Metrics (calculated regardless of visibility) 60 | metrics['world_z_min'] = np.min(world_coords[:, 2]) 61 | metrics['world_z_max'] = np.max(world_coords[:, 2]) 62 | # Handle potential empty world_coords if needed, though unlikely for OBB 63 | if world_coords.size > 0: 64 | metrics['world_z_avg'] = np.mean(world_coords[:, 2]) 65 | else: 66 | metrics['world_z_avg'] = None # Should not happen with OBB 67 | 68 | return metrics -------------------------------------------------------------------------------- /robospatial/spatial_analysis/context/context_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | 12 | import numpy as np 13 | from matplotlib.path import Path 14 | 15 | 16 | 17 | 18 | 19 | 20 | def compute_distances_to_bbox(points, bbox_corners): 21 | """ 22 | Compute the shortest distance from multiple points to a bounding box (convex polygon). 23 | 24 | Parameters: 25 | - points: A NumPy array of shape (N, 2) representing N points. 26 | - bbox_corners: A NumPy array of shape (M, 2) representing the corners of the bounding box in order. 27 | 28 | Returns: 29 | - distances: A NumPy array of shape (N,) containing the shortest distances from each point to the bounding box. 
30 | """ 31 | # Create a Path object for the bounding box 32 | path = Path(bbox_corners) 33 | 34 | # Determine which points are inside the bounding box 35 | inside = path.contains_points(points) 36 | 37 | # Initialize distances array 38 | distances = np.zeros(points.shape[0]) 39 | 40 | # Points outside the polygon 41 | outside_points = points[~inside] 42 | 43 | if outside_points.size > 0: 44 | num_edges = bbox_corners.shape[0] 45 | distances_outside = np.full(outside_points.shape[0], np.inf) 46 | 47 | # Compute distances from points to each edge 48 | for i in range(num_edges): 49 | A = bbox_corners[i] 50 | B = bbox_corners[(i + 1) % num_edges] 51 | AB = B - A 52 | AB_squared = np.dot(AB, AB) 53 | 54 | if AB_squared == 0: 55 | # A and B are the same point 56 | distances_edge = np.linalg.norm(outside_points - A, axis=1) 57 | else: 58 | AP = outside_points - A 59 | t = np.dot(AP, AB) / AB_squared 60 | t = np.clip(t, 0, 1) 61 | closest = A + t[:, np.newaxis] * AB 62 | distances_edge = np.linalg.norm(outside_points - closest, axis=1) 63 | 64 | distances_outside = np.minimum(distances_outside, distances_edge) 65 | 66 | # Assign distances to the corresponding points 67 | distances[~inside] = distances_outside 68 | 69 | # Points inside have zero distance 70 | distances[inside] = 0.0 71 | 72 | return distances 73 | 74 | 75 | def project_points_to_image(points, extrinsic, intrinsic): 76 | extrinsic_w2c = np.linalg.inv(extrinsic) 77 | points = np.concatenate([points, np.ones((points.shape[0], 1))], axis=1) 78 | points_img = intrinsic @ extrinsic_w2c @ points.transpose() 79 | points_img = points_img.transpose() 80 | 81 | # Normalize homogeneous coordinates 82 | w = points_img[:, 3] 83 | points_img = points_img[:, :3] / w[:, np.newaxis] 84 | 85 | # Initialize output arrays 86 | points_pixel = points_img[:, :2] / points_img[:, 2][:, np.newaxis] 87 | points_depth = points_img[:, 2] 88 | 89 | return np.round(points_pixel).astype(int).tolist(), points_depth.tolist() -------------------------------------------------------------------------------- /robospatial/spatial_analysis/grounding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | import numpy as np 12 | 13 | def get_object_grounding(obj, occupancy_map): 14 | """ 15 | Generates 2D bounding boxes based on pixel occupancy for a given 3D object. 16 | Uses a pre-calculated occupancy map. 17 | 18 | Args: 19 | obj (dict): A single object dict with at least 'obb' and 'name'. 20 | extrinsic (np.ndarray): Extrinsic matrix. 21 | intrinsic (np.ndarray): Intrinsic matrix. 22 | image_size (tuple): (width, height). 23 | occupancy_map (np.ndarray): Pre-calculated 2D boolean occupancy map for this object. 
24 | 
25 |     Returns:
26 |         dict or None: A dictionary containing:
27 |                       - 'name': Object name (str)
28 |                       - 'clipped_bbox': Axis-aligned clipped [xmin, ymin, xmax, ymax] (list)
29 |                       - 'bbox_3d': Original 3D bounding box coordinates (list) - if available in input
30 |                       - 'obb': Original OrientedBoundingBox - needed later for occupancy map
31 |                       Returns None if the object has no OBB, the occupancy map is missing, or the object is not visible.
32 |     """
33 |     if 'obb' not in obj:
34 |         print(f"Warning: Skipping object {obj.get('name', 'Unknown')} because 'obb' is missing.")
35 |         return None
36 |     if occupancy_map is None:
37 |         print(f"Warning: Occupancy map is None for object {obj.get('name', 'Unknown')}. Cannot calculate grounding.")
38 |         return None
39 | 
40 |     # Use the provided occupancy_map
41 |     occupied_coords = np.argwhere(occupancy_map) # Shape (N, 2), cols=[row, col] -> (y, x)
42 | 
43 |     if occupied_coords.size == 0:
44 |         # Object does not project onto any pixels according to the map
45 |         return None
46 | 
47 |     # --- Calculate Clipped Axis-Aligned BBox from Occupied Pixels ---
48 |     # Remember: occupied_coords are (row, col) -> (y, x)
49 |     clipped_min_y = np.min(occupied_coords[:, 0])
50 |     clipped_max_y = np.max(occupied_coords[:, 0])
51 |     clipped_min_x = np.min(occupied_coords[:, 1])
52 |     clipped_max_x = np.max(occupied_coords[:, 1])
53 | 
54 |     clipped_coords_bbox = [clipped_min_x, clipped_min_y, clipped_max_x, clipped_max_y]
55 | 
56 |     # --- Store Info ---
57 |     info = {
58 |         'name': obj.get("name"),
59 |         'clipped_bbox': [float(c) for c in clipped_coords_bbox], # Ensure floats
60 |         'bbox_3d': obj.get("bbox_3d"),
61 |         'obb': obj['obb']
62 |     }
63 |     # Return the dictionary directly, not a list containing the dictionary
64 |     return info
--------------------------------------------------------------------------------
/robospatial/spatial_analysis/obj_properties.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # NVIDIA CORPORATION and its licensors retain all intellectual
4 | # property and proprietary rights in and to this material, related
5 | # documentation and any modifications thereto. Any use, reproduction,
6 | # disclosure or distribution of this material and related documentation
7 | # without an express license agreement from NVIDIA CORPORATION or
8 | # its affiliates is strictly prohibited.
9 | 
10 | """Defines default lists of object categories based on physical properties.
11 | 
12 | These lists categorize objects based on characteristics relevant for spatial
13 | analysis tasks, such as determining potential interactions or relationships.
14 | For example, knowing if an object has a discernible 'face' can influence
15 | contextual relationship calculations (e.g., 'in front of'), and knowing if an
16 | object is movable and placeable on a flat surface is crucial for compatibility
17 | checks like 'on top of'.
18 | 
19 | The lists provided here are defaults and are intended to be user-configurable.
20 | Users can modify these lists directly or replace them based on the specific
21 | object categories present in their datasets or required for their analysis tasks.
22 | 
23 | Defined Lists:
24 | - `items_with_face`: Objects typically having a primary interaction face.
25 | - `flat_surface_items`: Objects typically providing a flat surface for placement.
26 | - `movable_items`: Objects generally considered movable. 27 | - `movable_and_placeable_items`: A subset of movable items suitable for placing on surfaces. 28 | - `embodiedscan_objs`: A comprehensive list of object categories found in the EmbodiedScan dataset (provided for reference). 29 | """ 30 | 31 | # GPT-generated item filter list 32 | 33 | items_with_face = [ 34 | 'air conditioner', 'alarm', 'album', 'bed', 'bicycle', 'blackboard', 35 | 'book', 'camera', 'car', 'cabinet', 'calendar', 'clock', 'computer', 36 | 'copier', 'couch', 'desk', 'dishwasher', 'door', 'drawer', 'drum', 37 | 'fan', 'fireplace', 'garage door', 'guitar', 'heater', 'humidifier', 'kettle', 38 | 'keyboard', 'laptop', 'mailbox', 'microwave', 'mirror', 'monitor', 'oven', 39 | 'piano', 'picture', 'projector', 'refrigerator', 'screen', 'shelf', 'sink', 'stand', 40 | 'speaker', 'stairs', 'stove', 'tablet', 'telephone', 'tv', 41 | 'toilet', 'vacuum cleaner', 'washing machine', 'window' 42 | ] 43 | 44 | 45 | flat_surface_items = [ 46 | 'bar', 'beam', 'bed', 'bench', 'cabinet', 'cart', 47 | 'chair', 'counter', 'countertop', 'crate', 48 | 'cube', 'desk', 'dresser', 'footstool', 49 | 'kitchen island', 'mat', 'mattress', 'ottoman', 50 | 'package', 'panel', 'pedestal', 'pool table', 51 | 'rack', 'rail', 'shelf', 'stand', 'steps', 'stool', 'structure', 'support', 52 | 'table', 'tray', 'vanity', 'windowsill' 53 | ] 54 | 55 | movable_items = [ 56 | 'adhesive tape', 'air conditioner', 'alarm', 'album', 'backpack', 'bag', 'ball', 57 | 'basket', 'beanbag', 'bicycle', 'bin', 'blackboard', 'blanket', 'board', 58 | 'body loofah', 'book', 'boots', 'bottle', 'bowl', 'box', 'bread', 59 | 'broom', 'brush', 'bucket', 'camera', 'can', 'candle', 'candlestick', 60 | 'cap', 'car', 'carpet', 'cart', 'case', 'chair', 'cleanser', 61 | 'clock', 'clothes', 'clothes dryer', 'coat hanger', 'coffee maker', 'commode', 'computer', 62 | 'conducting wire', 'container', 'control', 'copier', 'cosmetics', 'crib', 'cup', 63 | 'curtain', 'cushion', 'decoration', 'desk', 'detergent', 'device', 'dish rack', 64 | 'dispenser', 'divider', 'drawer', 'dress', 'dresser', 'drum', 'dumbbell', 65 | 'dustpan', 'dvd', 'eraser', 'excercise equipment', 'fan', 'file', 'fire extinguisher', 66 | 'flowerpot', 'folder', 'food', 'footstool', 'frame', 'fruit', 'furniture', 67 | 'garbage', 'glass', 'globe', 'glove', 'guitar', 'hair dryer', 'hamper', 68 | 'hat', 'headphones', 'heater', 'helmets', 'holder', 'hook', 'humidifier', 69 | 'jacket', 'jar', 'kettle', 'keyboard', 'kitchenware', 'knife', 'label', 70 | 'ladder', 'lamp', 'laptop', 'letter', 'light', 'luggage', 'machine', 71 | 'magazine', 'mailbox', 'map', 'mask', 'mat', 'mattress', 'menu', 72 | 'microwave', 'mirror', 'mop', 'mouse', 'napkins', 'notebook', 'object', 73 | 'ottoman', 'oven', 'pack', 'package', 'pad', 'pan', 'paper', 74 | 'paper cutter', 'pen', 'pillow', 'pitcher', 'plant', 'plate', 'player', 75 | 'plug', 'plunger', 'pool', 'pool table', 'poster', 'pot', 'price tag', 76 | 'printer', 'projector', 'purse', 'rack', 'radiator', 'radio', 'remote control', 77 | 'roll', 'rope', 'sack', 'salt', 'scale', 'scissors', 'screen', 78 | 'seasoning', 'shampoo', 'sheet', 'shirt', 'shoe', 'shovel', 'sign', 79 | 'soap', 'soap dish', 'soap dispenser', 'speaker', 'sponge', 'spoon', 'stand', 80 | 'stapler', 'statue', 'stick', 'stool', 'sunglasses', 'tablet', 'teapot', 81 | 'telephone', 'tissue', 'tissue box', 'toaster', 'toiletry', 'tool', 'toothbrush', 82 | 'toothpaste', 'towel', 'toy', 'tray', 'treadmill', 'trophy', 'tube', 83 | 'tv', 
'umbrella', 'urn', 'utensil', 'vacuum cleaner', 'vase', 'wardrobe', 84 | 'washbasin', 'washing machine', 'wine', 'wrap' 85 | ] 86 | 87 | 88 | movable_and_placeable_items = [ 89 | 'adhesive tape', 90 | 'alarm', 91 | 'album', 92 | 'backpack', 93 | 'bag', 94 | 'ball', 95 | 'basket', 96 | 'beanbag', 97 | 'bicycle', 98 | 'bin', 99 | 'blanket', 100 | 'board', 101 | 'body loofah', 102 | 'book', 103 | 'boots', 104 | 'bottle', 105 | 'bowl', 106 | 'box', 107 | 'bread', 108 | 'broom', 109 | 'brush', 110 | 'bucket', 111 | 'calendar', 112 | 'camera', 113 | 'can', 114 | 'candle', 115 | 'candlestick', 116 | 'cap', 117 | 'carpet', 118 | 'cart', 119 | 'case', 120 | 'chair', 121 | 'cleanser', 122 | 'clock', 123 | 'clothes', 124 | 'coat hanger', 125 | 'coffee maker', 126 | 'coil', 127 | 'computer', 128 | 'conducting wire', 129 | 'container', 130 | 'control', 131 | 'cosmetics', 132 | 'crate', 133 | 'cube', 134 | 'cup', 135 | 'curtain', 136 | 'cushion', 137 | 'decoration', 138 | 'detergent', 139 | 'device', 140 | 'dish rack', 141 | 'dispenser', 142 | 'door knob', 143 | 'drawer', 144 | 'dress', 145 | 'drum', 146 | 'dumbbell', 147 | 'dustpan', 148 | 'dvd', 149 | 'eraser', 150 | 'fan', 151 | 'file', 152 | 'fire extinguisher', 153 | 'flowerpot', 154 | 'folder', 155 | 'food', 156 | 'footstool', 157 | 'frame', 158 | 'fruit', 159 | 'garbage', 160 | 'glass', 161 | 'globe', 162 | 'glove', 163 | 'guitar', 164 | 'hair dryer', 165 | 'hamper', 166 | 'hanger', 167 | 'hat', 168 | 'headphones', 169 | 'heater', 170 | 'helmets', 171 | 'holder', 172 | 'hook', 173 | 'humidifier', 174 | 'ironware', 175 | 'jacket', 176 | 'jar', 177 | 'kettle', 178 | 'keyboard', 179 | 'kitchenware', 180 | 'knife', 181 | 'label', 182 | 'ladder', 183 | 'lamp', 184 | 'laptop', 185 | 'letter', 186 | 'light', 187 | 'luggage', 188 | 'magazine', 189 | 'map', 190 | 'mask', 191 | 'mat', 192 | 'menu', 193 | 'microwave', 194 | 'mirror', 195 | 'monitor', 196 | 'mop', 197 | 'mouse', 198 | 'napkins', 199 | 'notebook', 200 | 'object', 201 | 'ottoman', 202 | 'pack', 203 | 'package', 204 | 'pad', 205 | 'pan', 206 | 'paper', 207 | 'paper cutter', 208 | 'pen', 209 | 'picture', 210 | 'pillow', 211 | 'pipe', 212 | 'pitcher', 213 | 'plant', 214 | 'plate', 215 | 'player', 216 | 'plug', 217 | 'plunger', 218 | 'poster', 219 | 'pot', 220 | 'price tag', 221 | 'printer', 222 | 'projector', 223 | 'purse', 224 | 'rack', 225 | 'radio', 226 | 'remote control', 227 | 'rod', 228 | 'roll', 229 | 'rope', 230 | 'sack', 231 | 'salt', 232 | 'scale', 233 | 'scissors', 234 | 'screen', 235 | 'seasoning', 236 | 'shampoo', 237 | 'sheet', 238 | 'shirt', 239 | 'shoe', 240 | 'shovel', 241 | 'sign', 242 | 'soap', 243 | 'soap dish', 244 | 'soap dispenser', 245 | 'speaker', 246 | 'sponge', 247 | 'spoon', 248 | 'stand', 249 | 'stapler', 250 | 'statue', # if small enough 251 | 'stick', 252 | 'stool', 253 | 'sunglasses', 254 | 'table', # if small enough 255 | 'tablet', 256 | 'teapot', 257 | 'telephone', 258 | 'tissue', 259 | 'tissue box', 260 | 'toaster', 261 | 'toilet paper', 262 | 'toiletry', 263 | 'tool', 264 | 'toothbrush', 265 | 'toothpaste', 266 | 'towel', 267 | 'toy', 268 | 'tray', 269 | 'trophy', 270 | 'tube', 271 | 'tv', # most modern sets can be moved by one person 272 | 'umbrella', 273 | 'urn', 274 | 'utensil', 275 | 'vacuum cleaner', 276 | 'vase', 277 | 'wine', 278 | 'wire', 279 | 'wood', 280 | 'wrap' 281 | ] 282 | 283 | 284 | 285 | embodiedscan_objs = [ 286 | 'adhesive tape', 'air conditioner', 'alarm', 'album', 'arch', 'backpack', 287 | 'bag', 'balcony', 'ball', 'banister', 'bar', 
'barricade', 'baseboard', 288 | 'basin', 'basket', 'bathtub', 'beam', 'beanbag', 'bed', 'bench', 289 | 'bicycle', 'bidet', 'bin', 'blackboard', 'blanket', 'blinds', 'board', 290 | 'body loofah', 'book', 'boots', 'bottle', 'bowl', 'box', 'bread', 291 | 'broom', 'brush', 'bucket', 'cabinet', 'calendar', 'camera', 'can', 292 | 'candle', 'candlestick', 'cap', 'car', 'carpet', 'cart', 'case', 293 | 'ceiling', 'chair', 'chandelier', 'cleanser', 'clock', 'clothes', 294 | 'clothes dryer', 'coat hanger', 'coffee maker', 'coil', 'column', 295 | 'commode', 'computer', 'conducting wire', 'container', 'control', 296 | 'copier', 'cosmetics', 'couch', 'counter', 'countertop', 'crate', 297 | 'crib', 'cube', 'cup', 'curtain', 'cushion', 'decoration', 'desk', 298 | 'detergent', 'device', 'dish rack', 'dishwasher', 'dispenser', 'divider', 299 | 'door', 'door knob', 'doorframe', 'doorway', 'drawer', 'dress', 300 | 'dresser', 'drum', 'duct', 'dumbbell', 'dustpan', 'dvd', 'eraser', 301 | 'exercise equipment', 'fan', 'faucet', 'fence', 'file', 'fire extinguisher', 302 | 'fireplace', 'floor', 'flowerpot', 'flush', 'folder', 'food', 'footstool', 303 | 'frame', 'fruit', 'furniture', 'garage door', 'garbage', 'glass', 'globe', 304 | 'glove', 'grab bar', 'grass', 'guitar', 'hair dryer', 'hamper', 'handle', 305 | 'hanger', 'hat', 'headboard', 'headphones', 'heater', 'helmets', 'holder', 306 | 'hook', 'humidifier', 'ironware', 'jacket', 'jalousie', 'jar', 'kettle', 307 | 'keyboard', 'kitchen island', 'kitchenware', 'knife', 'label', 'ladder', 308 | 'lamp', 'laptop', 'ledge', 'letter', 'light', 'luggage', 'machine', 309 | 'magazine', 'mailbox', 'map', 'mask', 'mat', 'mattress', 'menu', 310 | 'microwave', 'mirror', 'molding', 'monitor', 'mop', 'mouse', 'napkins', 311 | 'notebook', 'object', 'ottoman', 'oven', 'pack', 'package', 'pad', 312 | 'pan', 'panel', 'paper', 'paper cutter', 'partition', 'pedestal', 313 | 'pen', 'person', 'piano', 'picture', 'pillar', 'pillow', 'pipe', 314 | 'pitcher', 'plant', 'plate', 'player', 'plug', 'plunger', 'pool', 315 | 'pool table', 'poster', 'pot', 'price tag', 'printer', 'projector', 316 | 'purse', 'rack', 'radiator', 'radio', 'rail', 'range hood', 317 | 'refrigerator', 'remote control', 'ridge', 'rod', 'roll', 'roof', 318 | 'rope', 'sack', 'salt', 'scale', 'scissors', 'screen', 'seasoning', 319 | 'shampoo', 'sheet', 'shelf', 'shirt', 'shoe', 'shovel', 'shower', 320 | 'sign', 'sink', 'soap', 'soap dish', 'soap dispenser', 'socket', 321 | 'speaker', 'sponge', 'spoon', 'stairs', 'stall', 'stand', 'stapler', 322 | 'statue', 'steps', 'stick', 'stool', 'stopcock', 'stove', 'structure', 323 | 'sunglasses', 'support', 'switch', 'table', 'tablet', 'teapot', 324 | 'telephone', 'thermostat', 'tissue', 'tissue box', 'toaster', 325 | 'toilet', 'toilet paper', 'toiletry', 'tool', 'toothbrush', 326 | 'toothpaste', 'towel', 'toy', 'tray', 'treadmill', 'trophy', 'tube', 327 | 'tv', 'umbrella', 'urn', 'utensil', 'vacuum cleaner', 'vanity', 328 | 'vase', 'vent', 'ventilation', 'wall', 'wardrobe', 'washbasin', 329 | 'washing machine', 'water cooler', 'water heater', 'window', 330 | 'window frame', 'windowsill', 'wine', 'wire', 'wood', 'wrap' 331 | ] 332 | 333 | -------------------------------------------------------------------------------- /robospatial/spatial_analysis/relationship_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | # Utils for spatial relationships 11 | import numpy as np 12 | from matplotlib.path import Path 13 | import open3d as o3d 14 | from scipy.spatial import ConvexHull 15 | 16 | 17 | 18 | 19 | def get_min_max_visible_depth_for_box(box, extrinsic, intrinsic, image_size): 20 | """ 21 | Calculates the minimum and maximum depth of the visible parts of a box. 22 | 23 | Args: 24 | box (o3d.geometry.OrientedBoundingBox): The bounding box. 25 | extrinsic (np.ndarray): 4x4 extrinsic matrix (camera to world). 26 | intrinsic (np.ndarray): 4x4 camera intrinsic matrix. 27 | image_size (tuple): (width, height) of the image. 28 | 29 | Returns: 30 | tuple: (min_visible_depth, max_visible_depth) or (None, None) if the box 31 | is not visible or behind the camera. 32 | """ 33 | width, height = image_size 34 | extrinsic_w2c = np.linalg.inv(extrinsic) 35 | EPS = 1e-6 # Small epsilon for depth checks 36 | 37 | # 1. Get world corners 38 | corners_world = np.asarray(box.get_box_points()) 39 | corners_world_hom = np.hstack((corners_world, np.ones((8, 1)))) # Homogeneous coordinates 40 | 41 | # 2. Transform to camera coordinates 42 | corners_cam_hom = corners_world_hom @ extrinsic_w2c.T 43 | 44 | # Validate transformation results 45 | if not np.all(np.isfinite(corners_cam_hom)): 46 | # print("Warning: Non-finite values encountered during world to camera transformation.") 47 | return None, None 48 | 49 | # corners_cam = corners_cam_hom[:, :3] / corners_cam_hom[:, 3][:, np.newaxis] # Normalize if W is not 1 50 | corners_cam = corners_cam_hom[:, :3] # Assume w=1 from standard transformation 51 | depths = corners_cam[:, 2] # Z-coordinate is depth 52 | 53 | # 3. Filter points behind the camera 54 | valid_depth_mask = depths > EPS 55 | if not np.any(valid_depth_mask): 56 | return None, None # Box entirely behind camera 57 | 58 | valid_corners_cam = corners_cam[valid_depth_mask] 59 | valid_depths = depths[valid_depth_mask] 60 | 61 | # If no valid points in front of the camera, return None 62 | if valid_corners_cam.shape[0] == 0: 63 | return None, None 64 | 65 | # 4. 
Project *all* valid camera points (not just visible ones) to pixel coordinates to check overlap 66 | # This helps catch cases where vertices are off-screen but faces/edges are visible 67 | valid_corners_cam_hom_for_proj = np.hstack((valid_corners_cam, np.ones((valid_corners_cam.shape[0], 1)))) 68 | corners_proj = valid_corners_cam_hom_for_proj @ intrinsic.T 69 | 70 | # Validate projection results 71 | if not np.all(np.isfinite(corners_proj)): 72 | # print("Warning: Non-finite values encountered during projection.") 73 | return None, None 74 | 75 | proj_depths = corners_proj[:, 2] 76 | # Filter points where projection depth is too small (avoids division by zero) 77 | valid_proj_mask = np.abs(proj_depths) >= EPS 78 | if not np.any(valid_proj_mask): 79 | return None, None # All points projected onto image plane or behind 80 | 81 | corners_proj = corners_proj[valid_proj_mask] 82 | proj_depths = proj_depths[valid_proj_mask] 83 | corners_pixel = corners_proj[:, :2] / proj_depths[:, np.newaxis] 84 | # We also need to filter the original depths to match the filtered projected points 85 | valid_depths = valid_depths[valid_proj_mask] 86 | 87 | corners_pixel_rounded = np.round(corners_pixel).astype(int) 88 | 89 | # 5. Check visibility: At least one vertex inside image bounds? 90 | in_image_mask = (corners_pixel_rounded[:, 0] >= 0) & (corners_pixel_rounded[:, 0] < width) & \ 91 | (corners_pixel_rounded[:, 1] >= 0) & (corners_pixel_rounded[:, 1] < height) 92 | any_vertex_visible = np.any(in_image_mask) 93 | 94 | # 6. Check visibility: Projected bounding box overlaps image? 95 | min_px, min_py = np.min(corners_pixel_rounded, axis=0) 96 | max_px, max_py = np.max(corners_pixel_rounded, axis=0) 97 | bbox_overlaps_image = not (max_px < 0 or min_px >= width or max_py < 0 or min_py >= height) 98 | 99 | # 7. Determine if any part is visible 100 | is_visible = any_vertex_visible or bbox_overlaps_image 101 | 102 | # 8. Return min/max depth if visible 103 | if is_visible and valid_depths.size > 0: # Ensure there are depths to calculate min/max from 104 | min_visible_depth = np.min(valid_depths) 105 | max_visible_depth = np.max(valid_depths) 106 | return min_visible_depth, max_visible_depth 107 | else: 108 | return None, None 109 | 110 | 111 | 112 | def calculate_occupied_pixels(objects, extrinsic, intrinsic, img_shape): 113 | """Compute occupancy map for the given objects (Optimized). 114 | 115 | Also returns individual occupancy maps for each object. 116 | 117 | Args: 118 | objects (list of dict): List of object dictionaries, each must contain 119 | 'obb' (o3d.geometry.OrientedBoundingBox) and 120 | 'name' (str). 121 | extrinsic (np.ndarray): 4x4 extrinsic matrix (camera to world transformation). 122 | intrinsic (np.ndarray): 4x4 camera intrinsic matrix. 123 | img_shape (tuple): Shape of the image (width, height). 124 | 125 | Returns: 126 | tuple: (combined_occupancy_map, individual_occupancy_maps) 127 | - combined_occupancy_map (np.ndarray): 2D boolean array for all objects. 128 | - individual_occupancy_maps (dict): Dictionary where keys are object 129 | names and values are individual 130 | 2D boolean occupancy maps. 
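    A minimal usage sketch (the OBBs, camera matrices, and image size below are
    placeholders supplied by the caller; only the argument structure is taken
    from this function):

        objects = [
            {"name": "chair", "obb": chair_obb},   # o3d.geometry.OrientedBoundingBox
            {"name": "table", "obb": table_obb},
        ]
        combined_map, per_object_maps = calculate_occupied_pixels(
            objects, extrinsic, intrinsic, (width, height))
        chair_pixels = per_object_maps["chair"].sum()   # pixels covered by the projected 'chair' box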
131 | """ 132 | EPS = 1e-6 133 | extrinsic_w2c = np.linalg.inv(extrinsic) 134 | w, h = img_shape # Correctly unpack width and height 135 | combined_occupancy_map = np.zeros((h, w), dtype=bool) 136 | individual_occupancy_maps = {} # Store individual maps here 137 | 138 | faces = [ 139 | [0, 1, 2, 3], [4, 5, 6, 7], 140 | [0, 1, 5, 4], [3, 2, 6, 7], 141 | [0, 3, 7, 4], [1, 2, 6, 5] 142 | ] 143 | 144 | for obj_info in objects: 145 | box = obj_info.get('obb') 146 | obj_name = obj_info.get('name') 147 | 148 | if box is None or obj_name is None: 149 | print(f"Warning: Skipping object due to missing 'obb' or 'name'. Info: {obj_info}") 150 | continue 151 | 152 | # Initialize individual map for this object 153 | current_obj_map = np.zeros((h, w), dtype=bool) 154 | 155 | # Get the corners of the box 156 | corners = np.asarray(box.get_box_points()) 157 | # Reorder corners to match the original code 158 | corners = corners[[0, 1, 7, 2, 3, 6, 4, 5]] 159 | # Add homogeneous coordinate 160 | corners_hom = np.concatenate([corners, np.ones((corners.shape[0], 1))], axis=1) 161 | # Project corners to image plane 162 | corners_img = (intrinsic @ extrinsic_w2c @ corners_hom.T).T 163 | 164 | # Check for invalid depths early 165 | if np.any(np.abs(corners_img[:, 2]) < EPS): 166 | pass # Face-by-face check will handle points behind camera 167 | 168 | # Normalize projected points using actual z-coordinate 169 | corners_pixel = np.zeros((corners_img.shape[0], 2)) 170 | valid_proj_mask = np.abs(corners_img[:, 2]) >= EPS 171 | if np.any(valid_proj_mask): 172 | corners_pixel[valid_proj_mask] = corners_img[valid_proj_mask, :2] / corners_img[valid_proj_mask, 2][:, np.newaxis] 173 | 174 | for face in faces: 175 | # Check if all vertices of the face are behind the camera 176 | if np.any(corners_img[face, 2] < EPS): 177 | continue # Skip faces that are entirely or partially behind the camera plane 178 | 179 | pts = corners_pixel[face] 180 | 181 | # Calculate the bounding box of the projected face 182 | min_coords = np.min(pts, axis=0) 183 | max_coords = np.max(pts, axis=0) 184 | 185 | # Determine the subgrid boundaries, clamping to image dimensions 186 | min_x = max(0, int(np.floor(min_coords[0]))) 187 | min_y = max(0, int(np.floor(min_coords[1]))) 188 | max_x = min(w - 1, int(np.ceil(max_coords[0]))) 189 | max_y = min(h - 1, int(np.ceil(max_coords[1]))) 190 | 191 | # If the bounding box is outside the image or has no area, skip 192 | if max_x < min_x or max_y < min_y: 193 | continue 194 | 195 | # Create coordinate grid only for the bounding box region 196 | sub_x, sub_y = np.meshgrid(np.arange(min_x, max_x + 1), np.arange(min_y, max_y + 1)) 197 | pixel_points_sub = np.vstack((sub_x.flatten(), sub_y.flatten())).T 198 | 199 | if pixel_points_sub.size == 0: 200 | continue # Skip if subgrid is empty 201 | 202 | # Check containment using Path for the subgrid 203 | p = Path(pts) 204 | mask_sub = p.contains_points(pixel_points_sub).reshape((max_y - min_y + 1, max_x - min_x + 1)) 205 | 206 | # Update the *individual* occupancy map 207 | current_obj_map[min_y:max_y+1, min_x:max_x+1] |= mask_sub 208 | 209 | # Store the individual map 210 | individual_occupancy_maps[obj_name] = current_obj_map 211 | # Combine into the main map 212 | combined_occupancy_map |= current_obj_map 213 | 214 | return combined_occupancy_map, individual_occupancy_maps 215 | 216 | 217 | 218 | # Function to project the bounding box to the floor (2D) 219 | def project_to_floor(box): 220 | corners = np.asarray(box.get_box_points()) 221 | # corners[:, 2] = 0 # Set 
the z-coordinate to 0 to project onto the floor 222 | return corners[:, :2] # Return only the x,y coordinates 223 | 224 | 225 | 226 | # Adapted from: https://github.com/OpenRobotLab/EmbodiedScan/blob/main/embodiedscan/visualization/utils.py 227 | # License: Apache 2.0 228 | def _9dof_to_box(box, label=None, color_selector=None, color=None): 229 | """Convert 9-DoF box from array/tensor to open3d.OrientedBoundingBox. 230 | 231 | Args: 232 | box (numpy.ndarray|torch.Tensor|List[float]): 233 | 9-DoF box with shape (9,). 234 | label (int, optional): Label of the box. Defaults to None. 235 | color_selector (:obj:`ColorSelector`, optional): 236 | Color selector for boxes. Defaults to None. 237 | color (tuple[int], optional): Color of the box. 238 | You can directly specify the color. 239 | If you do, the color_selector and label will be ignored. 240 | Defaults to None. 241 | """ 242 | if isinstance(box, list): 243 | box = np.array(box) 244 | else: 245 | print("box is not a list!") 246 | print(type(box)) 247 | # if isinstance(box, Tensor): #NOTE omitted to not load in torch for just this! 248 | # box = box.cpu().numpy() 249 | center = box[:3].reshape(3, 1) 250 | scale = box[3:6].reshape(3, 1) 251 | rot = box[6:].reshape(3, 1) 252 | rot_mat = o3d.geometry.OrientedBoundingBox.get_rotation_matrix_from_zxy(rot) 253 | geo = o3d.geometry.OrientedBoundingBox(center, rot_mat, scale) 254 | 255 | if color is not None: 256 | geo.color = [x / 255.0 for x in color] 257 | return geo 258 | 259 | if label is not None and color_selector is not None: 260 | color = color_selector.get_color(label) 261 | color = [x / 255.0 for x in color] 262 | geo.color = color 263 | return geo -------------------------------------------------------------------------------- /robospatial/spatial_analysis/relationships.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | """High-level interface for spatial relationship analysis between 3D objects. 11 | 12 | This module provides wrapper functions that simplify the calculation of various 13 | spatial relationships by utilizing lower-level functions from the `context`, 14 | `configuration`, and `compatibility` submodules. 15 | 16 | Key functionalities include: 17 | - `get_spatial_context`: Determines points in space relative to a reference object 18 | (e.g., 'in front of', 'behind'). 19 | - `get_spatial_configuration`: Calculates 3D directional relationships between two 20 | objects (e.g., 'left of', 'above'). 21 | - `get_spatial_compatibility`: Assesses whether one object can physically fit 22 | relative to another (e.g., 'on top of', 'next to'). 23 | 24 | These functions are typically used by higher-level annotation generation scripts. 
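A rough usage sketch (the object dicts, camera matrices, image size, and
occupancy maps are placeholders; in practice they are prepared by the
higher-level annotation generation scripts mentioned above):

    obj1 = {"name": "cup", "obb": cup_obb}       # OrientedBoundingBox placeholders
    obj2 = {"name": "table", "obb": table_obb}
    rels = get_spatial_configuration(obj1, obj2, extrinsic, intrinsic,
                                     image_size, individual_occupancy_maps,
                                     strictness="lenient")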
25 | """ 26 | 27 | from spatial_analysis.obj_properties import items_with_face, movable_and_placeable_items, flat_surface_items 28 | from spatial_analysis.context.context import get_point_in_space_relative_to_object 29 | from spatial_analysis.configuration.configuration import check_spatial_configuration_relationships 30 | from spatial_analysis.compatibility.compatibility import can_fit_object_a_in_relation_to_b, can_fit_on_top 31 | 32 | 33 | def get_spatial_context(obj, extrinsic, intrinsic, floor_bound, obbs, image_size, image_path, 34 | individual_occupancy_maps, env_occupancy_map, 35 | threshold, grid_resolution, num_samples): 36 | """Generates points relative to an object (e.g., in front, behind) for contextual understanding.""" 37 | 38 | have_face = obj["name"] in items_with_face 39 | 40 | # Generate potential points relative to the object's spatial context. 41 | sampled_points, sampled_3d_points, visible_points_3d_all, generated_something = get_point_in_space_relative_to_object( 42 | floor_bound, obbs, 43 | ref_obj=obj, 44 | extrinsic=extrinsic, intrinsic=intrinsic, image_size=image_size, have_face=have_face, 45 | num_samples=num_samples, threshold=threshold, grid_resolution=grid_resolution, 46 | individual_occupancy_maps=individual_occupancy_maps, 47 | env_occupancy_map=env_occupancy_map, 48 | image_path=image_path, 49 | ) 50 | 51 | if generated_something: 52 | return sampled_points, sampled_3d_points, True 53 | return None, None, False 54 | 55 | 56 | def get_spatial_configuration(obj1, obj2, extrinsic, intrinsic, image_size, individual_occupancy_maps, strictness='lenient'): 57 | """Calculates spatial configuration relationships (left/right, above/below, etc.) between two objects.""" 58 | 59 | obj_configuration_relationships = check_spatial_configuration_relationships( 60 | obj1, obj2, extrinsic, intrinsic, image_size, individual_occupancy_maps, strictness 61 | ) 62 | 63 | return obj_configuration_relationships 64 | 65 | 66 | def get_spatial_compatibility(obj1, obj2, extrinsic, intrinsic, floor_bound, obbs, image_size, image_path, 67 | individual_occupancy_maps, env_occupancy_map, 68 | grid_resolution, num_samples, min_distance, buffer_ratio): 69 | """Checks if obj1 can fit in relation to obj2 (e.g., on top, next to).""" 70 | 71 | # Check if the anchor object (obj2) has a face, as this influences spatial context calculations. 72 | have_face = obj2["name"] in items_with_face 73 | 74 | # Check fitting in various spatial relations using sampling-based methods. 75 | results = can_fit_object_a_in_relation_to_b( 76 | floor_bound, obbs, 77 | obj_a=obj1, 78 | obj_b=obj2, 79 | have_face=have_face, 80 | extrinsic=extrinsic, intrinsic=intrinsic, image_size=image_size, image_path=image_path, 81 | grid_resolution=grid_resolution, 82 | min_distance=min_distance, 83 | num_samples=num_samples, 84 | individual_occupancy_maps=individual_occupancy_maps, 85 | env_occupancy_map=env_occupancy_map, 86 | buffer_ratio=buffer_ratio 87 | ) 88 | 89 | # Specifically check 'on_top' relationship using direct OBB comparison 90 | # for movable items on flat surfaces, as this is a common and simpler case. 
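    # Note: movable_and_placeable_items and flat_surface_items are the curated
    # category lists imported from spatial_analysis.obj_properties; the direct
    # OBB result computed below is merged into the sampling-based `results`
    # under the "worldcentric" key.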
91 | fits_on_top = False 92 | if obj1["name"] in movable_and_placeable_items and obj2["name"] in flat_surface_items: 93 | fits_on_top = can_fit_on_top(obj1["obb"], obj2["obb"]) 94 | 95 | if "worldcentric" not in results: 96 | results["worldcentric"] = {} 97 | results["worldcentric"]["on_top"] = fits_on_top 98 | 99 | return results -------------------------------------------------------------------------------- /robospatial/spatial_analysis/topdown_map.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025 NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual 4 | # property and proprietary rights in and to this material, related 5 | # documentation and any modifications thereto. Any use, reproduction, 6 | # disclosure or distribution of this material and related documentation 7 | # without an express license agreement from NVIDIA CORPORATION or 8 | # its affiliates is strictly prohibited. 9 | 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from scipy.spatial import ConvexHull 14 | import matplotlib.path as mpath 15 | 16 | from spatial_analysis.relationship_utils import project_to_floor 17 | 18 | DEBUG_EMPTY_SPACE=False 19 | 20 | # Function to create the grid representing the floor 21 | def create_floor_grid(floor_bound, grid_resolution=0.1): 22 | 23 | min_bound = floor_bound[0] 24 | max_bound = floor_bound[1] 25 | 26 | x_range = np.arange(min_bound[0], max_bound[0], grid_resolution) 27 | y_range = np.arange(min_bound[1], max_bound[1], grid_resolution) 28 | 29 | return np.meshgrid(x_range, y_range) 30 | 31 | # Function to mark occupied areas on the grid 32 | def mark_occupied_areas(grid, boxes, occupied, floor=False): 33 | x_flat = grid[0].ravel() 34 | y_flat = grid[1].ravel() 35 | points_array = np.column_stack((x_flat, y_flat)) 36 | 37 | for box in boxes: 38 | projected_points = project_to_floor(box) 39 | hull = ConvexHull(projected_points) 40 | hull_vertices = projected_points[hull.vertices] 41 | path = mpath.Path(hull_vertices) 42 | 43 | # Vectorized point-in-polygon test 44 | if floor: 45 | inside = ~path.contains_points(points_array) 46 | else: 47 | inside = path.contains_points(points_array) 48 | 49 | # Update the occupied grid 50 | occupied.ravel()[inside] = True 51 | 52 | return occupied 53 | 54 | # Function to find empty areas on the grid 55 | def find_empty_areas(occupied): 56 | empty_areas = np.logical_not(occupied) 57 | return empty_areas 58 | 59 | def get_empty_space(floor_bound, boxes, grid_resolution=0.01): 60 | grid = create_floor_grid(floor_bound, grid_resolution) 61 | empty_occupied = np.zeros(grid[0].shape, dtype=bool) 62 | occupied = mark_occupied_areas(grid, boxes, empty_occupied) 63 | empty_areas = find_empty_areas(occupied) 64 | 65 | if DEBUG_EMPTY_SPACE: 66 | plt.figure(figsize=(10, 10)) 67 | 68 | # Create color-coded grid 69 | color_grid = np.zeros((*empty_areas.shape, 3), dtype=np.uint8) 70 | color_grid[empty_areas] = [0, 255, 0] # Green for empty 71 | color_grid[~empty_areas] = [255, 0, 0] # Red for occupied 72 | 73 | # Plot the grid 74 | plt.imshow(color_grid, 75 | extent=[grid[0].min(), grid[0].max(), grid[1].min(), grid[1].max()], 76 | origin='lower') 77 | 78 | # Plot boxes as outlines 79 | for box in boxes: 80 | corners = project_to_floor(box) 81 | plt.plot(corners[:, 0], corners[:, 1], 'blue', linewidth=2) 82 | plt.fill(corners[:, 0], corners[:, 1], 'blue', alpha=0.3) 83 | 84 | # If floor_box is not a list, plot it as 
an outline 85 | min_bound = floor_bound[0] 86 | max_bound = floor_bound[1] 87 | plt.plot(min_bound[0], min_bound[1], 'black', linewidth=2) 88 | plt.fill(min_bound[0], min_bound[1], 'black', alpha=0.1) 89 | plt.plot(max_bound[0], max_bound[1], 'black', linewidth=2) 90 | plt.fill(max_bound[0], max_bound[1], 'black', alpha=0.1) 91 | plt.plot(min_bound[0], max_bound[1], 'black', linewidth=2) 92 | plt.fill(min_bound[0], max_bound[1], 'black', alpha=0.1) 93 | plt.plot(max_bound[0], min_bound[1], 'black', linewidth=2) 94 | plt.fill(max_bound[0], min_bound[1], 'black', alpha=0.1) 95 | 96 | plt.title("Empty Space Grid") 97 | plt.xlabel('X') 98 | plt.ylabel('Y') 99 | plt.grid(True) 100 | # Create a legend instead of a colorbar 101 | from matplotlib.patches import Patch 102 | legend_elements = [ 103 | Patch(facecolor='red', edgecolor='black', label='Occupied'), 104 | Patch(facecolor='green', edgecolor='black', label='Empty') 105 | ] 106 | plt.legend(handles=legend_elements, loc='lower right') 107 | plt.show() 108 | 109 | return empty_areas, grid, occupied 110 | 111 | -------------------------------------------------------------------------------- /scripts/visualize_input.py: -------------------------------------------------------------------------------- 1 | # Create a new file named visualize_simple.py 2 | import os 3 | import json 4 | import cv2 5 | import argparse 6 | import numpy as np 7 | import open3d as o3d 8 | import matplotlib.path as mpath # Needed for face filling in draw_box3d 9 | 10 | # --- Utility Functions (Copied and potentially simplified) --- 11 | 12 | def _9dof_to_box(box_params, color=None): 13 | """Convert 9-DoF box from array/tensor to open3d.OrientedBoundingBox. 14 | 15 | Args: 16 | box_params (numpy.ndarray|list): 9-DoF box [cx, cy, cz, sx, sy, sz, rx, ry, rz]. 17 | color (tuple[int], optional): RGB Color of the box (0-255). Defaults to None. 18 | 19 | Returns: 20 | open3d.geometry.OrientedBoundingBox: The converted Open3D box. 21 | """ 22 | if isinstance(box_params, list): 23 | box_params = np.array(box_params) 24 | 25 | center = box_params[:3].reshape(3, 1) 26 | scale = box_params[3:6].reshape(3, 1) 27 | rot = box_params[6:].reshape(3, 1) 28 | rot_mat = o3d.geometry.OrientedBoundingBox.get_rotation_matrix_from_zxy(rot) 29 | geo = o3d.geometry.OrientedBoundingBox(center, rot_mat, scale) 30 | 31 | if color is not None: 32 | geo.color = [x / 255.0 for x in color] # O3D uses 0-1 range 33 | 34 | return geo 35 | 36 | def _inside_box(box, point): 37 | """Check if any points are inside the box. 38 | 39 | Args: 40 | box (open3d.geometry.OrientedBoundingBox): Oriented Box. 41 | point (np.ndarray): N points represented by nx3 array (x, y, z). 42 | 43 | Returns: 44 | bool: True if any point is inside, False otherwise. 45 | """ 46 | # Reference logic uses nx4, check if conversion needed 47 | if point.shape[1] == 4: 48 | point = point[:, :3] 49 | point_vec = o3d.utility.Vector3dVector(point) 50 | inside_idx = box.get_point_indices_within_bounding_box(point_vec) 51 | return len(inside_idx) > 0 52 | 53 | # Replaced with logic from visualization/img_drawer.py:draw_box3d 54 | def draw_box3d_on_image(image, box, color, label, extrinsic, intrinsic): 55 | """Draw 3D boxes on the image, exactly matching img_drawer.py logic. 56 | 57 | Args: 58 | image (np.ndarray): The image to draw on. 59 | box (open3d.geometry.OrientedBoundingBox): Box to be drawn. 60 | color (tuple): Box color. 61 | label (str): Box category label. 62 | extrinsic (np.ndarray): 4x4 extrinsic matrix (axis_align @ cam2global). 
63 | intrinsic (np.ndarray): 4x4 camera intrinsic matrix. 64 | 65 | Returns: 66 | np.ndarray: Image with the box drawn. 67 | """ 68 | EPS = 1e-4 # Epsilon from img_drawer 69 | ALPHA = 0.75 # Alpha from img_drawer (was 0.6) 70 | 71 | extrinsic_w2c = np.linalg.inv(extrinsic) 72 | h, w = image.shape[:2] 73 | x, y = np.meshgrid(np.arange(w), np.arange(h)) 74 | x, y = x.flatten(), y.flatten() 75 | pixel_points = np.vstack((x, y)).T 76 | 77 | # Fix 1: Use transpose() as in original code 78 | camera_pos_in_world = (extrinsic @ np.array([0, 0, 0, 1]).reshape(4, 1)).transpose() 79 | if _inside_box(box, camera_pos_in_world): 80 | return image 81 | 82 | corners = np.asarray(box.get_box_points()) 83 | corners = corners[[0, 1, 7, 2, 3, 6, 4, 5]] # Specific corner order from img_drawer 84 | corners = np.concatenate([corners, np.ones((corners.shape[0], 1))], axis=1) 85 | 86 | # Same projection as img_drawer: intrinsic @ extrinsic_w2c @ corners.transpose() 87 | corners_img = intrinsic @ extrinsic_w2c @ corners.transpose() 88 | corners_img = corners_img.transpose() 89 | 90 | corners_pixel = np.zeros((corners_img.shape[0], 2)) 91 | 92 | # Fix 2: Use np.abs() in division exactly as in img_drawer 93 | for i in range(corners_img.shape[0]): 94 | corners_pixel[i] = corners_img[i][:2] / np.abs(corners_img[i][2]) 95 | 96 | lines = [[0, 1], [1, 2], [2, 3], [3, 0], [4, 5], [5, 6], [6, 7], 97 | [7, 4], [0, 4], [1, 5], [2, 6], [3, 7]] 98 | faces = [[0, 1, 2, 3], [4, 5, 6, 7], [0, 1, 5, 4], [3, 2, 6, 7], 99 | [0, 3, 7, 4], [1, 2, 6, 5]] 100 | 101 | image_with_box = image.copy() 102 | 103 | # Fix 3: Use exact depth check from img_drawer for lines 104 | for line in lines: 105 | # This is the exact check from img_drawer 106 | if (corners_img[line][:, 2] < EPS).any(): 107 | continue 108 | px = corners_pixel[line[0]].astype(np.int32) 109 | py = corners_pixel[line[1]].astype(np.int32) 110 | cv2.line(image_with_box, (px[0], px[1]), (py[0], py[1]), color, 2) 111 | 112 | # Fix 4: Use exact mask/face handling from img_drawer 113 | all_mask = np.zeros((h, w), dtype=bool) 114 | for face in faces: 115 | # This is the exact check from img_drawer 116 | if (corners_img[face][:, 2] < EPS).any(): 117 | continue 118 | pts = corners_pixel[face] 119 | p = mpath.Path(pts[:, :2]) 120 | mask = p.contains_points(pixel_points).reshape((h, w)) 121 | all_mask = np.logical_or(all_mask, mask) 122 | 123 | # Apply color blend - exact formula from img_drawer 124 | image_with_box[all_mask] = image_with_box[all_mask] * ALPHA + (1 - ALPHA) * np.array(color) 125 | 126 | # Draw text label if any faces were visible 127 | if all_mask.any(): 128 | textpos = np.min(corners_pixel, axis=0).astype(np.int32) 129 | textpos[0] = np.clip(textpos[0], a_min=0, a_max=w) 130 | textpos[1] = np.clip(textpos[1], a_min=0, a_max=h) 131 | 132 | # Simple text drawing to mimic self.draw_text from img_drawer 133 | font = cv2.FONT_HERSHEY_SIMPLEX 134 | font_scale = 0.6 135 | thickness = 1 136 | text_size, _ = cv2.getTextSize(label, font, font_scale, thickness) 137 | text_w, text_h = text_size 138 | 139 | # Draw background box and text 140 | cv2.rectangle(image_with_box, 141 | (textpos[0], textpos[1]), 142 | (textpos[0] + text_w, textpos[1] + text_h), 143 | color, -1) 144 | cv2.putText(image_with_box, label, 145 | (textpos[0], textpos[1] + text_h), 146 | font, font_scale, (255, 255, 255), thickness) 147 | 148 | return image_with_box 149 | 150 | # --- Main Visualization Logic --- 151 | 152 | def visualize_single_image(image_path, annotation_data): 153 | """Loads image and draws 3D 
boxes based on annotation data.""" 154 | image = cv2.imread(image_path) 155 | if image is None: 156 | print(f"Error: Could not load image {image_path}") 157 | return 158 | 159 | # Extract camera info 160 | cam_ann = annotation_data.get("camera_annotations") 161 | if not cam_ann: 162 | print("Error: 'camera_annotations' not found in JSON.") 163 | return 164 | try: 165 | # extrinsic is Camera -> World 166 | extrinsic = np.array(cam_ann['extrinsic']) 167 | # Intrinsic matrix 168 | intrinsic = np.array(cam_ann['intrinsic']) 169 | 170 | except KeyError as e: 171 | print(f"Error: Missing camera parameter key: {e}") 172 | return 173 | # Removed LinAlgError check here as inversion happens in drawing function now 174 | except Exception as e: 175 | print(f"Error processing camera parameters: {e}") 176 | return 177 | 178 | 179 | # Extract object grounding info 180 | object_grounding = annotation_data.get("objects", []) 181 | if not object_grounding: 182 | print("Warning: 'objects' array is missing or empty.") 183 | # Display original image if no objects 184 | cv2.imshow("Image with 3D Boxes", image) 185 | cv2.waitKey(0) 186 | cv2.destroyAllWindows() 187 | return 188 | 189 | display_image = image.copy() 190 | # Ensure matplotlib is imported for colormap and path 191 | try: 192 | import matplotlib.pyplot as plt 193 | # Ensure mpath is imported here as it's needed by draw_box3d 194 | import matplotlib.path as mpath 195 | colors = plt.colormaps['tab10'] # Get distinct colors - Updated API 196 | except ImportError: 197 | print("Error: Matplotlib required for colormap and face drawing. Please install.") 198 | # Fallback to manual colors if matplotlib fails 199 | colors = lambda i: [(255,0,0), (0,255,0), (0,0,255)][i % 3] 200 | 201 | 202 | for i, obj_data in enumerate(object_grounding): 203 | obj_name = obj_data.get("Name", f"Object_{i+1}") 204 | bbox_3d_list = obj_data.get("bbox_3d") 205 | 206 | if bbox_3d_list: 207 | # Assuming the first bbox in the list is the one to draw 208 | bbox_9dof = bbox_3d_list[0] 209 | 210 | # Get color 211 | if callable(colors): # Check if it's a colormap function or fallback list 212 | color_float = colors(i)[:3] # Get RGB, discard alpha 213 | color_uint8 = tuple(int(c * 255) for c in color_float) 214 | else: # Fallback list 215 | color_uint8 = colors[i % len(colors)] # Use modulo for safety 216 | 217 | 218 | try: 219 | # Box center/extent/rotation are in Aligned World space from JSON 220 | o3d_box = _9dof_to_box(bbox_9dof, color=color_uint8) 221 | # Pass the combined extrinsic and intrinsic to the drawing function 222 | display_image = draw_box3d_on_image( 223 | display_image, 224 | o3d_box, 225 | color_uint8, 226 | obj_name, 227 | extrinsic, # Combined matrix (axis_align @ cam2global) 228 | intrinsic # Camera intrinsics (K) 229 | ) 230 | except Exception as e: 231 | print(f"Error processing/drawing box for '{obj_name}': {e}") 232 | import traceback 233 | traceback.print_exc() # More detailed error for debugging 234 | else: 235 | print(f"Warning: No 'bbox_3d' found for object '{obj_name}'.") 236 | 237 | # Display the result 238 | cv2.imshow("Image with 3D Boxes", display_image) 239 | print("Press any key to close the window.") 240 | cv2.waitKey(0) 241 | cv2.destroyAllWindows() 242 | 243 | # --- Entry Point --- 244 | 245 | if __name__ == "__main__": 246 | parser = argparse.ArgumentParser(description="Visualize 3D bounding boxes from an annotation file on an image.") 247 | parser.add_argument('--image_path', type=str, required=True, 248 | help='Direct path to the image file.') 
249 | parser.add_argument('--annotation_file', type=str, required=True, 250 | help='Path to the JSON annotation file.') 251 | args = parser.parse_args() 252 | 253 | # Load annotation data 254 | try: 255 | with open(args.annotation_file, 'r') as f: 256 | annotation_data = json.load(f) 257 | except FileNotFoundError: 258 | print(f"Error: Annotation file not found at {args.annotation_file}") 259 | exit(1) 260 | except json.JSONDecodeError: 261 | print(f"Error: Could not parse JSON from {args.annotation_file}") 262 | exit(1) 263 | except Exception as e: 264 | print(f"An unexpected error occurred loading annotations: {e}") 265 | exit(1) 266 | 267 | # Use the provided image path directly 268 | image_path = args.image_path 269 | 270 | # Check if the provided image file exists before proceeding 271 | if not os.path.isfile(image_path): 272 | print(f"Error: Image file not found at the provided path: {image_path}") 273 | exit(1) # Exit if the primary path doesn't exist 274 | 275 | # Import matplotlib here to avoid making it a hard dependency if not needed 276 | # although draw_box3d currently needs mpath 277 | try: 278 | import matplotlib.pyplot as plt 279 | # Make sure mpath is imported within the try block as well 280 | import matplotlib.path as mpath 281 | except ImportError: 282 | print("Error: Matplotlib is required by the drawing function. Please install it (`pip install matplotlib`).") 283 | exit(1) 284 | 285 | 286 | visualize_single_image(image_path, annotation_data) 287 | print("Visualization finished.") --------------------------------------------------------------------------------
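A minimal sketch of driving the visualizer programmatically rather than through
the --image_path / --annotation_file flags parsed above (the file paths and the
import path are placeholders, not part of the original script):

    import json
    from visualize_input import visualize_single_image  # assumes scripts/ is on sys.path

    with open("path/to/annotations.json", "r") as f:     # placeholder path
        annotation_data = json.load(f)
    visualize_single_image("path/to/image.jpg", annotation_data)   # placeholder path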