├── .gitignore ├── LICENSE ├── README.md ├── demos ├── README.md ├── build_tag_map.ipynb ├── download_demo_data.sh └── localization.ipynb ├── evaluation ├── README.md ├── config │ ├── evaluation │ │ ├── matterport_objects.yaml │ │ └── matterport_regions.yaml │ ├── lattice_graph_creation │ │ └── matterport.yaml │ └── tag_map_creation │ │ ├── matterport_ram.yaml │ │ └── matterport_ram_plus.yaml ├── notebooks │ ├── helpers.py │ └── visualize_eval_output_matterport.ipynb └── scripts │ ├── evaluate_localization_matterport.py │ ├── generate_lattice_graph_matterport.py │ ├── generate_tag_maps_matterport.py │ └── visualize_lattice_graph_matterport.py ├── tag_mapping ├── requirements.txt ├── setup.py └── tag_mapping │ ├── __init__.py │ ├── datasets │ └── matterport │ │ ├── __init__.py │ │ ├── category_index_mapping.yaml │ │ ├── category_mapping.py │ │ ├── evaluate_matterport_scan_object_localizations.py │ │ ├── evaluate_matterport_scan_region_localizations.py │ │ ├── file_utils.py │ │ ├── generate_tag_map_from_matterport_scan.py │ │ ├── matterport_object_bounding_box.py │ │ ├── matterport_region_bounding_box.py │ │ ├── mp_region_ram_tags_mapping.py │ │ └── mpcat40_ram_tags_mapping.py │ ├── evaluation │ ├── __init__.py │ ├── lattice_graph_utils.py │ └── lattice_navigation_graph.py │ ├── filtering │ ├── __init__.py │ ├── image_filters.py │ └── inference_filters.py │ ├── localization │ ├── __init__.py │ ├── clustering.py │ ├── pipeline.py │ ├── viewpoint.py │ └── voxel_voting.py │ ├── models │ ├── __init__.py │ ├── image_tagger.py │ ├── ram_plus_tagger.py │ └── ram_tagger.py │ ├── pose_graph.py │ ├── tag_map.py │ └── utils │ ├── __init__.py │ ├── collision_check.py │ ├── get_box_corners.py │ ├── line_mesh.py │ ├── load_yaml_params.py │ └── nearest_points_in_box.py └── thirdparty └── recognize-anything ├── LICENSE ├── MANIFEST.in ├── README.md ├── batch_inference.py ├── datasets ├── openimages_common_214 │ ├── imgs │ │ └── .gitkeep │ ├── openimages_common_214_ram_annots.txt │ ├── openimages_common_214_ram_taglist.txt │ ├── openimages_common_214_tag2text_idannots.txt │ └── openimages_common_214_tag2text_tagidlist.txt └── openimages_rare_200 │ ├── imgs │ └── .gitkeep │ ├── openimages_rare_200_ram_annots.txt │ └── openimages_rare_200_ram_taglist.txt ├── images ├── 1641173_2291260800.jpg ├── demo │ ├── demo1.jpg │ ├── demo2.jpg │ ├── demo3.jpg │ └── demo4.jpg ├── experiment_comparison.png ├── localization_and_recognition.jpg ├── openset_example.jpg ├── ram_grounded_sam.jpg ├── tag2text_framework.png ├── tag2text_grounded_sam.jpg └── tagging_results.jpg ├── inference_ram.py ├── inference_ram_combined.py ├── inference_ram_openset.py ├── inference_tag2text.py ├── ram ├── __init__.py ├── configs │ ├── med_config.json │ ├── q2l_config.json │ └── swin │ │ ├── config_swinB_384.json │ │ └── config_swinL_384.json ├── data │ ├── ram_tag_list.txt │ ├── ram_tag_list_chinese.txt │ ├── ram_tag_list_threshold.txt │ └── tag_list.txt ├── inference.py ├── models │ ├── __init__.py │ ├── bert.py │ ├── ram.py │ ├── ram_plus.py │ ├── swin_transformer.py │ ├── tag2text.py │ ├── utils.py │ └── vit.py ├── transform.py └── utils │ ├── __init__.py │ ├── metrics.py │ └── openset_utils.py ├── recognize_anything_demo.ipynb ├── requirements.txt ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | 
build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Pytorch checkpoints 163 | *.pth 164 | 165 | demo_data/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tag Map: A Text-Based Map for Spatial Reasoning and Navigation with Large Language Models
2 | 
3 | [Mike Zhang](https://mikez.xyz), [Kaixian Qu](https://www.linkedin.com/in/kaixian-qu-66a86215a), [Vaishakh Patil](https://www.linkedin.com/in/vaishakhpatil), [Cesar Cadena](https://n.ethz.ch/~cesarc), [Marco Hutter](https://rsl.ethz.ch/the-lab/people/person-detail.MTIxOTEx.TGlzdC8yNDQxLC0xNDI1MTk1NzM1.html)
4 | 
5 | 
6 | [[Project Page](https://tag-mapping.github.io/)] [[Paper](https://arxiv.org/abs/2409.15451)]
7 | 
8 | 
9 | ![overview](https://tag-mapping.github.io/media/images/method_overview.svg)
10 | 
11 | 
12 | ### Abstract
13 | Large Language Models (LLM) have emerged as a tool for robots to generate task plans using common sense reasoning. For the LLM to generate actionable plans, scene context must be provided, often through a map. Recent works have shifted from explicit maps with fixed semantic classes to implicit open vocabulary maps based on queryable embeddings capable of representing any semantic class. However, embeddings cannot directly report the scene context as they are implicit, requiring further processing for LLM integration. To address this, we propose an explicit text-based map that can represent thousands of semantic classes while easily integrating with LLMs due to their text-based nature by building upon large-scale image recognition models. We study how entities in our map can be localized and show through evaluations that our text-based map localizations perform comparably to those from open vocabulary maps while using two to four orders of magnitude less memory. Real-robot experiments demonstrate the grounding of an LLM with the text-based map to solve user tasks.
14 | 
15 | 
16 | ---
17 | ## Installation
18 | 
19 | Create a virtual environment.
20 | ```
21 | virtualenv -p python3.8 <path_to_venv>
22 | source <path_to_venv>/bin/activate
23 | pip install --upgrade pip
24 | ```
25 | 
26 | Install torch.
27 | ```
28 | pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
29 | ```
30 | 
31 | Install the image tagging model. Currently, this repo only supports the [Recognize Anything](https://github.com/xinyu1205/recognize-anything) set of image tagging models.
32 | ```
33 | pip install -r thirdparty/recognize-anything/requirements.txt
34 | pip install -e thirdparty/recognize-anything/.
35 | ```
36 | 
37 | Download the image tagging model checkpoints.
38 | ```
39 | # Recognize Anything Model (RAM)
40 | wget -P <checkpoint_dir> https://huggingface.co/spaces/xinyu1205/recognize-anything/resolve/main/ram_swin_large_14m.pth
41 | 
42 | # Recognize Anything Plus Model (RAM++)
43 | wget -P <checkpoint_dir> https://huggingface.co/xinyu1205/recognize-anything-plus-model/resolve/main/ram_plus_swin_large_14m.pth
44 | ```
45 | 
46 | 
47 | Install the `tag_mapping` package.
48 | ```
49 | pip install -r tag_mapping/requirements.txt
50 | pip install -e tag_mapping/.
51 | ```
52 | 
53 | 
54 | ---
55 | ## Demos
56 | 
57 | Notebooks demonstrating the Tag Map construction and localization pipelines can be found in the `demos` folder.
58 | 
59 | ---
60 | ## Evaluation
61 | The `evaluation` folder contains instructions and scripts for evaluating the Tag Map localizations.
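
---
## Minimal usage example

The sketch below condenses the localization demo (`demos/localization.ipynb`) into a single script: load a saved Tag Map, query a tag, convert the retrieved entries into viewpoint frustums, and run the coarse localization pipeline. The `demo_data/scene.tagmap` path assumes the demo data downloaded by `demos/download_demo_data.sh`, and `"couch"` is just an example tag — pick any tag listed in `tag_map.unique_objects`. Parameter values are copied from the demo notebook; treat this as a sketch rather than a fixed API reference.

```python
from tag_mapping import TagMap
from tag_mapping.localization import tagmap_entries_to_viewpoints, localization_pipeline

# Load a previously built Tag Map and its stored camera intrinsics.
tag_map = TagMap.load("demo_data/scene.tagmap")
intrinsics = tag_map.metadata["intrinsics"]

print(sorted(tag_map.unique_objects))  # tags recognized somewhere in the scene

# Retrieve the entries (viewpoints) in which the tag was recognized.
entries = tag_map.query("couch")

# Convert entries to 3D viewpoint frustums. The near plane sits at a constant distance,
# the far plane is set from the stored 80th-percentile depth of each viewpoint.
viewpoints = tagmap_entries_to_viewpoints(
    entries=entries,
    intrinsics=intrinsics,
    near_dist_fn=lambda entry: 0.2,
    far_dist_fn=lambda entry: entry.extras["depth_percentiles"]["0.8"],
)

# Voxel voting over the frustums, followed by clustering into bounding-box proposals.
voxel_size = 0.2
localization_params = {
    "voxel_voting": {
        "viewpoint_weight": None,
        "voxel_size": voxel_size,
        "scoring_method": "normalized_votes",
    },
    "clustering": {
        "algorithm": "dbscan",
        "dbscan_kwargs": {"eps": 2 * voxel_size, "min_points": 5, "print_progress": False},
        "clustering_levels": [0.0, 0.25, 0.5, 0.75],
        "bounding_box_type": "axis_aligned",
    },
}
outputs = localization_pipeline(viewpoints, localization_params, verbose=False)

# Each proposal is a (confidence_level, bounding_box) pair.
for level, box in outputs["level_bbxes"]:
    print(level, box)
```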
62 | 63 | 64 | --- 65 | ## Citation 66 | If you found our paper or code useful, please cite: 67 | ``` 68 | @inproceedings{zhang2024tagmap, 69 | author = {Zhang, Mike and Qu, Kaixian and Patil, Vaishakh and Cadena, Cesar and Hutter, Marco}, 70 | title = {Tag Map: A Text-Based Map for Spatial Reasoning and Navigation with Large Language Models}, 71 | journal = {Conference on Robot Learning (CoRL)}, 72 | year = {2024}, 73 | } 74 | ``` -------------------------------------------------------------------------------- /demos/README.md: -------------------------------------------------------------------------------- 1 | # Demos 2 | 3 | Please first run the scipt to download the demo data: 4 | ``` 5 | ./download_demo_data.sh 6 | ``` 7 | 8 | There are two demo notebooks provided: 9 | - `build_tag_map.ipynb`: Goes through the Tag Map construction process for the provided demo scene. 10 | - `localization.ipynb`: Runs the coarse localization pipeline over a Tag Map of the demo scene. -------------------------------------------------------------------------------- /demos/download_demo_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | wget https://huggingface.co/datasets/frozendonuts/tag-mapping/resolve/main/demo_data.zip 4 | echo "Unzipping demo data" 5 | unzip -q demo_data.zip 6 | rm demo_data.zip 7 | echo "Done downloading and unzipping demo data" 8 | -------------------------------------------------------------------------------- /demos/localization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "0fa25531", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "\n", 12 | "import numpy as np\n", 13 | "import open3d as o3d\n", 14 | "\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "from matplotlib import cm\n", 17 | "\n", 18 | "import ipywidgets as widgets\n", 19 | "from IPython.display import display" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "ea59c01a", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from tag_mapping.datasets.matterport import (\n", 30 | " read_matterport_image_file,\n", 31 | " read_matterport_depth_file,\n", 32 | " MatterportFilenameBridge\n", 33 | ")\n", 34 | "\n", 35 | "from tag_mapping import TagMap\n", 36 | "\n", 37 | "from tag_mapping.localization import tagmap_entries_to_viewpoints, localization_pipeline\n", 38 | "\n", 39 | "from tag_mapping.utils import box_to_linemesh" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "6f1b59f7", 45 | "metadata": {}, 46 | "source": [ 47 | "## Load scene data\n", 48 | "Please first download the demo data by running `download_demo_data.sh`." 
49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "15091f3d", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "scene_dir = 'demo_data'" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "2e42bcc5", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "tag_map = TagMap.load(f'{scene_dir}/scene.tagmap')\n", 69 | "intrinsics = tag_map.metadata[\"intrinsics\"]" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "54726254", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "images_dir = os.path.join(scene_dir, 'color')\n", 80 | "depths_dir = os.path.join(scene_dir, 'depth')\n", 81 | "poses_dir = os.path.join(scene_dir, 'poses')\n", 82 | "mesh_path = os.path.join(scene_dir, 'mesh.ply')" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "c3ba44ef", 88 | "metadata": {}, 89 | "source": [ 90 | "Load and visualize the mesh" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "48e91139", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "scene_mesh = o3d.io.read_triangle_mesh(mesh_path)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "3c5ce219", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "o3d.visualization.draw_geometries([scene_mesh])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "3f0609f7", 116 | "metadata": {}, 117 | "source": [ 118 | "## Localize a selected tag\n", 119 | "\n", 120 | "Select a tag recognized in the scene to localize" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "f27dc6c5", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "options = sorted(list(tag_map.unique_objects))\n", 131 | "query_dropdown = widgets.Dropdown(options=options, description='Select an tag:')\n", 132 | "display(query_dropdown)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "0822b061-1fa0-4710-8f72-0aa895711867", 138 | "metadata": {}, 139 | "source": [ 140 | "Retrieve corresponding viewpoints for the selected tag.\n", 141 | "\n", 142 | "__Rerun this block after changing the selection__" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "858838ca", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "query_entries = tag_map.query(query_dropdown.value)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "8f7d4677-406e-4010-83db-c52f85489fbb", 158 | "metadata": {}, 159 | "source": [ 160 | "Show the images for a few of the viewpoints corresponding to the tag" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "c7d7d528", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "max_show = 6\n", 171 | "num_show = min(len(query_entries), max_show)\n", 172 | "\n", 173 | "fig, axes = plt.subplots(1, num_show, figsize=(3*num_show, 6))\n", 174 | "\n", 175 | "for i in range(num_show):\n", 176 | " entry = query_entries[i]\n", 177 | " image_filename = entry.extras['image_filename']\n", 178 | " conf = entry.extras['confidence']\n", 179 | " \n", 180 | " image = read_matterport_image_file(\n", 181 | " os.path.join(images_dir, image_filename))\n", 182 | " \n", 183 | " try:\n", 184 | " ax = axes[i]\n", 185 | " except TypeError:\n", 186 | " ax = axes\n", 187 | " \n", 188 | " ax.imshow(image)\n", 189 | " 
ax.set_xticks([])\n", 190 | " ax.set_yticks([])\n", 191 | " ax.set_title(f'confidence: {conf:.2f}')\n", 192 | " ax.set_aspect(1)\n", 193 | "\n", 194 | "plt.show()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "id": "b5d9c2da", 200 | "metadata": {}, 201 | "source": [ 202 | "## Compute coarse-grained localizations in 3D for the selected tag\n", 203 | "For each viewpoint corresponding to the selected tag, we first get their frustums in 3D." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "id": "c79bc958", 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "viewpoints = tagmap_entries_to_viewpoints(\n", 214 | " entries=query_entries,\n", 215 | " intrinsics=intrinsics,\n", 216 | "\n", 217 | " # set the near plane of the viewpoint frustum to a constant distance away\n", 218 | " near_dist_fn=lambda x: 0.2,\n", 219 | " \n", 220 | " # the far plane of the viewpoint frustum is set as the 80th percentile depth value\n", 221 | " # of each viewpoint\n", 222 | " far_dist_fn=lambda entry: entry.extras['depth_percentiles']['0.8'],\n", 223 | ")" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "13dffe4c-cd21-4efb-bd51-829709ae22cd", 229 | "metadata": {}, 230 | "source": [ 231 | "Visualize the retrieved viewpoint frustums" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "id": "5423e3f7", 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "o3d.visualization.draw_geometries([scene_mesh] + [vp.o3d_lineset(color=np.random.rand(3)) for vp in viewpoints])" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "id": "fad8af37-b98c-4edc-b1ff-1fd53f74f8ba", 247 | "metadata": {}, 248 | "source": [ 249 | "### Localization pipeline\n", 250 | "The localization pipeline takes as input the frustums of the retrieved viewpoints and performs a voting procedure over voxels in the scene to generate localized regions for the selected tag.\n", 251 | "\n", 252 | "The final output is a set of proposed localizations for the tag, represented as bounding boxes, along with the confidence level (min number of votes) for each bounding box." 
253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "id": "e139603a", 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "voxel_size = 0.2\n", 263 | "\n", 264 | "localization_params = {\n", 265 | " 'voxel_voting': {\n", 266 | " 'viewpoint_weight': None, # [None, 'confidence']\n", 267 | " 'voxel_size': voxel_size,\n", 268 | " 'scoring_method': 'normalized_votes', # ['normalized_votes', 'votes']\n", 269 | " },\n", 270 | " \n", 271 | " 'clustering': {\n", 272 | " 'algorithm': 'dbscan', # ['dbscan', 'hdbscan']\n", 273 | " 'dbscan_kwargs': {\n", 274 | " 'eps': 2 * voxel_size,\n", 275 | " 'min_points': 5,\n", 276 | " 'print_progress': False,\n", 277 | " },\n", 278 | " \n", 279 | " 'clustering_levels': [0.0, 0.25, 0.5, 0.75], # only used if 'scoring_method' == 'normalized_votes'\n", 280 | " 'bounding_box_type': 'axis_aligned', # ['axis_aligned', 'oriented']\n", 281 | " },\n", 282 | "}" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "acd5d34f", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "loc_outputs = localization_pipeline(viewpoints, localization_params, verbose=False)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "7f4645df-28f4-40a7-8ed5-8f3047ae9bd8", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "voxel_center_points = loc_outputs[\"voxel_center_points\"]\n", 303 | "voxel_scores = loc_outputs[\"voxel_scores\"]\n", 304 | "level_bbxes = loc_outputs[\"level_bbxes\"]" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "id": "8ee7d6b3", 310 | "metadata": {}, 311 | "source": [ 312 | "## Visualize localizations" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "id": "18f71776-d212-4ed6-ae6a-bdfc879a54e0", 318 | "metadata": {}, 319 | "source": [ 320 | "Visualize the voxel voting results. Voxel points are colored by their corresponding number of votes." 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "id": "a5c4629f", 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "voxel_center_points_color = cm.viridis(voxel_scores / voxel_scores.max())[:, :3]\n", 331 | "\n", 332 | "voxel_pcd = o3d.geometry.PointCloud()\n", 333 | "voxel_pcd.points = o3d.utility.Vector3dVector(voxel_center_points)\n", 334 | "voxel_pcd.colors = o3d.utility.Vector3dVector(voxel_center_points_color)\n", 335 | "\n", 336 | "o3d.visualization.draw_geometries([scene_mesh, voxel_pcd])" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "id": "41a003d2", 342 | "metadata": {}, 343 | "source": [ 344 | "Visualize proposed localization bounding boxes. Bounding boxes are colored by their confidence levels corresponding to the minimum number of votes for voxels within the bounding box." 
345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "id": "0a833f66", 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "confidences = [l for l, _ in level_bbxes]\n", 355 | "boxes = [b for _, b in level_bbxes]\n", 356 | "max_conf = np.max(confidences)\n", 357 | "\n", 358 | "boxes_linemeshes = []\n", 359 | "for conf, box in zip(confidences, boxes):\n", 360 | " color = cm.viridis(conf / max_conf)[:3]\n", 361 | " \n", 362 | " boxes_linemeshes += box_to_linemesh(\n", 363 | " box, \n", 364 | " color=color, \n", 365 | " radius=0.02\n", 366 | " ).cylinder_segments\n", 367 | " \n", 368 | "o3d.visualization.draw_geometries([scene_mesh] + boxes_linemeshes)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "5c95556e", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": "Python 3 (ipykernel)", 383 | "language": "python", 384 | "name": "python3" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 3 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython3", 396 | "version": "3.8.10" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 5 401 | } 402 | -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | These instructions outline the pipeline for evaluating the Tag Map localizations against the coarse-localization metrics P2E and E2P as described in the paper. 3 | 4 | Currently the evaluation is only supported for the Matterport3D (MP3D) dataset which can be downloaded following the instructions [here](https://niessner.github.io/Matterport/). 5 | 6 | 7 | ## Setup 8 | The evaluation assumes that the MP3D data folder has the following structure: 9 | ``` 10 | 11 | ├── 12 | │ ├── undistorted_color_images 13 | │ ├── undistorted_depth_images 14 | │ ├── matterport_camera_poses 15 | │ ├── matterport_camera_intrinsics 16 | │ ├── house_segmentations 17 | | └── ... 18 | ├── 19 | │ ├── undistorted_color_images 20 | │ ├── undistorted_depth_images 21 | │ ├── matterport_camera_poses 22 | │ ├── matterport_camera_intrinsics 23 | │ ├── house_segmentations 24 | | └── ... 25 | └── ... 26 | ``` 27 | 28 | 29 | ## 1. Generate Tag Maps for all scenes 30 | The Tag Maps for all MP3D scenes can be generated using: 31 | 32 | ``` 33 | python scripts/generate_tag_maps_matterport.py \ 34 | --params config/tag_map_creation//matterport_ram.yaml \ 35 | --output_dir \ 36 | --matterport_dir 37 | ``` 38 | 39 | Alternatively, pre-generated Tag Maps can be downloaded [here](https://huggingface.co/datasets/frozendonuts/tag-mapping/resolve/main/mp3d_tag_maps.zip). Please read and agree to the [MP3D EULA](https://kaldir.vc.in.tum.de/matterport/MP_TOS.pdf) before downloading. 40 | 41 | 42 | 43 | ## 2. Generate scene lattice graphs 44 | Computing the coarse-localization metrics P2E and E2P requires computing the shortest paths between points in the scene. The shortest path computation is approximated using a lattice graph which spans the scene's free space while avoiding collisions with the scene geometry. Shortests paths are then computed and stored for each pair of nodes in the lattice graph. 
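
Once a lattice graph has been generated (see the command below), it can be loaded and queried directly. The following is a small sketch based on the `LatticeNavigationGraph` interface used by `notebooks/helpers.py` and the visualization script; the file path and node indices are placeholders.

```python
import numpy as np

from tag_mapping.evaluation import LatticeNavigationGraph

# Placeholder path: the generation script saves graphs as <scan_name>_lattice_graph.pkl.
lattice_graph = LatticeNavigationGraph.load("path/to/scan_lattice_graph.pkl")

# xyz positions of the lattice nodes spanning the scene's free space.
nodes_xyz = np.asarray(lattice_graph.nodes_xyz)
print(f"lattice graph has {len(nodes_xyz)} nodes")

# Precomputed shortest path between two lattice nodes (example indices).
# shortest_path_length returns None if the two nodes are not connected.
a_ind, b_ind = 0, 100
length = lattice_graph.shortest_path_length(a_ind, b_ind)
if length is not None:
    path_node_inds = lattice_graph.shortest_path(a_ind, b_ind)
    print(f"shortest path length {length:.2f} through {len(path_node_inds)} nodes")
```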
45 | 
46 | The lattice graphs and precomputed shortest paths for all MP3D scenes can be generated using:
47 | ```
48 | python scripts/generate_lattice_graph_matterport.py \
49 |     --params_path config/lattice_graph_creation/matterport.yaml \
50 |     --output_dir <output_dir> \
51 |     --matterport_dir <matterport_dir>
52 | ```
53 | 
54 | Alternatively, pre-generated lattice graphs can be downloaded [here](https://huggingface.co/datasets/frozendonuts/tag-mapping/resolve/main/mp3d_lattice_graphs.zip) (61 GB). Please read and agree to the [MP3D EULA](https://kaldir.vc.in.tum.de/matterport/MP_TOS.pdf) before downloading.
55 | 
56 | Lattice graphs can be visualized using the included script:
57 | ```
58 | python scripts/visualize_lattice_graph_matterport.py \
59 |     --lattice_graph_path <path_to_lattice_graph> \
60 |     --matterport_dir <matterport_dir>
61 | ```
62 | 
63 | 
64 | ## 3. Run the evaluation
65 | The evaluation is run with the following command:
66 | ```
67 | python scripts/evaluate_localization_matterport.py \
68 |     --params_path <params_file> \
69 |     --tag_maps_dir <tag_maps_dir> \
70 |     --lattice_graphs_dir <lattice_graphs_dir> \
71 |     --output_dir <output_dir> \
72 |     --matterport_dir <matterport_dir>
73 | ```
74 | 
75 | Evaluations are run separately for the labeled objects and the labeled regions/locations, depending on the params file. For the object and region evaluations, use the param files `config/evaluation/matterport_objects.yaml` and `config/evaluation/matterport_regions.yaml`, respectively.
76 | 
77 | For each scene, the evaluation outputs are saved as a pickled Python dictionary.
78 | 
79 | 
80 | ## 4. Visualizing evaluation results
81 | The evaluation saves an output file for every scene in the dataset. A notebook for visualizing the evaluation outputs for a scene can be found under the `notebooks` folder.
82 | 
--------------------------------------------------------------------------------
/evaluation/config/evaluation/matterport_objects.yaml:
--------------------------------------------------------------------------------
1 | label_params:
2 |   type: "object"  # ["object", "region"]
3 | 
4 |   blacklisted_labels: [
5 |     "misc", "objects", "void", "unlabeled",
6 |     "wall", "floor", "ceiling",
7 |   ]
8 | 
9 | viewpoint_kwargs:
10 |   far_dist_fn: !python/lambda "lambda entry: entry.extras['depth_percentiles']['0.8']"
11 |   near_dist_fn: null
12 | 
13 | localization_kwargs:
14 |   params:
15 |     voxel_voting:
16 |       viewpoint_weight: null  # [null, 'confidence']
17 |       voxel_size: 0.2
18 |       scoring_method: "normalized_votes"  # ['normalized_votes', 'votes']
19 | 
20 |     clustering:
21 |       algorithm: "dbscan"  # ['dbscan', 'hdbscan']
22 |       dbscan_kwargs:
23 |         eps: 0.4  # 2 * voxel_size
24 |         min_points: 5
25 |         print_progress: false
26 |       clustering_levels: [0.0, 0.25, 0.5, 0.75,]  # only used if 'scoring_method' == 'normalized_votes'
27 |       bounding_box_type: "axis_aligned"  # ['axis_aligned', 'oriented']
--------------------------------------------------------------------------------
/evaluation/config/evaluation/matterport_regions.yaml:
--------------------------------------------------------------------------------
1 | label_params:
2 |   type: "region"  # ["object", "region"]
3 | 
4 |   blacklisted_labels: [
5 |     "other room", "junk", "no label",
6 | 
7 |     # no appropriate tag
8 |     "dining booth",
9 |     "entryway/foyer/lobby",
10 |     "outdoor",
11 |   ]
12 | 
13 | viewpoint_kwargs:
14 |   far_dist_fn: !python/lambda "lambda entry: entry.extras['depth_percentiles']['0.8']"
15 |   near_dist_fn: null
16 | 
17 | localization_kwargs:
18 |   params:
19 |     voxel_voting:
20 |       viewpoint_weight: null  # [null, 'confidence']
21 |       voxel_size: 0.2
22 |       scoring_method: "normalized_votes" #
['normalized_votes', 'votes'] 23 | 24 | clustering: 25 | algorithm: "dbscan" # ['dbscan', 'hdbscan'] 26 | dbscan_kwargs: 27 | eps: 0.4 # 2 * voxel_size 28 | min_points: 5 29 | print_progress: false 30 | clustering_levels: [0.0, 0.25, 0.5, 0.75,] # only used if 'scoring_method' == 'normalized_votes' 31 | bounding_box_type: "axis_aligned" # ['axis_aligned', 'oriented'] -------------------------------------------------------------------------------- /evaluation/config/lattice_graph_creation/matterport.yaml: -------------------------------------------------------------------------------- 1 | lattice_graph_creation_params: 2 | lattice_grid_kwargs: 3 | grid_res: 0.5 4 | outer_pad: 0.1 5 | 6 | lattice_filter_kwargs: 7 | distance_threshold: 2.0 8 | within_mesh_threshold: 0.0 9 | kdtree_query_k: 10 10 | kdtree_query_num_workers: 8 11 | -------------------------------------------------------------------------------- /evaluation/config/tag_map_creation/matterport_ram.yaml: -------------------------------------------------------------------------------- 1 | model_params: 2 | model: ram 3 | model_config: 4 | ram_pretrained_path: ram_swin_large_14m.pth # set this to the path of the downloaded model checkpoint 5 | ram_image_size: 384 6 | vit: swin_l 7 | device: cuda 8 | 9 | tag_map_generation_params: 10 | matterport_viewpoint_near_dist: 0.2 11 | 12 | filtered_tagging_params: 13 | crop_border_proportions: [0.05, 0.1] 14 | 15 | depth_filtering_params: 16 | mean_threshold: 0.6 17 | quantile_thresholds: [ 18 | [0.5, 0.6], # [percentile, threshold] 19 | ] 20 | 21 | stored_depth_percentiles: [0.8] -------------------------------------------------------------------------------- /evaluation/config/tag_map_creation/matterport_ram_plus.yaml: -------------------------------------------------------------------------------- 1 | model_params: 2 | model: ram_plus 3 | model_config: 4 | ram_pretrained_path: ram_plus_swin_large_14m.pth # set this to the path of the downloaded model checkpoint 5 | ram_image_size: 384 6 | vit: swin_l 7 | device: cuda 8 | 9 | tag_map_generation_params: 10 | matterport_viewpoint_near_dist: 0.2 11 | 12 | filtered_tagging_params: 13 | crop_border_proportions: [0.05, 0.1] 14 | 15 | depth_filtering_params: 16 | mean_threshold: 0.6 17 | quantile_thresholds: [ 18 | [0.5, 0.6], # [percentile, threshold] 19 | ] 20 | 21 | stored_depth_percentiles: [0.8] -------------------------------------------------------------------------------- /evaluation/notebooks/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from typing import List 4 | 5 | from tag_mapping.evaluation import LatticeNavigationGraph 6 | from tag_mapping.utils import LineMesh 7 | 8 | 9 | def generate_lattice_graph_shortest_path_linemeshes( 10 | lattice_graph: LatticeNavigationGraph, node_inds_a: List, node_inds_b: List 11 | ): 12 | shortest_path_linemeshes = [] 13 | for a_ind in node_inds_a: 14 | spl = np.inf 15 | matched_l_ind = None 16 | for b_ind in node_inds_b: 17 | 18 | new_spl = lattice_graph.shortest_path_length(a_ind, b_ind) 19 | if new_spl == None: 20 | continue 21 | 22 | if new_spl < spl: 23 | spl = new_spl 24 | matched_l_ind = b_ind 25 | 26 | if matched_l_ind != None: 27 | sp_inds = lattice_graph.shortest_path(a_ind, matched_l_ind) 28 | 29 | sp_lines = np.zeros((len(sp_inds) - 1, 2)).astype(np.int32) 30 | sp_lines[:, 0] = np.arange(len(sp_inds) - 1) 31 | sp_lines[:, 1] = 1 + np.arange(len(sp_inds) - 1) 32 | 33 | sp_linemesh = LineMesh( 34 | 
points=lattice_graph.nodes_xyz[sp_inds], 35 | lines=sp_lines, 36 | colors=(0, 1, 1), 37 | radius=0.01, 38 | ) 39 | 40 | shortest_path_linemeshes += sp_linemesh.cylinder_segments 41 | 42 | return shortest_path_linemeshes 43 | -------------------------------------------------------------------------------- /evaluation/scripts/evaluate_localization_matterport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | from datetime import datetime 5 | 6 | from tag_mapping.datasets.matterport.evaluate_matterport_scan_object_localizations import ( 7 | evaluate_matterport_scan_object_localizations, 8 | ) 9 | from tag_mapping.datasets.matterport.evaluate_matterport_scan_region_localizations import ( 10 | evaluate_matterport_scan_region_localizations, 11 | ) 12 | from tag_mapping.utils import load_yaml_params 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser( 17 | description="Generate tag maps from Matterport scans" 18 | ) 19 | parser.add_argument("--params_path", type=str, help="Path to params file") 20 | parser.add_argument("--tag_maps_dir", type=str, help="Path to tag map file") 21 | parser.add_argument( 22 | "--lattice_graphs_dir", type=str, help="Path to lattice graph file" 23 | ) 24 | parser.add_argument("--output_dir", type=str, help="Path to output directory") 25 | parser.add_argument( 26 | "--output_name", type=str, help="Name of evaluation output directory" 27 | ) 28 | parser.add_argument( 29 | "--matterport_dir", type=str, help="Path to directory of matterport scans" 30 | ) 31 | parser.add_argument( 32 | "--scans", 33 | nargs="+", 34 | help="Scans to generate tag maps for. If not specified, all scans will in matterport_dir will be processed.", 35 | ) 36 | args = parser.parse_args() 37 | 38 | # Setup logger 39 | logger = logging.getLogger(__name__) 40 | logger.addHandler(logging.StreamHandler()) 41 | logger.setLevel(logging.INFO) 42 | 43 | # Read params 44 | params = load_yaml_params(args.params_path) 45 | 46 | label_type = params["label_params"]["type"] 47 | if label_type == "object": 48 | evaluate_matterport_scan_localization = ( 49 | evaluate_matterport_scan_object_localizations 50 | ) 51 | elif label_type == "region": 52 | evaluate_matterport_scan_localization = ( 53 | evaluate_matterport_scan_region_localizations 54 | ) 55 | else: 56 | raise ValueError(f"Invalid label type {params['label_params']['type']}") 57 | 58 | # Create output save directory 59 | output_name = ( 60 | f"matterport_{label_type}_evaluation" if args.output_name == None else args.output_name 61 | ) 62 | output_save_dir = os.path.join( 63 | args.output_dir, 64 | f"{output_name}-{datetime.now().strftime('%Y-%m-%d_%H-%M')}", 65 | ) 66 | os.makedirs(output_save_dir, exist_ok=True) 67 | logger.info( 68 | f"created matterport evaluation outputs save directory {output_save_dir}" 69 | ) 70 | 71 | # Copy param file to output save dir 72 | os.system(f"cp {args.params_path} {output_save_dir}/_evaluation_params.yaml") 73 | 74 | scan_names = ( 75 | args.scans if args.scans != None else sorted(os.listdir(args.matterport_dir)) 76 | ) 77 | for scan_name in scan_names: 78 | logger.info(f"\n\nrunning evaluation on scan {scan_name}") 79 | 80 | scan_dir = os.path.join(args.matterport_dir, f"{scan_name}") 81 | if not os.path.isdir(scan_dir): 82 | logger.warning(f"skipping due to non-existing scan directory {scan_dir}") 83 | continue 84 | 85 | tag_map_path = os.path.join(args.tag_maps_dir, f"{scan_name}.tagmap") 86 | if not 
os.path.isfile(tag_map_path): 87 | logger.warning(f"skipping due to non-existing tag map {tag_map_path}") 88 | continue 89 | 90 | lattice_graph_path = os.path.join( 91 | args.lattice_graphs_dir, f"{scan_name}_lattice_graph.pkl" 92 | ) 93 | if not os.path.isfile(lattice_graph_path): 94 | logger.warning( 95 | f"skipping due to non-existing lattice graph {lattice_graph_path}" 96 | ) 97 | continue 98 | 99 | try: 100 | evaluate_matterport_scan_localization( 101 | params=params, 102 | scan_dir=os.path.abspath(scan_dir), 103 | tag_map_path=os.path.abspath(tag_map_path), 104 | lattice_graph_path=os.path.abspath(lattice_graph_path), 105 | output_dir=output_save_dir, 106 | logger=logger, 107 | ) 108 | except Exception as e: 109 | logger.error(f"failed to generate tag map for scan {scan_name}") 110 | logger.error(e) 111 | continue 112 | -------------------------------------------------------------------------------- /evaluation/scripts/generate_lattice_graph_matterport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | from datetime import datetime 5 | 6 | import open3d as o3d 7 | 8 | from tag_mapping.evaluation import create_lattice_navigation_graph 9 | from tag_mapping.utils import load_yaml_params 10 | 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser( 14 | description="Generate lattice navigation graphs from Matterport scans" 15 | ) 16 | parser.add_argument("--params_path", type=str, help="Path to params file") 17 | parser.add_argument("--output_dir", type=str, help="Path to output directory") 18 | parser.add_argument( 19 | "--output_name", type=str, help="Name of evaluation output directory" 20 | ) 21 | parser.add_argument( 22 | "--matterport_dir", type=str, help="Path to directory of matterport scans" 23 | ) 24 | parser.add_argument( 25 | "--scans", 26 | nargs="+", 27 | help="Scans to generate lattice navigation graphs for. 
If not specified, all scans will in matterport_dir will be processed.", 28 | ) 29 | args = parser.parse_args() 30 | 31 | # Setup logger 32 | logger = logging.getLogger(__name__) 33 | logger.addHandler(logging.StreamHandler()) 34 | logger.setLevel(logging.INFO) 35 | 36 | # Read params 37 | params = load_yaml_params(args.params_path) 38 | 39 | # Create output save directory 40 | output_name = ( 41 | "matterport_lattice_graphs" if args.output_name == None else args.output_name 42 | ) 43 | output_save_dir = os.path.join( 44 | args.output_dir, 45 | f"{output_name}-{datetime.now().strftime('%Y-%m-%d_%H-%M')}", 46 | ) 47 | os.makedirs(output_save_dir, exist_ok=True) 48 | logger.info( 49 | f"created matterport lattice graphs output save directory {output_save_dir}" 50 | ) 51 | 52 | # Copy param file to output save dir 53 | os.system(f"cp {args.params_path} {output_save_dir}/_gen_params.yaml") 54 | 55 | # Generate lattice navigation graph for each scan 56 | scan_names = ( 57 | args.scans if args.scans != None else sorted(os.listdir(args.matterport_dir)) 58 | ) 59 | for scan_name in scan_names: 60 | logger.info(f"\n\ncreating lattice graph for scan {scan_name}") 61 | scan_dir = os.path.join(args.matterport_dir, f"{scan_name}") 62 | ply_file_path = os.path.join( 63 | scan_dir, "house_segmentations", f"{scan_name}.ply" 64 | ) 65 | mesh = o3d.io.read_triangle_mesh(ply_file_path) 66 | 67 | try: 68 | lattice_graph = create_lattice_navigation_graph( 69 | mesh, 70 | params=params["lattice_graph_creation_params"], 71 | print_progress=True, 72 | ) 73 | except Exception as e: 74 | logger.error(f"failed to generate tag map for scan {scan_name}") 75 | logger.error(e) 76 | continue 77 | 78 | save_path = os.path.join(output_save_dir, f"{scan_name}_lattice_graph.pkl") 79 | lattice_graph.save(save_path) 80 | logger.info(f"saved lattice graph to {save_path}") 81 | -------------------------------------------------------------------------------- /evaluation/scripts/generate_tag_maps_matterport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | from datetime import datetime 5 | 6 | from tag_mapping.models import RAMTagger, RAMPlusTagger 7 | 8 | from tag_mapping.datasets.matterport.generate_tag_map_from_matterport_scan import ( 9 | generate_tag_map_from_matterport_scan, 10 | ) 11 | from tag_mapping.utils import load_yaml_params 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser( 16 | description="Generate tag maps from Matterport scans" 17 | ) 18 | parser.add_argument("--params_path", type=str, help="Path to params file") 19 | parser.add_argument("--output_dir", type=str, help="Path to output directory") 20 | parser.add_argument( 21 | "--output_name", type=str, help="Name of evaluation output directory" 22 | ) 23 | parser.add_argument( 24 | "--matterport_dir", type=str, help="Path to directory of matterport scans" 25 | ) 26 | parser.add_argument( 27 | "--scans", 28 | nargs="+", 29 | help="Scans to generate tag maps for. 
If not specified, all scans will in matterport_dir will be processed.", 30 | ) 31 | args = parser.parse_args() 32 | 33 | # Setup logger 34 | logger = logging.getLogger(__name__) 35 | logger.addHandler(logging.StreamHandler()) 36 | logger.setLevel(logging.INFO) 37 | 38 | # Read params 39 | params = load_yaml_params(args.params_path) 40 | model_params = params["model_params"] 41 | tag_map_generation_params = params["tag_map_generation_params"] 42 | 43 | # Create output save directory 44 | output_name = ( 45 | "matterport_tag_maps" if args.output_name == None else args.output_name 46 | ) 47 | output_save_dir = os.path.join( 48 | args.output_dir, 49 | f"{output_name}-{datetime.now().strftime('%Y-%m-%d_%H-%M')}", 50 | ) 51 | os.makedirs(output_save_dir, exist_ok=True) 52 | logger.info(f"created matterport tag maps output save directory {output_save_dir}") 53 | 54 | # Load tagging model 55 | if model_params["model"] == "ram": 56 | tagging_model = RAMTagger( 57 | config=model_params["model_config"], 58 | ) 59 | elif model_params["model"] == "ram_plus": 60 | tagging_model = RAMPlusTagger( 61 | config=model_params["model_config"], 62 | ) 63 | else: 64 | raise ValueError(f"Unsupported model type {model_params['model']}") 65 | 66 | # Copy param file to output save dir 67 | os.system(f"cp {args.params_path} {output_save_dir}/_gen_params.yaml") 68 | 69 | # Generate tag maps for each scan 70 | scan_names = ( 71 | args.scans if args.scans != None else sorted(os.listdir(args.matterport_dir)) 72 | ) 73 | for scan_name in scan_names: 74 | scan_dir = os.path.join(args.matterport_dir, f"{scan_name}") 75 | 76 | if not os.path.isdir(scan_dir): 77 | logger.warning(f"skipping non-existing scan directory {scan_dir}") 78 | continue 79 | 80 | try: 81 | generate_tag_map_from_matterport_scan( 82 | params=tag_map_generation_params, 83 | tagging_model=tagging_model, 84 | scan_dir=scan_dir, 85 | output_dir=output_save_dir, 86 | logger=logger, 87 | ) 88 | except Exception as e: 89 | logger.error(f"failed to generate tag map for scan {scan_name}") 90 | logger.error(e) 91 | continue 92 | -------------------------------------------------------------------------------- /evaluation/scripts/visualize_lattice_graph_matterport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import open3d as o3d 5 | 6 | from tag_mapping.evaluation import LatticeNavigationGraph 7 | 8 | 9 | def load_and_visualize_lattice_graph( 10 | lattice_graph_path, 11 | matterport_scans_dir, 12 | ): 13 | scan_name = os.path.basename(lattice_graph_path).split('_')[0] 14 | ply_path = os.path.join( 15 | matterport_scans_dir, f"{scan_name}/house_segmentations/{scan_name}.ply" 16 | ) 17 | mesh = o3d.io.read_triangle_mesh(ply_path) 18 | lattice_graph = LatticeNavigationGraph.load(lattice_graph_path) 19 | o3d.visualization.draw_geometries( 20 | [ 21 | mesh, 22 | lattice_graph.o3d_nodes_pointcloud, 23 | lattice_graph.o3d_edges_lineset, 24 | ] 25 | ) 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument( 31 | "--lattice_graph_path", 32 | type=str, 33 | required=True, 34 | help="Path to the lattice graph file", 35 | ) 36 | parser.add_argument( 37 | "--matterport_dir", type=str, help="Path to directory of matterport scans" 38 | ) 39 | args = parser.parse_args() 40 | 41 | load_and_visualize_lattice_graph( 42 | args.lattice_graph_path, 43 | args.matterport_dir, 44 | ) 45 | 
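# Example invocation, run from the `evaluation` directory (paths are placeholders):
#   python scripts/visualize_lattice_graph_matterport.py \
#       --lattice_graph_path <lattice_graphs_dir>/<scan_name>_lattice_graph.pkl \
#       --matterport_dir <matterport_dir>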
-------------------------------------------------------------------------------- /tag_mapping/requirements.txt: -------------------------------------------------------------------------------- 1 | cvxpy==1.5.2 2 | hdbscan 3 | jupyterlab 4 | networkx 5 | open3d==0.17.0 6 | opencv-python 7 | pandas 8 | plyfile 9 | recognize-anything 10 | sentence-transformers -------------------------------------------------------------------------------- /tag_mapping/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='tag_mapping', 5 | version='0.1.0', 6 | packages=find_packages(), 7 | install_requires=[ 8 | 'recognize-anything', 9 | ], 10 | description='Package for building spatial maps from image tags', 11 | ) -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/__init__.py: -------------------------------------------------------------------------------- 1 | from .tag_map import TagMap, TagMapEntry 2 | 3 | from .pose_graph import PoseGraph 4 | 5 | import os 6 | 7 | TAG_MAPPING_ROOT_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 8 | """Absolute path to the tag mapping root dir.""" 9 | 10 | TAG_MAPPING_CONFIG_DIR = os.path.join(TAG_MAPPING_ROOT_DIR, 'config') -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/__init__.py: -------------------------------------------------------------------------------- 1 | from .file_utils import * 2 | 3 | from .mpcat40_ram_tags_mapping import MPCAT40_RAM_TAGS_MAPPING 4 | from .mp_region_ram_tags_mapping import MP_REGION_RAM_TAGS_MAPPING 5 | 6 | from .matterport_object_bounding_box import MatterportObjectBoundingBox 7 | from .matterport_region_bounding_box import MatterportRegionBoundingBox 8 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/category_mapping.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import yaml 6 | 7 | 8 | CATEGORY_MAPPING_LINK = "https://raw.githubusercontent.com/niessner/Matterport/master/metadata/category_mapping.tsv" 9 | CATEGORY_INDEX_MAPPING_PATH = ( 10 | Path(__file__).parent.absolute().joinpath("category_index_mapping.yaml") 11 | ) 12 | 13 | 14 | def get_category_index_mapping(): 15 | df = pd.read_csv(CATEGORY_MAPPING_LINK, sep="\t") 16 | df.replace(np.nan, "", inplace=True) # replace empty cells with empty string 17 | 18 | category_index_mapping = {} 19 | for _, row in df.iterrows(): 20 | index = ( 21 | row["index"] - 1 22 | ) # index actually starts from 0, but in the .tsv file it starts from 1 23 | 24 | mappings = {} 25 | for key, value in row.items(): 26 | if key == "index": 27 | continue 28 | mappings[key] = value 29 | 30 | category_index_mapping[index] = mappings 31 | 32 | with open(CATEGORY_INDEX_MAPPING_PATH, "w") as f: 33 | yaml.dump(category_index_mapping, f) 34 | 35 | 36 | def load_category_index_mapping(): 37 | with open(CATEGORY_INDEX_MAPPING_PATH, "r") as f: 38 | category_index_mapping = yaml.load(f, Loader=yaml.FullLoader) 39 | return category_index_mapping 40 | 41 | 42 | if __name__ == "__main__": 43 | get_category_index_mapping() 44 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/file_utils.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | from typing import Tuple, List 4 | 5 | 6 | def read_matterport_image_file(image_filepath) -> Image.Image: 7 | image = Image.open(image_filepath) 8 | return image 9 | 10 | 11 | def read_matterport_depth_file(depth_filepath) -> Tuple[np.ndarray, Image.Image]: 12 | depth_image = Image.open(depth_filepath) 13 | SCALE_FACTOR = 4000 # https://github.com/vsislab/Matterport3D-Layout/issues/4 14 | depth = np.asarray(depth_image) / SCALE_FACTOR 15 | return depth, depth_image 16 | 17 | 18 | def read_matterport_pose_file(pose_filepath) -> np.ndarray: 19 | # NOTE: returns the pose from camera frame to the world frame 20 | with open(pose_filepath, "r") as file: 21 | lines = file.readlines() 22 | T_cam_to_world = np.array( 23 | [list(map(float, line.split(" ")[:-1])) for line in lines] 24 | ) 25 | return T_cam_to_world 26 | 27 | 28 | def read_matterport_intrinsics_file(intrinsics_filepath): 29 | with open(intrinsics_filepath, "r") as file: 30 | lines = file.readlines() 31 | line = lines[0] 32 | intrinsics = line.split(" ") 33 | 34 | width = int(intrinsics[0]) 35 | height = int(intrinsics[1]) 36 | fx = float(intrinsics[2]) 37 | fy = float(intrinsics[3]) 38 | cx = float(intrinsics[4]) 39 | cy = float(intrinsics[5]) 40 | d = [float(i) for i in intrinsics[6:]] 41 | 42 | return width, height, fx, fy, cx, cy, d 43 | 44 | 45 | class MatterportFilenameBridge: 46 | def __init__(self, frame_identifiers): 47 | self._frame_identifiers = frame_identifiers 48 | 49 | @classmethod 50 | def from_image_filename(cls, image_filename): 51 | frame_identifiers = image_filename.split(".")[0].split("_") 52 | frame_identifiers[1] = frame_identifiers[1][1:] 53 | return cls(frame_identifiers) 54 | 55 | @classmethod 56 | def from_pose_filename(cls, pose_filename): 57 | frame_identifiers = pose_filename.split(".")[0].split("_") 58 | frame_identifiers.remove("pose") 59 | return cls(frame_identifiers) 60 | 61 | @property 62 | def image_filename(self): 63 | return ( 64 | self._frame_identifiers[0] 65 | + "_i" 66 | + self._frame_identifiers[1] 67 | + "_" 68 | + self._frame_identifiers[2] 69 | + ".jpg" 70 | ) 71 | 72 | @property 73 | def depth_filename(self): 74 | return ( 75 | self._frame_identifiers[0] 76 | + "_d" 77 | + self._frame_identifiers[1] 78 | + "_" 79 | + self._frame_identifiers[2] 80 | + ".png" 81 | ) 82 | 83 | @property 84 | def pose_filename(self): 85 | return ( 86 | self._frame_identifiers[0] 87 | + "_pose_" 88 | + self._frame_identifiers[1] 89 | + "_" 90 | + self._frame_identifiers[2] 91 | + ".txt" 92 | ) 93 | 94 | 95 | import re 96 | from .matterport_object_bounding_box import MatterportObjectBoundingBox 97 | 98 | from .category_mapping import load_category_index_mapping 99 | 100 | MATTERPORT_CATEGORY_INDEX_MAPPING = load_category_index_mapping() 101 | 102 | 103 | def read_matterport_object_bounding_boxes( 104 | house_filepath, 105 | category_taxonomies=("category", "mpcat40"), 106 | ): 107 | """ 108 | Reads the object bounding box labels in a .house file. 109 | 110 | Args: 111 | house_filepath: A string that specifies the path to the .house file. 112 | category_taxonomies: A tuple of strings that specifies the taxonomies to use for 113 | categorizing the objects. 114 | 115 | Returns: 116 | A dictionary that maps each taxonomy to a dictionary mapping that taxonomy's labels 117 | to a list of the corresponding bounding boxes. 
118 | """ 119 | for t in category_taxonomies: 120 | assert t in MATTERPORT_CATEGORY_INDEX_MAPPING[0], "Invalid taxonomy {}".format( 121 | t 122 | ) 123 | 124 | with open(house_filepath) as house_file: 125 | lines = house_file.readlines() 126 | 127 | boxes = [] 128 | for line in lines: 129 | if line[0] == "O": 130 | data = re.split(r" +", line) 131 | 132 | boxes.append( 133 | MatterportObjectBoundingBox( 134 | category_index=int(data[3]), 135 | center=np.array(data[4:7], dtype=float), 136 | a1=np.array(data[7:10], dtype=float), 137 | a2=np.array(data[10:13], dtype=float), 138 | r=np.array(data[13:16], dtype=float), 139 | ) 140 | ) 141 | 142 | out = {t: {} for t in category_taxonomies} 143 | for box in boxes: 144 | for t in category_taxonomies: 145 | try: 146 | t_label = MATTERPORT_CATEGORY_INDEX_MAPPING[box.category_index][t] 147 | if t_label not in out[t]: 148 | out[t][t_label] = [box] 149 | else: 150 | out[t][t_label].append(box) 151 | except KeyError: 152 | print( 153 | "[warning]: bounding box with invalid category_index {}".format( 154 | box.category_index 155 | ) 156 | ) 157 | 158 | return out 159 | 160 | 161 | from plyfile import PlyData 162 | 163 | 164 | def read_matterport_labeled_points( 165 | mesh_ply_filepath, 166 | category_taxonomies=("category", "mpcat40"), 167 | ): 168 | """ 169 | Gets labeled points from the .ply house segmentation file. 170 | Since the faces of the mesh are labeled, we compute the points as the mean of the vertices of the faces. 171 | 172 | Args: 173 | house_filepath: A string that specifies the path to the .house file. 174 | category_taxonomies: A tuple of strings that specifies the taxonomies to use for 175 | categorizing the objects. 176 | 177 | Returns: 178 | First returns a dictionary that maps each taxonomy to a dictionary mapping that taxonomy's labels 179 | to a list of the corresponding points. Second, returns the points as a numpy array. 
180 | """ 181 | for t in category_taxonomies: 182 | assert t in MATTERPORT_CATEGORY_INDEX_MAPPING[0], "Invalid taxonomy {}".format( 183 | t 184 | ) 185 | 186 | plydata = PlyData.read(mesh_ply_filepath) 187 | vertex_xyz = np.zeros((len(plydata["vertex"]), 3)) 188 | vertex_xyz[:, 0] = plydata["vertex"]["x"] 189 | vertex_xyz[:, 1] = plydata["vertex"]["y"] 190 | vertex_xyz[:, 2] = plydata["vertex"]["z"] 191 | 192 | face_vertex_inds = np.vstack( # NOTE: this computation is slow 193 | plydata["face"]["vertex_indices"] 194 | ) 195 | face_center_xyz = np.mean(vertex_xyz[face_vertex_inds], axis=1) 196 | face_category_ids = plydata["face"]["category_id"] 197 | 198 | unique_category_ids = np.unique(face_category_ids) 199 | 200 | out = {t: {} for t in category_taxonomies} 201 | for category_id in unique_category_ids: 202 | for t in category_taxonomies: 203 | try: 204 | # NOTE: use category_id - 1 because for some reason the ply file 205 | # has category ids that are 1-indexed 206 | t_label = MATTERPORT_CATEGORY_INDEX_MAPPING[category_id - 1][t] 207 | face_inds = np.where(face_category_ids == category_id)[0] 208 | 209 | if t_label not in out[t]: 210 | out[t][t_label] = face_inds 211 | else: 212 | out[t][t_label] = np.concatenate((out[t][t_label], face_inds)) 213 | except KeyError: 214 | print("[warning]: face with invalid category_id {}".format(category_id)) 215 | 216 | return out, face_center_xyz 217 | 218 | 219 | # https://github.com/niessner/Matterport/blob/master/data_organization.md 220 | # fmt: off 221 | MATTERPORT_REGION_NAME_MAPPING = { 222 | "a": "bathroom", # (should have a toilet and a sink) 223 | "b": "bedroom", 224 | "c": "closet", 225 | "d": "dining room", # (includes “breakfast rooms” other rooms people mainly eat in) 226 | "e": "entryway/foyer/lobby", # (should be the front door, not any door) 227 | 228 | # "f": "familyroom", # (should be a room that a family hangs out in, not any area with couches) 229 | "f": "living room", 230 | 231 | "g": "garage", 232 | "h": "hallway", 233 | "i": "library", # (should be room like a library at a university, not an individual study) 234 | "j": "laundryroom/mudroom", # (place where people do laundry, etc.) 235 | "k": "kitchen", 236 | "l": "living room", # (should be the main “showcase” living room in a house, not any area with couches) 237 | "m": "meetingroom/conferenceroom", 238 | 239 | # "n": "lounge", # (any area where people relax in comfy chairs/couches that is not the family room or living room 240 | "n": "living room", 241 | 242 | "o": "office", # (usually for an individual, or a small set of people) 243 | "p": "porch/terrace/deck/driveway", # (must be outdoors on ground level) 244 | "r": "rec/game", # (should have recreational objects, like pool table, etc.) 245 | "s": "stairs", 246 | 247 | # "t": "toilet", # (should be a small room with ONLY a toilet) 248 | "t": "bathroom", 249 | 250 | "u": "utilityroom/toolroom", 251 | "v": "tv", # (must have theater-style seating) 252 | "w": "workout/gym/exercise", 253 | "x": "outdoor", # areas containing grass, plants, bushes, trees, etc. 254 | "y": "balcony", # (must be outside and must not be on ground floor) 255 | "B": "bar", 256 | "C": "classroom", 257 | "D": "dining booth", 258 | "S": "spa/sauna", 259 | 260 | "z": "other room", # (it is clearly a room, but the function is not clear) 261 | "Z": "junk", # (reflections of mirrors, random points floating in space, etc.) 
262 | "-": "no label", 263 | } 264 | # fmt: on 265 | 266 | from .matterport_region_bounding_box import MatterportRegionBoundingBox 267 | 268 | 269 | def read_matterport_region_bounding_boxes( 270 | house_filepath, 271 | ): 272 | """ 273 | Reads the region bounding boxes in a .house file. 274 | 275 | Args: 276 | house_filepath: A string that specifies the path to the .house file. 277 | 278 | Returns: 279 | A dictionary that maps each region label to a list of the corresponding bounding boxes. 280 | """ 281 | with open(house_filepath) as house_file: 282 | lines = house_file.readlines() 283 | 284 | out = {} 285 | for line in lines: 286 | if line[0] == "R": 287 | data = re.split(r" +", line) 288 | 289 | min_bound = np.array(data[9:12], dtype=float) 290 | max_bound = np.array(data[12:15], dtype=float) 291 | if np.any(min_bound == max_bound): 292 | PAD = 1.0 293 | print( 294 | f"warning: region label box with zero volumn, padding each bound by {PAD}" 295 | ) 296 | min_bound = min_bound - PAD 297 | max_bound = max_bound + PAD 298 | 299 | box = MatterportRegionBoundingBox( 300 | label=MATTERPORT_REGION_NAME_MAPPING[data[5]], 301 | min_bound=min_bound, 302 | max_bound=max_bound, 303 | ) 304 | 305 | if box.label not in out: 306 | out[box.label] = [box] 307 | else: 308 | out[box.label].append(box) 309 | return out 310 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/generate_tag_map_from_matterport_scan.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import numpy as np 4 | from tqdm import tqdm 5 | from typing import Dict, Union, Optional 6 | 7 | import uuid 8 | from tag_mapping import TagMap, TagMapEntry 9 | from tag_mapping.models import ImageTagger 10 | from tag_mapping.filtering import valid_depth_frame 11 | from tag_mapping.datasets.matterport import ( 12 | read_matterport_image_file, 13 | read_matterport_depth_file, 14 | read_matterport_pose_file, 15 | read_matterport_intrinsics_file, 16 | MatterportFilenameBridge, 17 | ) 18 | 19 | 20 | def generate_tag_map_from_matterport_scan( 21 | params: Dict, 22 | tagging_model: ImageTagger, 23 | scan_dir: Union[str, os.PathLike], 24 | output_dir: Union[str, os.PathLike], 25 | logger: Optional[logging.Logger] = None, 26 | ) -> None: 27 | """ 28 | Generate a tag map from a matterport scan. 29 | 30 | Args: 31 | params: Dictionary of parameters for tag map generation. 32 | tagging_model: The image tagging model which defines the method filtered_tag_image(). 33 | scan_dir: Path to the matterport scan directory. 34 | output_dir: Path of the directory to save the tag map. 35 | logger: Logger to use, if None a logger will be created at debug level. 
36 | """ 37 | if logger is None: 38 | logger = logging.getLogger(__name__) 39 | logger.addHandler(logging.StreamHandler()) 40 | logger.setLevel(logging.DEBUG) 41 | 42 | scan_name = os.path.basename(scan_dir) 43 | logger.info(f"generating tag map from matterport scan {scan_name}") 44 | 45 | images_dir = os.path.join(scan_dir, "undistorted_color_images") 46 | depths_dir = os.path.join(scan_dir, "undistorted_depth_images") 47 | poses_dir = os.path.join(scan_dir, "matterport_camera_poses") 48 | intrinsics_dir = os.path.join(scan_dir, "matterport_camera_intrinsics") 49 | logger.info( 50 | f"reading images, depth images, and poses from: {images_dir}\n{depths_dir}\n{poses_dir}" 51 | ) 52 | 53 | # Averge intrinsics across all frames to get an estimate for the scan 54 | intrinsics = [] 55 | for filename in os.listdir(intrinsics_dir): 56 | intrinsics_filepath = os.path.join(intrinsics_dir, filename) 57 | width, height, fx, fy, cx, cy, d = read_matterport_intrinsics_file( 58 | intrinsics_filepath 59 | ) 60 | intrinsics.append([width, height, fx, fy]) 61 | intrinsics = np.array(intrinsics) 62 | intrinsics = np.mean(intrinsics, axis=0) 63 | width, height, fx, fy = intrinsics 64 | logger.info( 65 | f"mean intrinsics over the scan: {width:.0f} {height:.0f} {fx:.2f} {fy:.2f}" 66 | ) 67 | 68 | # Pack tag map metadata 69 | tag_map_metadata = { 70 | "scan_name": scan_name, 71 | "intrinsics": { 72 | "width": width, 73 | "height": height, 74 | "fx": fx, 75 | "fy": fy, 76 | "near_dist": params["matterport_viewpoint_near_dist"], 77 | }, 78 | "tagging_model": tagging_model.__class__.__name__, 79 | } 80 | 81 | # Start tag map generation 82 | logger.info("starting tag map generation") 83 | tag_map = TagMap(metadata=tag_map_metadata) 84 | 85 | skipped_frames = [] 86 | 87 | for image_filename in tqdm(os.listdir(images_dir)): 88 | filename_bridge = MatterportFilenameBridge.from_image_filename(image_filename) 89 | depth_filename = filename_bridge.depth_filename 90 | pose_filename = filename_bridge.pose_filename 91 | 92 | image = read_matterport_image_file(os.path.join(images_dir, image_filename)) 93 | depth, depth_image = read_matterport_depth_file( 94 | os.path.join(depths_dir, depth_filename) 95 | ) 96 | T_cam_to_world = read_matterport_pose_file( 97 | os.path.join(poses_dir, pose_filename) 98 | ) 99 | 100 | # skip frames with invalid depth values 101 | if not valid_depth_frame(depth, **params["depth_filtering_params"]): 102 | skipped_frames.append((image_filename, depth_filename)) 103 | continue 104 | 105 | # compute the tags and their confidences 106 | tags, confidences = tagging_model.filtered_tag_image( 107 | image, params=params["filtered_tagging_params"] 108 | ) 109 | 110 | # information to store about the depth frame 111 | depth_percentiles = { 112 | str(q): dq 113 | for q, dq in zip( 114 | params["stored_depth_percentiles"], 115 | np.quantile(depth, params["stored_depth_percentiles"]), 116 | ) 117 | } 118 | 119 | # pack data to store within a TagMapEntry and add it to the tag map 120 | entry_uuid = uuid.uuid4() 121 | entry = TagMapEntry( 122 | pose=T_cam_to_world, 123 | uuid=entry_uuid, 124 | extras={ 125 | "depth_percentiles": depth_percentiles, 126 | }, 127 | ) 128 | tag_map.add_entry(entry) 129 | 130 | # add associated tags to the database 131 | for tag, conf in zip(tags, confidences): 132 | tag_map.add_tag( 133 | tag, 134 | entry_uuid, 135 | extras={}, 136 | ) 137 | 138 | logger.info( 139 | f"finished tag map generation, skipped {len(skipped_frames)} frames with invalid depth values" 140 | ) 141 | 142 | # 
Save the tag map 143 | save_path = os.path.join(output_dir, f"{scan_name}.tagmap") 144 | tag_map.save(save_path) 145 | logger.info(f"saved tag map to {save_path}") 146 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/matterport_object_bounding_box.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import numpy as np 4 | import open3d as o3d 5 | 6 | 7 | @dataclass(frozen=True) 8 | class MatterportObjectBoundingBox: 9 | category_index: int 10 | center: np.ndarray 11 | a1: np.ndarray 12 | a2: np.ndarray 13 | r: np.ndarray 14 | 15 | def corners(self): 16 | """ 17 | Corners ordered following convention defined in Pytorch3D 18 | https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/ops/iou_box3d.py 19 | """ 20 | a1 = self.a1 / np.linalg.norm(self.a1) 21 | a2 = self.a2 / np.linalg.norm(self.a2) 22 | r1, r2, r3 = self.r 23 | a3 = np.cross(self.a1, self.a2) 24 | return np.array( 25 | [ 26 | self.center - r1 * a1 - r2 * a2 - r3 * a3, 27 | self.center + r1 * a1 - r2 * a2 - r3 * a3, 28 | self.center + r1 * a1 + r2 * a2 - r3 * a3, 29 | self.center - r1 * a1 + r2 * a2 - r3 * a3, 30 | 31 | self.center - r1 * a1 - r2 * a2 + r3 * a3, 32 | self.center + r1 * a1 - r2 * a2 + r3 * a3, 33 | self.center + r1 * a1 + r2 * a2 + r3 * a3, 34 | self.center - r1 * a1 + r2 * a2 + r3 * a3, 35 | ] 36 | ) 37 | 38 | def o3d_lineset(self, color=(0, 1, 0)): 39 | vertices = self.corners().astype(np.float64) 40 | 41 | lines = np.array( 42 | [ 43 | [0, 1], 44 | [0, 3], 45 | [1, 2], 46 | [2, 3], 47 | [0, 4], 48 | [1, 5], 49 | [2, 6], 50 | [3, 7], 51 | [4, 5], 52 | [4, 7], 53 | [5, 6], 54 | [6, 7], 55 | ] 56 | ).astype(np.int32) 57 | 58 | colors = np.tile(color, (12, 1)).astype(np.float64) 59 | 60 | lineset = o3d.geometry.LineSet() 61 | lineset.points = o3d.utility.Vector3dVector(vertices) 62 | lineset.lines = o3d.utility.Vector2iVector(lines) 63 | lineset.colors = o3d.utility.Vector3dVector(colors) 64 | 65 | return lineset 66 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/matterport_region_bounding_box.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import numpy as np 4 | import open3d as o3d 5 | 6 | 7 | @dataclass(frozen=True) 8 | class MatterportRegionBoundingBox: 9 | label: str 10 | min_bound: np.array 11 | max_bound: np.array 12 | 13 | def corners(self): 14 | """ 15 | Corners ordered following convention defined in Pytorch3D 16 | https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/ops/iou_box3d.py 17 | """ 18 | return np.array( 19 | [ 20 | self.min_bound, 21 | [self.max_bound[0], self.min_bound[1], self.min_bound[2]], 22 | [self.max_bound[0], self.max_bound[1], self.min_bound[2]], 23 | [self.min_bound[0], self.max_bound[1], self.min_bound[2]], 24 | [self.min_bound[0], self.min_bound[1], self.max_bound[2]], 25 | [self.max_bound[0], self.min_bound[1], self.max_bound[2]], 26 | self.max_bound, 27 | [self.min_bound[0], self.max_bound[1], self.max_bound[2]], 28 | ] 29 | ) 30 | 31 | def o3d_lineset(self, color=(0, 1, 0)): 32 | vertices = self.corners().astype(np.float64) 33 | 34 | lines = np.array( 35 | [ 36 | [0, 1], 37 | [0, 3], 38 | [1, 2], 39 | [2, 3], 40 | [0, 4], 41 | [1, 5], 42 | [2, 6], 43 | [3, 7], 44 | [4, 5], 45 | [4, 7], 46 | [5, 6], 47 | [6, 7], 48 | ] 49 | ).astype(np.int32) 
50 | 51 | colors = np.tile(color, (12, 1)).astype(np.float64) 52 | 53 | lineset = o3d.geometry.LineSet() 54 | lineset.points = o3d.utility.Vector3dVector(vertices) 55 | lineset.lines = o3d.utility.Vector2iVector(lines) 56 | lineset.colors = o3d.utility.Vector3dVector(colors) 57 | 58 | return lineset 59 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/mp_region_ram_tags_mapping.py: -------------------------------------------------------------------------------- 1 | MP_REGION_RAM_TAGS_MAPPING = { 2 | "bathroom": ["bathroom"], 3 | 4 | "bedroom": ["bedroom"], 5 | 6 | "closet": ["closet"], 7 | 8 | "dining room": ["dining room"], 9 | 10 | # (should be the front door, not any door) 11 | "entryway/foyer/lobby": [], 12 | 13 | # (should be a room that a family hangs out in, not any area with couches) 14 | # NOTE: mapped to "living room" 15 | "familyroom": [], 16 | 17 | "garage": ["garage"], 18 | 19 | "hallway": ["hallway"], 20 | 21 | # (should be room like a library at a university, not an individual study) 22 | "library": ["library"], 23 | 24 | # (place where people do laundry, etc.) 25 | "laundryroom/mudroom": ["laundry room"], 26 | 27 | "kitchen": ["kitchen"], 28 | 29 | # (should be the main “showcase” living room in a house, not any area with couches) 30 | "living room": ["living room"], 31 | 32 | "meetingroom/conferenceroom": ["meeting room"], 33 | 34 | # (any area where people relax in comfy chairs/couches that is not the family room or living room 35 | # NOTE: mapped to "living room" 36 | "lounge": [], 37 | 38 | # (usually for an individual, or a small set of people) 39 | "office": ["office", "home office"], 40 | 41 | # (must be outdoors on ground level) 42 | "porch/terrace/deck/driveway": ["porch", "terrace", "deck", "driveway"], 43 | 44 | # (should have recreational objects, like pool table, etc.) 45 | "rec/game": ["recreation room"], 46 | 47 | "stairs": ["stairs", "stairwell"], 48 | 49 | # (should be a small room with ONLY a toilet) 50 | # NOTE: mapped to "bathroom" 51 | "toilet": [], 52 | 53 | "utilityroom/toolroom": ["utility room"], 54 | 55 | # (must have theater-style seating) 56 | "tv": ["cinema", "home theater", "theater"], 57 | 58 | "workout/gym/exercise": ["gym"], 59 | 60 | # areas containing grass, plants, bushes, trees, etc. 
61 | "outdoor": ["outdoor"], 62 | 63 | # (must be outside and must not be on ground floor) 64 | "balcony": ["balcony"], 65 | 66 | "bar": ["bar"], 67 | 68 | "classroom": ["classroom"], 69 | 70 | "dining booth": [], 71 | 72 | "spa/sauna": ["sauna"], 73 | 74 | # (it is clearly a room, but the function is not clear) 75 | "other room": [], 76 | 77 | "junk": [], 78 | 79 | "no label": [], 80 | } 81 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/mpcat40_ram_tags_mapping.py: -------------------------------------------------------------------------------- 1 | MPCAT40_RAM_TAGS_MAPPING = { 2 | "wall": ["glass wall", "molding", "room divider", "tile wall", "wall", "wood wall"], 3 | "floor": ["bath mat", "carpet", "doormat", "floor", "floor mat", "landing", "mat"], 4 | "chair": [ 5 | "armchair", 6 | "beach chair", 7 | "bean bag chair", 8 | "beanbag", 9 | "chair", 10 | "computer chair", 11 | "feeding chair", 12 | "folding chair", 13 | "office chair", 14 | "rocking chair", 15 | "swivel chair", 16 | "throne", 17 | ], 18 | "door": [ 19 | "archway", 20 | "barn door", 21 | "bathroom door", 22 | "door", 23 | "doorway", 24 | "elevator door", 25 | "garage door", 26 | "glass door", 27 | "screen door", 28 | "shower door", 29 | ], 30 | "table": [ 31 | "altar", 32 | "billiard table", 33 | "changing table", 34 | "cocktail table", 35 | "computer desk", 36 | "dinning table", 37 | "foosball", 38 | "glass table", 39 | "kitchen table", 40 | "office desk", 41 | "picnic table", 42 | "poker table", 43 | "round table", 44 | "side table", 45 | "stand", 46 | "table", 47 | "vanity", 48 | "workbench", 49 | "writing desk", 50 | ], 51 | "picture": [ 52 | "art", 53 | "art print", 54 | "couple photo", 55 | "decorative picture", 56 | "drawing", 57 | "family photo", 58 | "group photo", 59 | "movie poster", 60 | "oil painting", 61 | "photo", 62 | "photo frame", 63 | "picture", 64 | "picture frame", 65 | "portrait", 66 | "poster", 67 | "publicity portrait", 68 | "reflection", 69 | "wedding photo", 70 | ], 71 | "cabinet": [ 72 | "armoire", 73 | "bathroom cabinet", 74 | "cabinet", 75 | "cabinetry", 76 | "closet", 77 | "file cabinet", 78 | "kitchen cabinet", 79 | "medicine cabinet", 80 | "side cabinet", 81 | "tv cabinet", 82 | "wine cabinet", 83 | ], 84 | "cushion": ["pillow", "throw pillow"], 85 | "window": [ 86 | "bathroom window", 87 | "bedroom window", 88 | "kitchen window", 89 | "office window", 90 | "shop window", 91 | "skylight", 92 | "window", 93 | "window frame", 94 | "window screen", 95 | ], 96 | "sofa": ["couch", "loveseat"], 97 | "bed": [ 98 | "bed", 99 | "bed frame", 100 | "bunk bed", 101 | "canopy bed", 102 | "cat bed", 103 | "dog bed", 104 | "futon", 105 | "hammock", 106 | "headboard", 107 | "hospital bed", 108 | "infant bed", 109 | "mattress", 110 | ], 111 | "curtain": ["curtain", "shower curtain"], 112 | "chest_of_drawers": ["bureau", "drawer", "dresser", "nightstand"], 113 | "plant": ["bush", "flower", "grass", "houseplant", "plant", "tree"], 114 | "sink": ["basin", "bathroom sink", "sink"], 115 | "stairs": ["ladder", "stair", "stairs", "stairwell"], 116 | "ceiling": ["ceiling", "roof"], 117 | "toilet": ["bidet", "toilet bowl", "toilet seat"], 118 | "stool": ["bar stool", "footrest", "music stool", "step stool", "stool"], 119 | "towel": [ 120 | "bath towel", 121 | "beach towel", 122 | "face towel", 123 | "hand towel", 124 | "paper towel", 125 | "towel", 126 | ], 127 | "mirror": [ 128 | "bathroom mirror", 129 | "car mirror", 130 | "cosmetics 
mirror", 131 | "mirror", 132 | "rearview mirror", 133 | "view mirror", 134 | ], 135 | "tv_monitor": [ 136 | "bulletin board", 137 | "computer monitor", 138 | "computer screen", 139 | "display", 140 | "monitor", 141 | "television", 142 | "whiteboard", 143 | ], 144 | "shower": ["shower", "shower door", "shower head"], 145 | "column": ["pillar", "post"], 146 | "bathtub": ["bath", "jacuzzi"], 147 | "counter": [ 148 | "bar", 149 | "buffet", 150 | "counter", 151 | "counter top", 152 | "island", 153 | "kitchen counter", 154 | "kitchen island", 155 | "wet bar", 156 | ], 157 | "fireplace": ["fireplace", "mantle"], 158 | "lighting": [ 159 | "bedside lamp", 160 | "chandelier", 161 | "christmas light", 162 | "lamp", 163 | "lamp shade", 164 | "lantern", 165 | "light", 166 | "light fixture", 167 | "lighting", 168 | "neon light", 169 | "oil lamp", 170 | "stage light", 171 | "table lamp", 172 | "wall lamp", 173 | ], 174 | "beam": ["beam"], 175 | "railing": ["balustrade", "rail"], 176 | "shelving": ["bookshelf", "easel", "shelf", "shelve", "spice rack"], 177 | "blinds": ["blind"], 178 | "gym_equipment": [ 179 | "barbell", 180 | "dumbbell", 181 | "stationary bicycle", 182 | "training bench", 183 | "treadmill", 184 | "weight", 185 | ], 186 | "seating": ["bench", "church bench", "park bench", "seat", "window seat"], 187 | "board_panel": ["board", "panel"], 188 | "furniture": ["armoire", "closet", "furniture"], 189 | "appliances": [ 190 | "appliance", 191 | "boiler", 192 | "coffee machine", 193 | "dish washer", 194 | "fridge", 195 | "gas stove", 196 | "hand dryer", 197 | "home appliance", 198 | "humidifier", 199 | "ice maker", 200 | "juicer", 201 | "microwave", 202 | "oven", 203 | "rice cooker", 204 | "sewing machine", 205 | "stove", 206 | "toaster", 207 | "vacuum", 208 | "waffle iron", 209 | "washer", 210 | "washing machine", 211 | ], 212 | "clothes": [ 213 | "baby clothe", 214 | "baseball hat", 215 | "bathrobe", 216 | "bathroom accessory", 217 | "bikini", 218 | "bikini top", 219 | "blouse", 220 | "christmas hat", 221 | "cloak", 222 | "clothing", 223 | "coat", 224 | "cocktail dress", 225 | "corset", 226 | "costume", 227 | "cowboy hat", 228 | "crop top", 229 | "denim jacket", 230 | "dress", 231 | "dress hat", 232 | "dress shirt", 233 | "dress shoe", 234 | "dress suit", 235 | "evening dress", 236 | "fur coat", 237 | "gown", 238 | "halter top", 239 | "hat", 240 | "headdress", 241 | "headscarf", 242 | "hoodie", 243 | "jacket", 244 | "jeans", 245 | "jockey cap", 246 | "kilt", 247 | "kimono", 248 | "lab coat", 249 | "lace dress", 250 | "laundry", 251 | "leather jacket", 252 | "maxi dress", 253 | "miniskirt", 254 | "overcoat", 255 | "pants", 256 | "pantyhose", 257 | "polo neck", 258 | "polo shirt", 259 | "raincoat", 260 | "robe", 261 | "safety vest", 262 | "scarf", 263 | "shirt", 264 | "ski jacket", 265 | "sports coat", 266 | "straw hat", 267 | "sun hat", 268 | "suspenders", 269 | "sweat pant", 270 | "sweater", 271 | "sweatshirt", 272 | "t shirt", 273 | "t-shirt", 274 | "trench coat", 275 | "underclothes", 276 | "vest", 277 | "visor", 278 | "waterproof jacket", 279 | "wedding dress", 280 | "wrap dress", 281 | ], 282 | "misc": [], 283 | "objects": [], 284 | "void": [], 285 | "unlabeled": [], 286 | } 287 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .lattice_navigation_graph import LatticeNavigationGraph, create_lattice_navigation_graph 2 | from 
.lattice_graph_utils import assign_label_box_lattice_graph_nodes, assign_proposal_box_lattice_graph_nodes -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/evaluation/lattice_graph_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import open3d as o3d 3 | 4 | from tag_mapping.utils import nearest_points_in_box, o3d_check_lines_collision 5 | from .lattice_navigation_graph import LatticeNavigationGraph 6 | 7 | 8 | def assign_label_box_lattice_graph_nodes( 9 | lattice_graph: LatticeNavigationGraph, 10 | rcs: o3d.t.geometry.RaycastingScene, 11 | box_corners: np.ndarray, 12 | enable_inflation: bool = True, 13 | ogn_dist_threshold: float = 1.0, 14 | ): 15 | """ 16 | Assigns to a labeled bounding box nodes of the lattice graph. 17 | The assignment includes nodes of the following types: 18 | - Nodes within the labeled bounding box 19 | - Nodes who's shortest straight line path to the labeled bounding box 20 | is within the object goal nav distance threshold and collision free 21 | 22 | The object goal nav distance threshold is nominally defined as 1m following 23 | the Habitat challenge evaluation criteria: 24 | https://aihabitat.org/challenge/2023/ 25 | 26 | Args: 27 | lattice_graph: LatticeNavigationGraph 28 | rcs: Open3d RaycastingScene 29 | box_corners: (8,3) array of labeled bounding box corners 30 | enable_inflation: If True, inflate the box to find additional points within 31 | the object goal nav distance threshold 32 | ogn_dist_threshold: Object goal nav distance threshold. 33 | Only used if enable_inflation is True. 34 | 35 | Returns: 36 | assigned_node_inds: List of node indices assigned to the labeled bounding box. 37 | Note that this list could be empty if no nodes are assigned 38 | """ 39 | nodes_xyz = lattice_graph.nodes_xyz 40 | 41 | # Construct the convex hull (i.e. 
minimum oriented bounding box) from the box_corners 42 | o3d_box_corners = o3d.geometry.PointCloud() 43 | o3d_box_corners.points = o3d.utility.Vector3dVector(box_corners) 44 | obb = o3d_box_corners.get_minimal_oriented_bounding_box() 45 | 46 | # Get the indicies of the nodes within the initial bounding box 47 | in_box_inds = obb.get_point_indices_within_bounding_box( 48 | o3d.utility.Vector3dVector(nodes_xyz) 49 | ) 50 | 51 | # Inflate the box to find additional points within the object goal nav distance 52 | if enable_inflation: 53 | inflated_obb_extent = obb.extent.copy() 54 | inflated_obb_extent += 2 * ogn_dist_threshold 55 | 56 | inflated_obb = o3d.geometry.OrientedBoundingBox() 57 | inflated_obb.center = obb.center.copy() 58 | inflated_obb.R = obb.R.copy() 59 | inflated_obb.extent = inflated_obb_extent 60 | 61 | in_inflated_box_inds = inflated_obb.get_point_indices_within_bounding_box( 62 | o3d.utility.Vector3dVector(nodes_xyz) 63 | ) 64 | 65 | # Consider now only the nodes in the inflated box but NOT in the original box 66 | near_box_inds = [ind for ind in in_inflated_box_inds if ind not in in_box_inds] 67 | 68 | if len(near_box_inds) > 0: 69 | near_box_nodes = nodes_xyz[near_box_inds].reshape(-1, 3) 70 | proj_box_nodes = nearest_points_in_box( 71 | box_corners, obb.center, near_box_nodes 72 | ) 73 | 74 | near_box_dists = np.linalg.norm(near_box_nodes - proj_box_nodes, axis=1) 75 | within_ogn_dist = near_box_dists <= ogn_dist_threshold 76 | 77 | collision_mask = o3d_check_lines_collision( 78 | rcs, near_box_nodes, proj_box_nodes 79 | ) 80 | 81 | valid_mask = np.logical_and(~collision_mask, within_ogn_dist) 82 | near_box_inds = np.array(near_box_inds)[valid_mask].tolist() 83 | 84 | assigned_node_inds = in_box_inds + near_box_inds 85 | else: 86 | assigned_node_inds = in_box_inds 87 | 88 | return assigned_node_inds 89 | 90 | 91 | def assign_proposal_box_lattice_graph_nodes( 92 | lattice_graph: LatticeNavigationGraph, 93 | rcs: o3d.t.geometry.RaycastingScene, 94 | box_corners: np.ndarray, 95 | ): 96 | """ 97 | Assigns to a proposed bounding box nodes of the lattice graph. 98 | 99 | First we check if the box already contains nodes, if it does then 100 | we return those nodes. Otherwise we inflate the box and find nodes 101 | nearby to the box. 102 | 103 | We only assign nearby nodes which are collision free to their projected 104 | point in the box and who's projected point is within that node's voxel. 105 | 106 | Args: 107 | lattice_graph: LatticeNavigationGraph 108 | rcs: Open3d RaycastingScene 109 | box_corners: (8,3) array of labeled bounding box corners 110 | 111 | Returns: 112 | assigned_node_inds: List of node indices assigned to the labeled bounding box. 113 | Note that this list could be empty if no nodes are assigned 114 | """ 115 | nodes_xyz = lattice_graph.nodes_xyz 116 | lattice_grid_res = lattice_graph.grid_resolution 117 | 118 | # Construct the convex hull (i.e. 
minimum oriented bounding box) from the box_corners 119 | o3d_box_corners = o3d.geometry.PointCloud() 120 | o3d_box_corners.points = o3d.utility.Vector3dVector(box_corners) 121 | obb = o3d_box_corners.get_minimal_oriented_bounding_box() 122 | 123 | # Get the indicies of the nodes within the initial bounding box 124 | in_box_inds = obb.get_point_indices_within_bounding_box( 125 | o3d.utility.Vector3dVector(nodes_xyz) 126 | ) 127 | 128 | # End if the box already contains nodes 129 | if len(in_box_inds) > 0: 130 | return in_box_inds 131 | 132 | # Find additional nodes by inflating the bounding box 133 | MAGIC_EXTENT_SCALING_CONSTANT = 2 * 1.414 # NOTE: 1.414 ~ sqrt(2) 134 | 135 | inflated_obb_extent = obb.extent.copy() 136 | inflated_obb_extent = np.maximum( 137 | inflated_obb_extent, MAGIC_EXTENT_SCALING_CONSTANT * lattice_grid_res 138 | ) 139 | 140 | inflated_obb = o3d.geometry.OrientedBoundingBox() 141 | inflated_obb.center = obb.center.copy() 142 | inflated_obb.R = obb.R.copy() 143 | inflated_obb.extent = inflated_obb_extent 144 | 145 | near_box_inds = inflated_obb.get_point_indices_within_bounding_box( 146 | o3d.utility.Vector3dVector(nodes_xyz) 147 | ) 148 | 149 | # Assign no nodes if inflating does not find any nodes 150 | if len(near_box_inds) == 0: 151 | return [] 152 | 153 | near_box_nodes = nodes_xyz[near_box_inds].reshape(-1, 3) 154 | proj_box_nodes = nearest_points_in_box(box_corners, obb.center, near_box_nodes) 155 | 156 | # Check if the projected points are within the node's voxel using infinity norm 157 | near_box_dists = np.linalg.norm(near_box_nodes - proj_box_nodes, axis=1, ord=np.inf) 158 | in_node_voxel = near_box_dists <= (lattice_grid_res / 2) 159 | 160 | collision_mask = o3d_check_lines_collision(rcs, near_box_nodes, proj_box_nodes) 161 | 162 | valid_mask = np.logical_and(~collision_mask, in_node_voxel) 163 | near_box_inds = np.array(near_box_inds)[valid_mask].tolist() 164 | 165 | return near_box_inds 166 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/filtering/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference_filters import * 2 | from .image_filters import * 3 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/filtering/image_filters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from typing import Iterable, Tuple, Optional 4 | 5 | 6 | def valid_depth_frame( 7 | depth_frame: np.ndarray, 8 | mean_threshold: Optional[float] = None, 9 | quantile_thresholds: Optional[Iterable[Tuple[float, float]]] = None, 10 | ) -> bool: 11 | """ 12 | Used to filter out frames of up-close views that are unlikely to be informative. 13 | 14 | Args: 15 | depth_frame: Depth frame to check. 16 | mean_threshold: minimum threshold on mean of the depth frame. 17 | Set to None to skip mean threshold check. 18 | quantile_thresholds: list of tuples of quantiles their minimum depth thresholds. 19 | Set to None to skip quantile threshold check. 20 | 21 | Returns: 22 | True if the depth frame is valid, False otherwise. 
23 | """ 24 | 25 | valid_depths_mask = np.logical_and(~np.isnan(depth_frame), ~np.isinf(depth_frame)) 26 | 27 | if not np.any(valid_depths_mask): 28 | return False 29 | 30 | if mean_threshold != None: 31 | if np.mean(depth_frame[valid_depths_mask]) < mean_threshold: 32 | return False 33 | 34 | if quantile_thresholds != None: 35 | quantiles = np.quantile( 36 | depth_frame[valid_depths_mask], [q for q, _ in quantile_thresholds] 37 | ) 38 | if np.any(quantiles < [thresh for _, thresh in quantile_thresholds]): 39 | return False 40 | 41 | return True 42 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/filtering/inference_filters.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | 3 | from PIL import Image 4 | 5 | 6 | def compute_unlikely_tags_center_crop_ensemble( 7 | image: Image.Image, 8 | image_tags: Iterable[str], 9 | cc_proportions: Iterable[float], 10 | tagging_model, 11 | ) -> Iterable[str]: 12 | """ 13 | Finds unlikely tags in a set of tags for an image by running the 14 | model on center cropped versions of the original image 15 | 16 | Args: 17 | image: original image 18 | image_tags: tags of the original image 19 | cc_proportions: list of border crop proportions 20 | tagging_model: tagging model 21 | 22 | Returns: 23 | set of unlikely tags 24 | """ 25 | 26 | def center_crop(img, crop_border_proportion): 27 | assert crop_border_proportion < 0.5 28 | width, height = img.size 29 | return img.crop( 30 | ( 31 | crop_border_proportion * width, 32 | crop_border_proportion * height, 33 | width * (1 - crop_border_proportion), 34 | height * (1 - crop_border_proportion), 35 | ) 36 | ) 37 | 38 | cc_images = [center_crop(image, ccp) for ccp in cc_proportions] 39 | 40 | unlikely_tags_set = set() 41 | for cc_image in cc_images: 42 | cc_image_tags = tagging_model.tag_image(cc_image)["tags"] 43 | 44 | unlikely_tags = [tag for tag in image_tags if tag not in cc_image_tags] 45 | unlikely_tags_set.update(unlikely_tags) 46 | 47 | return unlikely_tags_set 48 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/localization/__init__.py: -------------------------------------------------------------------------------- 1 | from .viewpoint import * 2 | from .clustering import * 3 | from .voxel_voting import * 4 | from .pipeline import * -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/localization/clustering.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import hdbscan 4 | import numpy as np 5 | import open3d as o3d 6 | 7 | 8 | def cluster_points_dbscan( 9 | points: Union[np.ndarray, o3d.geometry.PointCloud], **dbscan_kwargs 10 | ) -> np.ndarray: 11 | """ 12 | Cluster points using DBSCAN implementation from Open3D 13 | http://www.open3d.org/docs/release/python_api/open3d.geometry.PointCloud.html#open3d.geometry.PointCloud.cluster_dbscan 14 | 15 | Args: 16 | points: (N, 3) array of points or Open3D point cloud 17 | **dbscan_kwargs: keyword arguments to pass to dbscan 18 | 19 | Returns: 20 | (N,) array of cluster labels 21 | """ 22 | if isinstance(points, np.ndarray): 23 | pcd = o3d.geometry.PointCloud() 24 | pcd.points = o3d.utility.Vector3dVector(points) 25 | elif isinstance(points, o3d.geometry.PointCloud): 26 | pcd = points 27 | else: 28 | raise ValueError("points must be either an array or an Open3D 
point cloud") 29 | 30 | labels = np.array(pcd.cluster_dbscan(**dbscan_kwargs)).astype(np.int32) 31 | return labels 32 | 33 | 34 | def cluster_points_hdbscan( 35 | points: Union[np.ndarray, o3d.geometry.PointCloud], **hdbscan_kwargs 36 | ) -> np.ndarray: 37 | """ 38 | Cluster points using HDBSCAN implementation from hdbscan package 39 | https://github.com/scikit-learn-contrib/hdbscan 40 | 41 | Args: 42 | points: (N, 3) array of points or Open3D point cloud 43 | **hdbscan_kwargs: keyword arguments to pass to hdbscan 44 | 45 | Returns: 46 | (N,) array of cluster labels 47 | """ 48 | if isinstance(points, np.ndarray): 49 | X = points 50 | elif isinstance(points, o3d.geometry.PointCloud): 51 | X = np.asarray(points.points) 52 | else: 53 | raise ValueError("points must be either an array or an Open3D point cloud") 54 | 55 | hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs) 56 | 57 | labels = hdbscan_clusterer.fit_predict(X).astype(np.int32) 58 | return labels 59 | 60 | 61 | def cluster_points( 62 | points: Union[np.ndarray, o3d.geometry.PointCloud], 63 | algorithm: str, 64 | **algorithm_kwargs, 65 | ) -> np.ndarray: 66 | """ 67 | Cluster points using the specified algorithm 68 | 69 | Args: 70 | points: (N, 3) array of points or Open3D point cloud 71 | algorithm: algorithm to use for clustering 72 | one of 'dbscan' or 'hdbscan' 73 | **algorithm_kwargs: keyword arguments to pass to the clustering algorithm 74 | 75 | Returns: 76 | (N,) array of cluster labels 77 | """ 78 | if algorithm == "dbscan": 79 | labels = cluster_points_dbscan(points, **algorithm_kwargs) 80 | elif algorithm == "hdbscan": 81 | labels = cluster_points_hdbscan(points, **algorithm_kwargs) 82 | else: 83 | raise ValueError( 84 | "Invalid algorithm: {}. Must be dbscan or hdbscan".format(algorithm) 85 | ) 86 | return labels 87 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/localization/pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import open3d as o3d 3 | 4 | from typing import Any, Callable, Dict, Iterable, Optional 5 | 6 | from tag_mapping import TagMapEntry 7 | from .clustering import cluster_points 8 | from .viewpoint import Viewpoint 9 | from .voxel_voting import grid_voxel_voting 10 | 11 | 12 | def localization_pipeline( 13 | viewpoints: Iterable[Viewpoint], 14 | params: Dict[str, Any], 15 | verbose: bool = False, 16 | ) -> Dict[str, Any]: 17 | """ 18 | Tag map localization pipeline. 19 | 20 | Args: 21 | viewpoints: Iterable of viewpoints 22 | params: Dictionary of parameters for the pipeline. 23 | 24 | Returns: 25 | Dictionary of results from the pipeline. 26 | "voxel_center_points": (N,3) array of voxel center points 27 | "voxel_scores": (N,) array of scores for each voxel 28 | "level_bbxes": Dictionary mapping clustering level to the bounding boxes 29 | at that level. 30 | """ 31 | vv_params, cl_params = (params["voxel_voting"], params["clustering"]) 32 | 33 | ### Voxel voting 34 | if vv_params["viewpoint_weight"] == None: 35 | viewpoint_weight = None 36 | elif vv_params["viewpoint_weight"] == "confidence": 37 | viewpoint_weight = np.array([vp.extras["confidence"] for vp in viewpoints]) 38 | else: 39 | raise ValueError( 40 | 'Invalid viewpoint_weight {}. 
Must be None or "confidence"'.format( 41 | vv_params["viewpoint_weight"] 42 | ) 43 | ) 44 | 45 | voxel_center_points, votes = grid_voxel_voting( 46 | viewpoints, vv_params["voxel_size"], viewpoint_weight 47 | ) 48 | 49 | # handle case where voxel voting fills no voxels 50 | # then voxel_center_points and votes will be 51 | # (0,3) and (0,) respectively 52 | if voxel_center_points.shape[0] == 0: 53 | return { 54 | "voxel_center_points": voxel_center_points, 55 | "voxel_scores": votes, 56 | "level_bbxes": [], 57 | } 58 | 59 | if vv_params["scoring_method"] == "normalized_votes": 60 | voxel_scores = votes / np.max(votes) 61 | clustering_levels = cl_params["clustering_levels"] 62 | 63 | def score_to_votes(score): 64 | score = score * np.max(votes) 65 | if score == 0.0: 66 | return 1 67 | else: 68 | if score != int(score): 69 | return int(np.ceil(score)) 70 | else: 71 | return int(score) 72 | 73 | elif vv_params["scoring_method"] == "votes": 74 | voxel_scores = votes 75 | voxel_levels = np.unique(votes) 76 | clustering_levels = range(1, voxel_levels.max() + 1) 77 | score_to_votes = lambda score: score 78 | else: 79 | raise ValueError( 80 | 'Invalid scoring_method {}. Must be "normalized_votes" or "votes"'.format( 81 | vv_params["scoring_method"] 82 | ) 83 | ) 84 | 85 | ### Clustering 86 | if cl_params["algorithm"] == "dbscan": 87 | cluster_fn = lambda pcd: cluster_points( 88 | pcd, algorithm="dbscan", **cl_params["dbscan_kwargs"] 89 | ) 90 | elif cl_params["algorithm"] == "hdbscan": 91 | cluster_fn = lambda pcd: cluster_points( 92 | pcd, algorithm="hdbscan", **cl_params["hdbscan_kwargs"] 93 | ) 94 | else: 95 | raise ValueError( 96 | 'Invalid algorithm {}. Must be "dbscan" or "hdbscan"'.format( 97 | cl_params["algorithm"] 98 | ) 99 | ) 100 | 101 | def bb_fn(pcd, bb_type): 102 | if bb_type == "axis_aligned": 103 | box = pcd.get_axis_aligned_bounding_box() 104 | 105 | # Pad zero dimensions to avoid zero-volume bounding boxes 106 | if box.volume() == 0.0: 107 | if verbose: 108 | print( 109 | "[warning]: bounding box with zero volume, padding zero length dimensions to voxel_size." 110 | ) 111 | min_bound = box.get_min_bound() 112 | max_bound = box.get_max_bound() 113 | zero_dims = np.where(min_bound == max_bound)[0] 114 | min_bound[zero_dims] -= vv_params["voxel_size"] / 2 115 | max_bound[zero_dims] += vv_params["voxel_size"] / 2 116 | box = o3d.geometry.AxisAlignedBoundingBox(min_bound, max_bound) 117 | 118 | elif bb_type == "oriented": 119 | raise NotImplementedError( 120 | "Oriented bounding boxes not implemented yet." 121 | ) # TODO: implement? 122 | 123 | else: 124 | raise ValueError( 125 | 'Invalid bounding_box_type {}. 
Must be "axis_aligned" or "oriented"'.format( 126 | bb_type 127 | ) 128 | ) 129 | assert box.volume() > 0.0 130 | return box 131 | 132 | level_bbxes = [] 133 | for level in clustering_levels: 134 | level_pcd = o3d.geometry.PointCloud() 135 | level_pcd.points = o3d.utility.Vector3dVector( 136 | voxel_center_points[voxel_scores >= level] 137 | ) 138 | 139 | if len(level_pcd.points) == 0: 140 | if verbose: 141 | print( 142 | "[warning]: no more points past level {}, stopping.".format(level) 143 | ) 144 | break 145 | 146 | cluster_labels = cluster_fn(level_pcd) 147 | 148 | for i in range(cluster_labels.max() + 1): 149 | cluster_inds = np.where(cluster_labels == i)[0] 150 | cluster_pcd = level_pcd.select_by_index(cluster_inds) 151 | 152 | cluster_box = bb_fn(cluster_pcd, bb_type=cl_params["bounding_box_type"]) 153 | 154 | level_bbxes.append((level, cluster_box)) 155 | 156 | ## Apply non-max suppression to the clustered regions 157 | level_bbxes = sorted(level_bbxes, key=lambda x: x[0], reverse=True) 158 | remove_box = len(level_bbxes) * [False] 159 | for i in range(len(level_bbxes) - 1): 160 | l1, p1 = level_bbxes[i] 161 | for j in range(i + 1, len(level_bbxes)): 162 | # skip if p2 already marked for removal 163 | if remove_box[j]: 164 | continue 165 | 166 | l2, p2 = level_bbxes[j] 167 | 168 | if l1 == l2: 169 | continue 170 | 171 | p1_in_p2 = _box_contains_box(p2, p1) 172 | if p1_in_p2: 173 | remove_box[j] = True 174 | 175 | level_bbxes = [lb for i, lb in enumerate(level_bbxes) if not remove_box[i]] 176 | 177 | # Map scores used for clustering back to the more easily interpretable votes 178 | level_bbxes = [(score_to_votes(l), b) for l, b in level_bbxes] 179 | 180 | return { 181 | "voxel_center_points": voxel_center_points, 182 | "voxel_scores": voxel_scores, 183 | "level_bbxes": level_bbxes, # TODO return instead a list of scored bounding boxes 184 | } 185 | 186 | 187 | def tagmap_entries_to_viewpoints( 188 | entries: Iterable[TagMapEntry], 189 | intrinsics: Dict[str, Any], 190 | store_extras_keys: Iterable[str] = [], 191 | far_dist_fn: Optional[Callable[[TagMapEntry], float]] = None, 192 | near_dist_fn: Optional[Callable[[TagMapEntry], float]] = None, 193 | ) -> Iterable[Viewpoint]: 194 | """ 195 | Helper function to convert an iterable of tag map entries their corresponding 196 | viewpoints. 197 | 198 | Args: 199 | entries: Iterable of entries from a tag map. 200 | intrinsics: Dictionary of camera intrinsics, must define 201 | ["width", "height", "fx", "fy"] 202 | store_extras_keys: Iterable of keys of the entry extras to store in 203 | the viewpoint extras. 204 | far_dist_fn: Function to compute the viewpoint's far distance from a query entry. 
205 | If None, the viewpoint's far distance is set to intrinsics["far_dist"] 206 | near_dist_fn: Function to compute the viewpoint's near distance from a query entry 207 | If None, the viewpoint's near distance is set to intrinsics["near_dist"] 208 | """ 209 | 210 | if far_dist_fn == None: 211 | far_dist_fn = lambda entry: intrinsics["far_dist"] 212 | 213 | if near_dist_fn == None: 214 | near_dist_fn = lambda entry: intrinsics["near_dist"] 215 | 216 | viewpoints = [] 217 | for entry in entries: 218 | try: 219 | far_dist = far_dist_fn(entry) 220 | except Exception as e: 221 | print("Error in far_dist_fn, using value in intrinsics: {}.".format(e)) 222 | 223 | try: 224 | near_dist = near_dist_fn(entry) 225 | except Exception as e: 226 | print("Error in near_dist_fn, using value in intrinsics: {}.".format(e)) 227 | 228 | extras = {k: v for k, v in entry.extras.items() if k in store_extras_keys} 229 | 230 | if far_dist <= near_dist: 231 | # skip creating viewpoint if far_dist <= near_dist 232 | continue 233 | 234 | vp = Viewpoint.from_intrinsics( 235 | extrinsic_matrix=entry.pose, 236 | width=intrinsics["width"], 237 | height=intrinsics["height"], 238 | fx=intrinsics["fx"], 239 | fy=intrinsics["fy"], 240 | near_dist=near_dist, 241 | far_dist=far_dist, 242 | extras=extras, 243 | ) 244 | viewpoints.append(vp) 245 | 246 | return viewpoints 247 | 248 | 249 | def _box_contains_box(box1, box2): 250 | """ 251 | Helper function that returns True if box1 wholly contains box2. 252 | """ 253 | # TODO implement this for other box types 254 | if ( 255 | type(box1) != o3d.geometry.AxisAlignedBoundingBox 256 | or type(box2) != o3d.geometry.AxisAlignedBoundingBox 257 | ): 258 | raise NotImplementedError( 259 | "Unsupported box type, must be AxisAlignedBoundingBox" 260 | ) 261 | 262 | box1_min_bound, box1_max_bound = (box1.get_min_bound(), box1.get_max_bound()) 263 | box2_min_bound, box2_max_bound = (box2.get_min_bound(), box2.get_max_bound()) 264 | return np.all(box1_min_bound <= box2_min_bound) and np.all( 265 | box1_max_bound >= box2_max_bound 266 | ) 267 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/localization/viewpoint.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, Union 2 | 3 | import numpy as np 4 | import open3d as o3d 5 | 6 | 7 | class Viewpoint: 8 | """ 9 | A viewpoint where the local frame has Z forward 10 | following traditional camera frame conventions 11 | """ 12 | 13 | def __init__( 14 | self, 15 | extrinsic_matrix: np.ndarray, 16 | w_fov: float, 17 | h_fov: float, 18 | near_dist: Optional[float] = None, 19 | far_dist: Optional[float] = None, 20 | extras: Optional[dict] = None, 21 | ) -> None: 22 | """ 23 | Args: 24 | extrinsic_matrix: 4x4 transformation matrix from local to world frame 25 | w_fov: horizontal field of view in radians 26 | h_fov: vertical field of view in radians 27 | near_dist: distance to near plane 28 | far_dist: distance to far plane 29 | extras: optional dictionary of extra data 30 | """ 31 | self._extrinsic_matrix = extrinsic_matrix 32 | self._w_fov, self._h_fov = (w_fov, h_fov) 33 | 34 | self._near_dist, self._far_dist = (near_dist, far_dist) 35 | if near_dist != None and far_dist != None: 36 | assert near_dist < far_dist, "near_dist must be less than far_dist" 37 | 38 | self._extras = extras 39 | 40 | @classmethod 41 | def from_intrinsics( 42 | cls, 43 | extrinsic_matrix: np.ndarray, 44 | width: int, 45 | height: int, 46 | fx: float, 
47 | fy: float, 48 | near_dist: Optional[float] = None, 49 | far_dist: Optional[float] = None, 50 | extras: Optional[dict] = None, 51 | ) -> "Viewpoint": 52 | """ 53 | Construct a viewpoint for camera intrinsics parameters 54 | """ 55 | return cls( 56 | extrinsic_matrix, 57 | 2 * np.arctan(width / (2 * fx)), 58 | 2 * np.arctan(height / (2 * fy)), 59 | near_dist, 60 | far_dist, 61 | extras, 62 | ) 63 | 64 | def within_viewpoint(self, points: np.ndarray) -> np.ndarray: 65 | """ 66 | Returns a boolean array indicating whether each point is within the 67 | viewpoint's frustum 68 | 69 | Args: 70 | points: Nx3 array of points in the world frame 71 | 72 | Returns: 73 | (N,) boolean array 74 | """ 75 | # transform points to the viewpoint's local frame 76 | points_local = (points - self.origin) @ self.R 77 | 78 | # compute w and h ray angles for the grid points and filter out 79 | # points which are outside the field of view 80 | w = np.arctan(points_local[:, 0] / (points_local[:, 2] + 1e-6)) 81 | h = np.arctan(points_local[:, 1] / (points_local[:, 2] + 1e-6)) 82 | 83 | inside = np.logical_and( 84 | np.logical_and(w < (self._w_fov / 2), w > -(self._w_fov / 2)), 85 | np.logical_and(h < (self._h_fov / 2), h > -(self._h_fov / 2)), 86 | ) 87 | 88 | # check depth bounds 89 | d = points_local[:, 2] 90 | 91 | if self._near_dist != None: 92 | inside = np.logical_and(inside, d > self._near_dist) 93 | else: 94 | inside = np.logical_and(inside, d > 0) 95 | 96 | if self._far_dist != None: 97 | inside = np.logical_and(inside, d < self._far_dist) 98 | 99 | return inside 100 | 101 | @property 102 | def extras(self) -> Union[dict, None]: 103 | return self._extras 104 | 105 | @property 106 | def R(self) -> np.ndarray: 107 | """ 108 | Rotation matrix from the viewpoint's local frame to the world frame 109 | """ 110 | return self._extrinsic_matrix[:3, :3] 111 | 112 | @property 113 | def origin(self) -> np.ndarray: 114 | """ 115 | The origin of the viewpoint's local frame in the world frame 116 | """ 117 | return self._extrinsic_matrix[:3, 3] 118 | 119 | @property 120 | def fov(self) -> Tuple[float, float]: 121 | """ 122 | Returns the horizontal and vertical field of view in radians 123 | """ 124 | return self._w_fov, self._h_fov 125 | 126 | @property 127 | def bounding_rays(self) -> np.ndarray: 128 | """ 129 | Returns the four bounding rays expressed 130 | in the world frame 131 | 132 | Returns a 4x3 array of unit vectors with each row 133 | representing a ray in the order of: 134 | top-left corner 135 | bottom-left corner 136 | top-right corner 137 | bottom-right corner 138 | """ 139 | if not hasattr(self, "_bounding_rays"): 140 | dx, dy = (np.tan(self._w_fov / 2), np.tan(self._h_fov / 2)) 141 | 142 | rays = np.array( 143 | [ 144 | [-dx, -dy, 1.0], # top-left corner 145 | [-dx, dy, 1.0], # bottom-left corner 146 | [dx, -dy, 1.0], # top-right corner 147 | [dx, dy, 1.0], # bottom-right corner 148 | ] 149 | ) 150 | rays = rays / np.linalg.norm(rays, axis=1, keepdims=True) 151 | 152 | # transform rays to the world frame 153 | self._bounding_rays = rays @ self._extrinsic_matrix[:3, :3].T 154 | 155 | return self._bounding_rays 156 | 157 | @property 158 | def frustum_points(self) -> np.ndarray: 159 | """ 160 | Returns the eight points of the viewpoint's frustum 161 | 162 | Returns a 8x3 array of points in the order of: 163 | near top-left corner 164 | near bottom-left corner 165 | near top-right corner 166 | near bottom-right corner 167 | 168 | far top-left corner 169 | far bottom-left corner 170 | far top-right corner 171 
| far bottom-right corner 172 | 173 | NOTE: if near_dist or far_dist are None, the corresponding far and near 174 | dists are set to 0 and 1 respectively 175 | """ 176 | if not hasattr(self, "_frustum_points"): 177 | ez_world = self.R[:, -1] 178 | d = ez_world.T @ self.bounding_rays[0] 179 | near_factor = 0 if self._near_dist == None else self._near_dist / d 180 | far_factor = 1 if self._far_dist == None else self._far_dist / d 181 | 182 | self._frustum_points = np.array( 183 | [ 184 | self.origin + self.bounding_rays[0] * near_factor, 185 | self.origin + self.bounding_rays[1] * near_factor, 186 | self.origin + self.bounding_rays[2] * near_factor, 187 | self.origin + self.bounding_rays[3] * near_factor, 188 | self.origin + self.bounding_rays[0] * far_factor, 189 | self.origin + self.bounding_rays[1] * far_factor, 190 | self.origin + self.bounding_rays[2] * far_factor, 191 | self.origin + self.bounding_rays[3] * far_factor, 192 | ] 193 | ).astype(np.float64) 194 | 195 | return self._frustum_points 196 | 197 | @property 198 | def aabb(self) -> o3d.geometry.AxisAlignedBoundingBox: 199 | """ 200 | Returns the axis-aligned bounding box of the viewpoint's frustum 201 | """ 202 | if not hasattr(self, "_aabb"): 203 | self._aabb = o3d.geometry.AxisAlignedBoundingBox.create_from_points( 204 | o3d.utility.Vector3dVector(self.frustum_points) 205 | ) 206 | return self._aabb 207 | 208 | def o3d_lineset(self, color=(0, 0, 1)) -> o3d.geometry.LineSet: 209 | """ 210 | Return the an Open3D lineset of the viewpoint for visualization 211 | """ 212 | points = self.frustum_points 213 | 214 | lines = np.array( 215 | [ 216 | [0, 4], 217 | [1, 5], 218 | [2, 6], 219 | [3, 7], 220 | ] 221 | ).astype(np.int32) 222 | 223 | if self._near_dist != None: 224 | near_plane_lines = np.array( 225 | [ 226 | [0, 1], 227 | [0, 2], 228 | [1, 3], 229 | [2, 3], 230 | ] 231 | ).astype(np.int32) 232 | lines = np.concatenate([lines, near_plane_lines], axis=0) 233 | 234 | if self._far_dist != None: 235 | far_plane_lines = np.array( 236 | [ 237 | [4, 5], 238 | [4, 6], 239 | [5, 7], 240 | [6, 7], 241 | ] 242 | ).astype(np.int32) 243 | lines = np.concatenate([lines, far_plane_lines], axis=0) 244 | 245 | lineset = o3d.geometry.LineSet() 246 | lineset.points = o3d.utility.Vector3dVector(points) 247 | lineset.lines = o3d.utility.Vector2iVector(lines) 248 | lineset.colors = o3d.utility.Vector3dVector(np.tile(color, (lines.shape[0], 1))) 249 | 250 | return lineset 251 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/localization/voxel_voting.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Optional, Tuple 2 | from .viewpoint import Viewpoint 3 | 4 | import numpy as np 5 | 6 | 7 | def voxel_voting( 8 | points: np.ndarray, 9 | voxel_size: float, 10 | point_weights: Optional[np.ndarray] = None, 11 | ) -> Tuple[np.ndarray, np.ndarray]: 12 | """ 13 | Voxel voting based localization given a set of points. 14 | 15 | Args: 16 | points: (N,3) array of points 17 | voxel_size: Size of the voxels 18 | point_weights: (N,) array of weights for each point. If None, all points 19 | are weighted equally. 
20 | 21 | Returns: 22 | A point cloud of the voxel centers (N,3) and the votes for each voxel (N,) 23 | """ 24 | voxel_coords = np.floor(points / voxel_size) 25 | keys = voxel_coords.astype(np.int32) 26 | _, inds, inverse_inds, counts = np.unique( 27 | keys, axis=0, return_index=True, return_inverse=True, return_counts=True 28 | ) 29 | 30 | if point_weights is None: 31 | votes = counts 32 | else: 33 | votes = np.zeros(len(inds)) 34 | for i in range(len(inds)): 35 | votes[i] = np.sum(point_weights[inverse_inds == i]) 36 | 37 | voxel_centers = (voxel_coords[inds] + 0.5) * voxel_size 38 | 39 | return voxel_centers, votes 40 | 41 | 42 | def grid_voxel_voting( 43 | viewpoints: Iterable[Viewpoint], 44 | voxel_size: float, 45 | viewpoint_weight: Optional[Iterable[float]] = None, 46 | ) -> Tuple[np.ndarray, np.ndarray]: 47 | """ 48 | Voxel voting based localization given a set of viewpoints. 49 | 1. For each viewpoint, generate a set of grid points inside the viewpoint 50 | where the grid is aligned with the world frame 51 | 2. Merge grid points into a single point cloud 52 | 3. Voxelize the point cloud and count the number of points in each voxel 53 | where the count can be weighted by viewpoint_weight 54 | 55 | Args: 56 | viewpoints: Iterable of viewpoints 57 | voxel_size: Size of the voxels 58 | viewpoint_weight: Iterable of weights for each viewpoint. If None, all 59 | viewpoints are weighted equally. 60 | 61 | Returns: 62 | A point cloud of the voxel centers (N,3) and the votes for each voxel (N,) 63 | """ 64 | if viewpoint_weight is not None: 65 | assert len(viewpoint_weight) == len(viewpoints) 66 | 67 | grid_points = [] 68 | grid_point_weights = [] 69 | for i, vp in enumerate(viewpoints): 70 | aabb_max_bound = vp.aabb.get_max_bound() 71 | aabb_min_bound = vp.aabb.get_min_bound() 72 | 73 | range_N = np.ceil((aabb_max_bound - aabb_min_bound) / voxel_size) 74 | 75 | xx, yy, zz = np.meshgrid( 76 | voxel_size * np.arange(range_N[0]) + aabb_min_bound[0], 77 | voxel_size * np.arange(range_N[1]) + aabb_min_bound[1], 78 | voxel_size * np.arange(range_N[2]) + aabb_min_bound[2], 79 | ) 80 | 81 | vp_grid_points = np.concatenate( 82 | [c.reshape(-1, 1) for c in [xx, yy, zz]], axis=1 83 | ) 84 | 85 | # get only the points within the viewpoint 86 | inside = vp.within_viewpoint(vp_grid_points) 87 | vp_grid_points = vp_grid_points[inside] 88 | 89 | grid_points.append(vp_grid_points) 90 | 91 | if viewpoint_weight is not None: 92 | grid_point_weights.append( 93 | viewpoint_weight[i] * np.ones(vp_grid_points.shape[0]) 94 | ) 95 | 96 | if len(grid_points) == 0: 97 | return np.zeros((0, 3)), np.zeros((0,)) 98 | 99 | grid_points = np.concatenate(grid_points, axis=0) 100 | grid_point_weights = ( 101 | None if viewpoint_weight is None else np.concatenate(grid_point_weights, axis=0) 102 | ) 103 | 104 | return voxel_voting(grid_points, voxel_size, grid_point_weights) 105 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .image_tagger import ImageTagger 2 | from .ram_tagger import RAMTagger 3 | from .ram_plus_tagger import RAMPlusTagger 4 | 5 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/models/image_tagger.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, List, Tuple 3 | from PIL 
import Image 4 | 5 | 6 | class ImageTagger(ABC): 7 | """ 8 | Abstract base class for all image taggers 9 | """ 10 | 11 | @abstractmethod 12 | def tag_image(self, image: Image.Image) -> Tuple[List, List]: 13 | """ 14 | Forwards the tagging model and returns the tags and confidences 15 | """ 16 | raise NotImplementedError 17 | 18 | @abstractmethod 19 | def filtered_tag_image( 20 | self, image: Image.Image, params: Dict[str, Any] 21 | ) -> Tuple[List, List]: 22 | """ 23 | Forwards the model and applies additional inference filtering to remove unlikely tags 24 | """ 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/models/ram_plus_tagger.py: -------------------------------------------------------------------------------- 1 | from tag_mapping.models.ram_tagger import RAMTagger 2 | 3 | from ram import get_transform 4 | from ram.models import ram_plus 5 | 6 | 7 | class RAMPlusTagger(RAMTagger): 8 | def _init_model(self, config) -> None: 9 | # override RAM model to load RAM++ model 10 | self._device = config["device"] 11 | 12 | self._model = ram_plus( 13 | pretrained=config["ram_pretrained_path"], 14 | image_size=config["ram_image_size"], 15 | vit=config["vit"], 16 | ) 17 | self._model.to(self._device) 18 | self._model.eval() 19 | 20 | self._transform = get_transform(config["ram_image_size"]) 21 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/models/ram_tagger.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from typing import Dict, List, Tuple, Any 4 | from PIL import Image 5 | 6 | from ram import get_transform, inference_ram 7 | from ram.models import ram 8 | 9 | from tag_mapping.models.image_tagger import ImageTagger 10 | from tag_mapping.filtering import compute_unlikely_tags_center_crop_ensemble 11 | 12 | 13 | class RAMTagger(ImageTagger): 14 | """ 15 | Wrapper for the Recognize-Anything tagging model 16 | """ 17 | 18 | def __init__(self, config) -> None: 19 | self._init_model(config) 20 | 21 | def _init_model(self, config) -> None: 22 | self._device = config["device"] 23 | 24 | self._model = ram( 25 | pretrained=config["ram_pretrained_path"], 26 | image_size=config["ram_image_size"], 27 | vit=config["vit"], 28 | ) 29 | self._model.to(self._device) 30 | self._model.eval() 31 | 32 | self._transform = get_transform(config["ram_image_size"]) 33 | 34 | def tag_image(self, image: Image.Image) -> Tuple[List, List]: 35 | """ 36 | Forwards the tagging model and returns the tags and confidences 37 | """ 38 | tags, confidences = inference_ram( 39 | self._transform(image).unsqueeze(0).to(self._device), self._model 40 | ) 41 | tags = tags.split(" | ") 42 | return {"tags": tags, "confidences": confidences} 43 | 44 | def override_class_thresholds(self, thresholds: Dict[str, float]) -> None: 45 | for cls, t in thresholds.items(): 46 | try: 47 | self._model.override_class_threshold(cls, t) 48 | except Exception as e: 49 | print("Couldn't override threshold for {} because: {}".format(cls, e)) 50 | 51 | def filtered_tag_image( 52 | self, image: Image.Image, params: Dict[str, Any] 53 | ) -> Tuple[List, List]: 54 | """ 55 | Forwards the model and applies additional inference filtering to remove unlikely tags 56 | """ 57 | out = self.tag_image(image) 58 | tags, confidences = (out["tags"], out["confidences"]) 59 | 60 | # filter tags 61 | unlikely_tags = compute_unlikely_tags_center_crop_ensemble( 62 
| image, 63 | tags, 64 | params["crop_border_proportions"], 65 | self, 66 | ) 67 | 68 | filtered_tags = [tag for tag in tags if tag not in unlikely_tags] 69 | filtered_tag_confidences = [ 70 | conf for tag, conf in zip(tags, confidences) if tag not in unlikely_tags 71 | ] 72 | return filtered_tags, filtered_tag_confidences 73 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/pose_graph.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import networkx as nx 4 | 5 | 6 | class PoseGraph: 7 | """ 8 | Class implementing methods for working with a pose graph. 9 | 10 | Currently, this class is a wrapper providing additional functionality on top 11 | of a pose graph generated elsewhere. 12 | It does not support generating pose graphs or modification of the stored pose graph! 13 | """ 14 | 15 | def __init__(self, points, edges): 16 | """ 17 | Args: 18 | points: np.ndarray (N, m) of m dimensional points 19 | edges: np.ndarray (E, 2) of indices into points 20 | """ 21 | self._nodes = points 22 | 23 | self._graph = nx.Graph() 24 | self._graph.add_nodes_from(np.arange(points.shape[0])) 25 | 26 | edge_lengths = np.linalg.norm(points[edges[:, 0]] - points[edges[:, 1]], axis=1) 27 | for (i, j), length in zip(edges, edge_lengths): 28 | self._graph.add_edge(i, j, length=length) 29 | 30 | # make sure that the graph is connected 31 | if not nx.is_connected(self._graph): 32 | raise ValueError( 33 | "Cannot create pose graph with arguments representing a disconnected graph" 34 | ) 35 | 36 | def closest_node_idx(self, point): 37 | """ 38 | Args: 39 | point: np.ndarray (m,) of a m dimensional point 40 | 41 | Returns: 42 | index of the closest node 43 | """ 44 | dists = np.linalg.norm(self._nodes - point, axis=1) 45 | return np.argmin(dists) 46 | 47 | def closest_node(self, point): 48 | """ 49 | Args: 50 | point: np.ndarray (m,) of a m dimensional point 51 | 52 | Returns: 53 | coordinates of the closest node 54 | """ 55 | return self._nodes[self.closest_node_idx(point)] 56 | 57 | def shortest_path(self, start_point, end_point): 58 | """ 59 | Args: 60 | start_point: np.ndarray (m,) of a m dimensional point 61 | end_point: np.ndarray (m,) of a m dimensional point 62 | 63 | Returns: 64 | list of indices of nodes on the shortest path 65 | """ 66 | start_idx = self.closest_node_idx(start_point) 67 | end_idx = self.closest_node_idx(end_point) 68 | 69 | return nx.shortest_path(self._graph, start_idx, end_idx, weight="length") 70 | 71 | def shortest_path_length(self, start_point, end_point): 72 | """ 73 | Args: 74 | start_point: np.ndarray (m,) of a m dimensional point 75 | end_point: np.ndarray (m,) of a m dimensional point 76 | 77 | Returns: 78 | length of the shortest path 79 | """ 80 | start_idx = self.closest_node_idx(start_point) 81 | end_idx = self.closest_node_idx(end_point) 82 | 83 | return nx.shortest_path_length(self._graph, start_idx, end_idx, weight="length") 84 | 85 | def save(self, save_dir): 86 | """ 87 | Args: 88 | save_dir: path to save the graph to 89 | """ 90 | if not os.path.exists(save_dir): 91 | os.makedirs(save_dir) 92 | np.save(os.path.join(save_dir, "edges.npy"), np.array(self._graph.edges)) 93 | np.save(os.path.join(save_dir, "node_coords.npy"), self._nodes) 94 | 95 | @classmethod 96 | def load(cls, load_dir): 97 | """ 98 | Args: 99 | load_dir: path to load the graph from 100 | 101 | Returns: 102 | PoseGraph object 103 | """ 104 | edges = 
np.load(os.path.join(load_dir, "edges.npy")) 105 | nodes = np.load(os.path.join(load_dir, "node_coords.npy")) 106 | return cls(nodes, edges) 107 | 108 | @property 109 | def nodes(self): 110 | return self._nodes.copy() 111 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/tag_map.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import uuid 5 | 6 | from dataclasses import dataclass 7 | from typing import Dict, Any 8 | 9 | 10 | @dataclass(frozen=True) 11 | class TagMapEntry: 12 | pose: np.ndarray 13 | uuid: uuid.UUID 14 | extras: Dict[str, Any] = None 15 | 16 | 17 | @dataclass(frozen=True) 18 | class TagDBEntry: 19 | entry_uuid: uuid.UUID 20 | extras: Dict[str, Any] = None 21 | 22 | 23 | class TagMap: 24 | def __init__(self, metadata: Dict[str, Any]): 25 | self._metadata = metadata 26 | self._tags_db = {} 27 | self._entry_db = {} 28 | 29 | def add_entry(self, entry: TagMapEntry): 30 | """ 31 | Add entry (i.e. observed frame) to the tag map. 32 | 33 | NOTE: This method raises a ValueError if the entry's uuid is 34 | already in the entry database. 35 | """ 36 | if entry.uuid in self._entry_db: 37 | raise ValueError("uuid already in the entry database") 38 | self._entry_db[entry.uuid] = entry 39 | 40 | def add_tag(self, tag: str, entry_uuid: uuid.UUID, extras: Dict[str, Any] = None): 41 | """ 42 | Associates a tag with an entry in the tag map. 43 | 44 | Args: 45 | tag: The tag to associate with the entry. 46 | entry_uuid: The uuid of the entry to associate with the tag. 47 | extras: Any extra data to associate with the tag. 48 | """ 49 | if entry_uuid not in self._entry_db: 50 | raise ValueError("uuid not in the entry database") 51 | 52 | tag_db_entry = TagDBEntry(entry_uuid, extras) 53 | if tag not in self._tags_db: 54 | self._tags_db[tag] = [tag_db_entry] 55 | else: 56 | self._tags_db[tag].append(tag_db_entry) 57 | 58 | def add_extra(self, extra_name: str, extra_data: Any, overwrite: bool = False): 59 | """ 60 | Add extra data to the tag map (e.g. pose graph). 61 | 62 | Args: 63 | extra_name: The name of the extra data. 64 | extra_data: The extra data. 65 | overwrite: Whether to overwrite the extra data if it already exists. 66 | """ 67 | if not hasattr(self, "_extras"): 68 | self._extras = {extra_name: extra_data} 69 | else: 70 | if extra_name in self._extras and not overwrite: 71 | raise ValueError( 72 | "Extra {} already stored in tag map, set overwrite=True to overwrite".format( 73 | extra_name 74 | ) 75 | ) 76 | else: 77 | self._extras[extra_name] = extra_data 78 | 79 | def query(self, tag: str, return_uuids: bool = False): 80 | """ 81 | Query the tag map for all entries associated with a tag. 82 | 83 | Returns: 84 | A list of TagMapEntry objects associated with the tag 85 | or None if the tag is not in the tag map. 
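
        Example (a hypothetical sketch; assumes the map was built elsewhere
        and that the tagger emitted the tag "chair"):

            entries = tag_map.query("chair")
            if entries is not None:
                entry_poses = [entry.pose for entry in entries]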
86 | """ 87 | if tag not in self._tags_db: 88 | print("{} not in the tag map".format(tag)) 89 | return None 90 | 91 | entry_uuids = [e.entry_uuid for e in self._tags_db[tag]] 92 | entries = [self._entry_db[id] for id in entry_uuids] 93 | tag_extras = [e.extras for e in self._tags_db[tag]] 94 | 95 | # pack tag extras into entry.extras 96 | for entry, tag_extra in zip(entries, tag_extras): 97 | if entry.extras is None: 98 | entry.extras = {} 99 | 100 | if tag_extra is not None: 101 | for key, value in tag_extra.items(): 102 | entry.extras[key] = value 103 | 104 | if return_uuids: 105 | return entries, entry_uuids 106 | else: 107 | return entries 108 | 109 | def save(self, save_path): 110 | save_dir = os.path.dirname(save_path) 111 | if not os.path.exists(save_dir): 112 | os.makedirs(save_dir) 113 | with open(save_path, "wb") as file: 114 | pickle.dump(self, file) 115 | 116 | @classmethod 117 | def load(cls, save_path): 118 | with open(save_path, "rb") as file: 119 | obj = pickle.load(file) 120 | if isinstance(obj, cls): 121 | return obj 122 | else: 123 | raise ValueError("Loaded object is not an instance of TagMap") 124 | 125 | @property 126 | def metadata(self): 127 | return self._metadata 128 | 129 | @property 130 | def unique_objects(self): 131 | return self._tags_db.keys() 132 | 133 | @property 134 | def num_entries(self): 135 | return len(self._entry_db) 136 | 137 | @property 138 | def num_tags(self): 139 | return len(self._tags_db) 140 | 141 | @property 142 | def extras(self): 143 | return self._extras 144 | 145 | def __contains__(self, tag: str): 146 | return tag in self.unique_objects 147 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .nearest_points_in_box import nearest_points_in_box 2 | from .collision_check import o3d_check_lines_collision 3 | from .get_box_corners import get_box_corners 4 | 5 | from .load_yaml_params import load_yaml_params 6 | 7 | from .line_mesh import LineMesh, box_to_linemesh -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/collision_check.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import open3d as o3d 3 | 4 | 5 | def o3d_check_lines_collision( 6 | rcs: o3d.t.geometry.RaycastingScene, 7 | lines_start: np.ndarray, 8 | lines_end: np.ndarray, 9 | ) -> np.ndarray: 10 | """ 11 | Uses open3d.t.geometry.RaycastingScene to check for collisions between lines and a mesh. 
12 | 13 | Args: 14 | rcs: open3d.t.geometry.RaycastingScene of the mesh 15 | lines_start: (N,3) array of line start points 16 | lines_end: (N,3) array of line end points 17 | 18 | Returns: 19 | (N,) boolean array of whether each line collides with the mesh 20 | """ 21 | edge_vectors = lines_end - lines_start 22 | edge_lengths = np.linalg.norm(edge_vectors, axis=1) 23 | 24 | # IMPORTANT: normalize ray direction vector 25 | ray_directions = edge_vectors / edge_lengths[:, np.newaxis] 26 | 27 | rays = np.concatenate([lines_start, ray_directions], axis=1) 28 | rays = o3d.core.Tensor(rays, dtype=o3d.core.Dtype.Float32) 29 | res = rcs.cast_rays(rays) 30 | 31 | collision_mask = np.asarray(res["t_hit"]) < edge_lengths 32 | 33 | return collision_mask 34 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/get_box_corners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from open3d.geometry import AxisAlignedBoundingBox, OrientedBoundingBox 4 | from tag_mapping.datasets.matterport import ( 5 | MatterportObjectBoundingBox, 6 | MatterportRegionBoundingBox, 7 | ) 8 | 9 | 10 | def get_box_corners(box) -> np.ndarray: 11 | """ 12 | Helper function that takes in a box of a supported type and 13 | outputs its corners as an array of shape (8, 3) in the following order: 14 | 15 | (4) +---------+. (5) 16 | | ` . | ` . 17 | | (0) +---+-----+ (1) 18 | | | | | 19 | (7) +-----+---+. (6)| 20 | ` . | ` . | 21 | (3) ` +---------+ (2) 22 | 23 | Args: 24 | box: a box of a supported type 25 | 26 | Returns: 27 | corners: (8, 3) array of corners 28 | """ 29 | if type(box) == AxisAlignedBoundingBox: 30 | min_bound = box.get_min_bound() 31 | max_bound = box.get_max_bound() 32 | corners = np.array( 33 | [ 34 | min_bound, 35 | [max_bound[0], min_bound[1], min_bound[2]], 36 | [max_bound[0], max_bound[1], min_bound[2]], 37 | [min_bound[0], max_bound[1], min_bound[2]], 38 | [min_bound[0], min_bound[1], max_bound[2]], 39 | [max_bound[0], min_bound[1], max_bound[2]], 40 | max_bound, 41 | [min_bound[0], max_bound[1], max_bound[2]], 42 | ] 43 | ) 44 | 45 | elif type(box) == OrientedBoundingBox: 46 | raise NotImplementedError 47 | 48 | elif type(box) == MatterportObjectBoundingBox: 49 | corners = box.corners() 50 | 51 | elif type(box) == MatterportRegionBoundingBox: 52 | corners = box.corners() 53 | 54 | else: 55 | raise ValueError(f"Unsupported box type {type(box)}") 56 | 57 | return corners 58 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/line_mesh.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import open3d as o3d 3 | 4 | """ 5 | This file contains a workaround LineMesh class for Open3D which is a lineset with cylinders instead of lines. 6 | This is useful for visualizing lines of different thicknesses in Open3D. 
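
A minimal usage sketch (hypothetical; `points` is an Nx3 array and `lines` an
Ex2 array of endpoint indices into `points`):

    line_mesh = LineMesh(points, lines, colors=[1, 0, 0], radius=0.02)
    o3d.visualization.draw_geometries(line_mesh.cylinder_segments)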
7 | 8 | From: 9 | https://github.com/isl-org/Open3D/pull/738#issuecomment-564785941 10 | https://github.com/isl-org/Open3D/pull/738#issuecomment-697027818 11 | """ 12 | 13 | 14 | def align_vector_to_another(a=np.array([0, 0, 1]), b=np.array([1, 0, 0])): 15 | """ 16 | Aligns vector a to vector b with axis angle rotation 17 | """ 18 | if np.array_equal(a, b): 19 | return None, None 20 | axis_ = np.cross(a, b) 21 | axis_ = axis_ / np.linalg.norm(axis_) 22 | angle = np.arccos(np.dot(a, b)) 23 | 24 | return axis_, angle 25 | 26 | 27 | def normalized(a, axis=-1, order=2): 28 | """Normalizes a numpy array of points""" 29 | l2 = np.atleast_1d(np.linalg.norm(a, order, axis)) 30 | l2[l2 == 0] = 1 31 | return a / np.expand_dims(l2, axis), l2 32 | 33 | 34 | class LineMesh(object): 35 | def __init__(self, points, lines, colors=[0, 1, 0], radius=0.05): 36 | """Creates a line represented as sequence of cylinder triangular meshes 37 | 38 | Arguments: 39 | points {ndarray} -- Numpy array of ponts Nx3. 40 | 41 | Keyword Arguments: 42 | colors {list} -- list of colors, or single color of the line (default: {[0, 1, 0]}) 43 | radius {float} -- radius of cylinder (default: {0.15}) 44 | """ 45 | self.points = np.array(points) 46 | self.lines = np.array(lines) 47 | self.colors = np.array(colors) 48 | self.radius = radius 49 | self.cylinder_segments = [] 50 | 51 | self.create_line_mesh() 52 | 53 | def create_line_mesh(self): 54 | first_points = self.points[self.lines[:, 0], :] 55 | second_points = self.points[self.lines[:, 1], :] 56 | line_segments = second_points - first_points 57 | line_segments_unit, line_lengths = normalized(line_segments) 58 | 59 | z_axis = np.array([0, 0, 1]) 60 | # Create triangular mesh cylinder segments of line 61 | for i in range(line_segments_unit.shape[0]): 62 | line_segment = line_segments_unit[i, :] 63 | line_length = line_lengths[i] 64 | # get axis angle rotation to allign cylinder with line segment 65 | axis, angle = align_vector_to_another(z_axis, line_segment) 66 | # Get translation vector 67 | translation = first_points[i, :] + line_segment * line_length * 0.5 68 | # create cylinder and apply transformations 69 | cylinder_segment = o3d.geometry.TriangleMesh.create_cylinder( 70 | self.radius, line_length 71 | ) 72 | cylinder_segment = cylinder_segment.translate(translation, relative=False) 73 | if axis is not None: 74 | axis_a = axis * angle 75 | cylinder_segment = cylinder_segment.rotate( 76 | R=o3d.geometry.get_rotation_matrix_from_axis_angle(axis_a), 77 | center=cylinder_segment.get_center(), 78 | ) 79 | # color cylinder 80 | color = self.colors if self.colors.ndim == 1 else self.colors[i, :] 81 | cylinder_segment.paint_uniform_color(color) 82 | 83 | self.cylinder_segments.append(cylinder_segment) 84 | 85 | def add_line(self, vis): 86 | """Adds this line to the visualizer""" 87 | for cylinder in self.cylinder_segments: 88 | vis.add_geometry(cylinder) 89 | 90 | def remove_line(self, vis): 91 | """Removes this line from the visualizer""" 92 | for cylinder in self.cylinder_segments: 93 | vis.remove_geometry(cylinder) 94 | 95 | 96 | ########################################################################################## 97 | 98 | from tag_mapping.utils import get_box_corners 99 | 100 | 101 | def box_to_linemesh(box, color=(0, 1, 0), radius=0.02): 102 | """ 103 | Get a LineMesh from a box type. 
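
    Example (hypothetical): for an open3d.geometry.AxisAlignedBoundingBox `aabb`,
    `box_to_linemesh(aabb, color=(1, 0, 0), radius=0.02)` returns a LineMesh whose
    cylinder segments can be added to a visualizer with `add_line(vis)`.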
104 | 105 | The box type must be supported by get_box_corners() as we assume that 106 | get_box_corners() will return the corners in the expected order 107 | """ 108 | box_points = get_box_corners(box) 109 | 110 | box_lines = np.array( 111 | [ 112 | [0, 1], 113 | [0, 3], 114 | [1, 2], 115 | [2, 3], 116 | [0, 4], 117 | [1, 5], 118 | [2, 6], 119 | [3, 7], 120 | [4, 5], 121 | [4, 7], 122 | [5, 6], 123 | [6, 7], 124 | ] 125 | ) 126 | 127 | return LineMesh( 128 | points=box_points, 129 | lines=box_lines, 130 | colors=color, 131 | radius=radius, 132 | ) 133 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/load_yaml_params.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | 4 | def load_yaml_params(params_path): 5 | """ 6 | Modified yaml safe loading to allow for yaml to load python lambdas given by !python/lambda 7 | """ 8 | 9 | def yaml_lambda_constructor(loader, node): 10 | value = loader.construct_scalar(node) 11 | return eval(value) 12 | 13 | yaml.SafeLoader.add_constructor("!python/lambda", yaml_lambda_constructor) 14 | 15 | with open(params_path, "r") as f: 16 | params = yaml.safe_load(f) 17 | return params 18 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/nearest_points_in_box.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cvxpy as cp 3 | 4 | from typing import Tuple 5 | 6 | 7 | def nearest_points_in_box( 8 | box_corners: np.ndarray, 9 | box_center: np.ndarray, 10 | points: np.ndarray, 11 | solve_kwargs=None, 12 | ) -> np.ndarray: 13 | """ 14 | Computes the points that are closests to the given points, bounded 15 | within the box by solving a QP. 16 | 17 | Args: 18 | box_corners: (8,3) array of box corners in order defined in _box_hrep() 19 | box_center: (3,) array of center coordinate of the box 20 | points: (N,3) array of the given points 21 | solve_kwargs: Keyword arguments to pass to cp.Problem.solve(), 22 | e.g. the solver, verbose, etc. 23 | 24 | Returns: 25 | (N,3) array of the closest points 26 | """ 27 | if solve_kwargs is None: 28 | solve_kwargs = {"verbose": False, "solver": cp.ECOS} 29 | 30 | box_A, box_b = _box_hrep(box_corners, box_center) 31 | 32 | N = points.shape[0] 33 | X = cp.Variable((3, N)) 34 | objective = cp.Minimize(cp.sum_squares(X - points.T)) 35 | 36 | # NOTE: use reshape to allow broadcasting of the inequality 37 | constraints = [box_A @ X <= (box_b + box_A @ box_center).reshape(-1, 1)] 38 | 39 | prob = cp.Problem(objective, constraints) 40 | prob.solve(**solve_kwargs) 41 | if prob.status not in ["optimal", "optimal_inaccurate"]: 42 | raise RuntimeError("Closest point QP did not reach optimal solution") 43 | 44 | return X.value.T 45 | 46 | 47 | def _box_hrep(corners: np.ndarray, center: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 48 | """ 49 | Computes H-rep arrays A and b such that a point p is in 50 | the box if A(p - center) <= b 51 | 52 | Args: 53 | corners: (8, 3) corners in the relative order outlined as follows: 54 | (4) +---------+. (5) 55 | | ` . | ` . 56 | | (0) +---+-----+ (1) 57 | | | | | 58 | (7) +-----+---+. (6)| 59 | ` . | ` . 
| 60 | (3) ` +---------+ (2) 61 | 62 | center: (3,) center of the box 63 | 64 | Returns: 65 | A: (6, 3) 66 | b: (6,) 67 | """ 68 | 69 | def face_hrep(c, vo, vx, vy): 70 | ex = vx - vo 71 | ey = vy - vo 72 | n = np.cross(ex, ey) 73 | n /= np.linalg.norm(n) 74 | d = np.dot(n, vo - c) 75 | return n, d 76 | 77 | face_hreps = [ 78 | face_hrep(center, corners[1], corners[0], corners[2]), 79 | face_hrep(center, corners[4], corners[5], corners[7]), 80 | face_hrep(center, corners[2], corners[3], corners[6]), 81 | face_hrep(center, corners[3], corners[0], corners[7]), 82 | face_hrep(center, corners[0], corners[1], corners[4]), 83 | face_hrep(center, corners[1], corners[2], corners[5]), 84 | ] 85 | 86 | A = np.concatenate([n.reshape(1, -1) for n, _ in face_hreps], axis=0) 87 | b = np.array([d for _, d in face_hreps]) 88 | 89 | return A, b 90 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | 190 | Copyright (c) 2022 OPPO 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | https://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include ram/configs/*.json 2 | include ram/configs/swin/*.json 3 | include ram/data/*.txt 4 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/datasets/openimages_common_214/imgs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/datasets/openimages_common_214/imgs/.gitkeep -------------------------------------------------------------------------------- /thirdparty/recognize-anything/datasets/openimages_common_214/openimages_common_214_ram_taglist.txt: -------------------------------------------------------------------------------- 1 | accident 2 | accordion 3 | plane 4 | airport 5 | antelope 6 | apple 7 | art gallery 8 | eggplant 9 | auditorium 10 | autumn 11 | baboon 12 | backpack 13 | bakery 14 | bamboo 15 | banana 16 | barbecue 17 | bed 18 | bedroom 19 | clock 20 | bicycle 21 | bikini 22 | birthday cake 23 | blackberry 24 | blueberry 25 | pig 26 | bookcase 27 | bridge 28 | broccoli 29 | bus 30 | butterfly 31 | calculator 32 | calendar 33 | camping 34 | candle 35 | candy 36 | cannon 37 | canyon 38 | car 39 | carousel 40 | cat 41 | cave 42 | ceiling 43 | cheese 44 | cheetah 45 | chef 46 | chicken 47 | christmas 48 | christmas tree 49 | clover 50 | coral 51 | corn 52 | courtyard 53 | crab 54 | lobster 55 | crocodile 56 | crosswalk 57 | crow 58 | cucumber 59 | cup 60 | currency 61 | dachshund 62 | deer 63 | desert 64 | die 65 | dinosaur 66 | dog 67 | dolphin 68 | doodle 69 | dragonfly 70 | drum 71 | duck 72 | dumbbell 73 | easter egg 74 | egg 75 | elephant 76 | faucet 77 | ferris wheel 78 | fire 79 | fireman 80 | firework 81 | flamingo 82 | flower 83 | football 84 | fountain 85 | fox 86 | fridge 87 | frog 88 | ham 89 | gas stove 90 | giraffe 91 | glacier 92 | glove 93 | goat 94 | goose 95 | gorilla 96 | grape 97 | guitar 98 | gull 99 | gym 100 | halloween 101 | hamburger 102 | hamster 103 | handbag 104 | hedgehog 105 | helicopter 106 | horse 107 | hummingbird 108 | jellyfish 109 | kangaroo 110 | kimono 111 | kite 112 | ladybird 113 | laptop 114 | leg 115 | mailbox 116 | library 117 | lightning 118 | lily 119 | lion 120 | lizard 121 | luggage 122 | mannequin 123 | map 124 | mask 125 | mattress 126 | microphone 127 | microwave 128 | monkey 129 | moon 130 | mosque 131 | mouse 132 | mushroom 133 | nebula 134 | sea 135 | ostrich 136 | palm tree 137 | paper 138 | pasta 139 | patient 140 | pavilion 141 | pear 142 | pebble 143 | penguin 144 | pet 145 | piano 146 | picture frame 147 | pine 148 | pineapple 149 | pizza 150 | police car 151 | pomegranate 152 | poodle 153 | popcorn 154 | stamp 155 | power station 156 | printer 157 | pumpkin 158 | raccoon 159 | rainbow 160 | rat 161 | restroom 162 | ring 163 | run 164 | salad 165 | sandwich 166 | sausage 167 | shark 168 | sheet music 169 | shrine 170 | snowboard 171 | snake 172 | sparrow 173 | squirrel 174 | stage 175 | starfish 176 | statue 177 | steering wheel 178 | stream 179 | street art 180 | street light 181 | submarine 182 | suite 183 | surfboard 184 | sushi 185 | swan 186 | tattoo 187 | teddy 188 | tennis court 189 | tennis racket 190 | tiger 191 | toast 192 | toilet bowl 193 | toy 194 | tractor 195 | train 
196 | trampoline 197 | treadmill 198 | truck 199 | tunnel 200 | turkey 201 | vending machine 202 | waffle 203 | walnut 204 | washing machine 205 | water buffalo 206 | waterfall 207 | watermelon 208 | wheat 209 | wheelchair 210 | windmill 211 | winter 212 | wolf 213 | woodpecker 214 | zebra 215 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/datasets/openimages_common_214/openimages_common_214_tag2text_tagidlist.txt: -------------------------------------------------------------------------------- 1 | 3 2 | 8 3 | 16 4 | 19 5 | 21 6 | 33 7 | 44 8 | 50 9 | 58 10 | 61 11 | 71 12 | 77 13 | 84 14 | 96 15 | 117 16 | 139 17 | 142 18 | 147 19 | 180 20 | 200 21 | 202 22 | 206 23 | 244 24 | 267 25 | 317 26 | 321 27 | 347 28 | 361 29 | 380 30 | 387 31 | 398 32 | 407 33 | 471 34 | 486 35 | 489 36 | 509 37 | 514 38 | 530 39 | 568 40 | 590 41 | 595 42 | 612 43 | 622 44 | 626 45 | 654 46 | 658 47 | 664 48 | 684 49 | 699 50 | 704 51 | 717 52 | 720 53 | 727 54 | 760 55 | 773 56 | 786 57 | 787 58 | 812 59 | 814 60 | 817 61 | 843 62 | 855 63 | 856 64 | 907 65 | 950 66 | 955 67 | 957 68 | 1023 69 | 1042 70 | 1056 71 | 1066 72 | 1091 73 | 1094 74 | 1108 75 | 1141 76 | 1148 77 | 1152 78 | 1168 79 | 1174 80 | 1187 81 | 1231 82 | 1235 83 | 1246 84 | 1276 85 | 1277 86 | 1305 87 | 1308 88 | 1344 89 | 1359 90 | 1362 91 | 1393 92 | 1394 93 | 1410 94 | 1411 95 | 1468 96 | 1504 97 | 1524 98 | 1536 99 | 1540 100 | 1542 101 | 1546 102 | 1553 103 | 1572 104 | 1574 105 | 1606 106 | 1610 107 | 1615 108 | 1655 109 | 1672 110 | 1680 111 | 1682 112 | 1687 113 | 1691 114 | 1692 115 | 1711 116 | 1712 117 | 1713 118 | 1719 119 | 1727 120 | 1733 121 | 1761 122 | 1770 123 | 1782 124 | 1784 125 | 1786 126 | 1803 127 | 1812 128 | 1816 129 | 1820 130 | 1829 131 | 1831 132 | 1841 133 | 1845 134 | 1878 135 | 1882 136 | 1931 137 | 1940 138 | 1944 139 | 1947 140 | 1974 141 | 1975 142 | 1977 143 | 2009 144 | 2031 145 | 2035 146 | 2052 147 | 2065 148 | 2110 149 | 2113 150 | 2138 151 | 2149 152 | 2154 153 | 2157 154 | 2174 155 | 2178 156 | 2184 157 | 2185 158 | 2202 159 | 2222 160 | 2233 161 | 2291 162 | 2301 163 | 2302 164 | 2317 165 | 2320 166 | 2351 167 | 2354 168 | 2373 169 | 2383 170 | 2393 171 | 2403 172 | 2413 173 | 2415 174 | 2417 175 | 2423 176 | 2449 177 | 2454 178 | 2455 179 | 2472 180 | 2494 181 | 2495 182 | 2528 183 | 2541 184 | 2543 185 | 2553 186 | 2563 187 | 2589 188 | 2603 189 | 2654 190 | 2656 191 | 2658 192 | 2676 193 | 2690 194 | 2693 195 | 2700 196 | 2708 197 | 2720 198 | 2721 199 | 2729 200 | 2732 201 | 2734 202 | 2756 203 | 2786 204 | 2792 205 | 2801 206 | 2821 207 | 2851 208 | 2887 209 | 2906 210 | 2909 211 | 2924 212 | 2929 213 | 2966 214 | 2980 215 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/datasets/openimages_rare_200/imgs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/datasets/openimages_rare_200/imgs/.gitkeep -------------------------------------------------------------------------------- /thirdparty/recognize-anything/datasets/openimages_rare_200/openimages_rare_200_ram_taglist.txt: -------------------------------------------------------------------------------- 1 | Aerial photography 2 | Aircraft engine 3 | Ale 4 | Aloe 5 | Amphibian 6 | Angling 7 | Anole 8 | Antique car 9 | Arcade game 10 | Arthropod 11 | 
Assault rifle 12 | Athletic shoe 13 | Auto racing 14 | Backlighting 15 | Bagpipes 16 | Ball game 17 | Barbecue chicken 18 | Barechested 19 | Barquentine 20 | Beef tenderloin 21 | Billiard room 22 | Billiards 23 | Bird of prey 24 | Black swan 25 | Black-and-white 26 | Blond 27 | Boating 28 | Bonbon 29 | Bottled water 30 | Bouldering 31 | Bovine 32 | Bratwurst 33 | Breadboard 34 | Briefs 35 | Brisket 36 | Brochette 37 | Calabaza 38 | Camera operator 39 | Canola 40 | Childbirth 41 | Chordophone 42 | Church bell 43 | Classical sculpture 44 | Close-up 45 | Cobblestone 46 | Coca-cola 47 | Combat sport 48 | Comics 49 | Compact car 50 | Computer speaker 51 | Cookies and crackers 52 | Coral reef fish 53 | Corn on the cob 54 | Cosmetics 55 | Crocodilia 56 | Digital camera 57 | Dishware 58 | Divemaster 59 | Dobermann 60 | Dog walking 61 | Domestic rabbit 62 | Domestic short-haired cat 63 | Double-decker bus 64 | Drums 65 | Electric guitar 66 | Electric piano 67 | Electronic instrument 68 | Equestrianism 69 | Equitation 70 | Erinaceidae 71 | Extreme sport 72 | Falafel 73 | Figure skating 74 | Filling station 75 | Fire apparatus 76 | Firearm 77 | Flatbread 78 | Floristry 79 | Forklift truck 80 | Freight transport 81 | Fried food 82 | Fried noodles 83 | Frigate 84 | Frozen yogurt 85 | Frying 86 | Full moon 87 | Galleon 88 | Glacial landform 89 | Gliding 90 | Go-kart 91 | Goats 92 | Grappling 93 | Great white shark 94 | Gumbo 95 | Gun turret 96 | Hair coloring 97 | Halter 98 | Headphones 99 | Heavy cruiser 100 | Herding 101 | High-speed rail 102 | Holding hands 103 | Horse and buggy 104 | Horse racing 105 | Hound 106 | Hunting knife 107 | Hurdling 108 | Inflatable 109 | Jackfruit 110 | Jeans 111 | Jiaozi 112 | Junk food 113 | Khinkali 114 | Kitesurfing 115 | Lawn game 116 | Leaf vegetable 117 | Lechon 118 | Lifebuoy 119 | Locust 120 | Lumpia 121 | Luxury vehicle 122 | Machine tool 123 | Medical imaging 124 | Melee weapon 125 | Microcontroller 126 | Middle ages 127 | Military person 128 | Military vehicle 129 | Milky way 130 | Miniature Poodle 131 | Modern dance 132 | Molluscs 133 | Monoplane 134 | Motorcycling 135 | Musical theatre 136 | Narcissus 137 | Nest box 138 | Newsagent's shop 139 | Nile crocodile 140 | Nordic skiing 141 | Nuclear power plant 142 | Orator 143 | Outdoor shoe 144 | Parachuting 145 | Pasta salad 146 | Peafowl 147 | Pelmeni 148 | Perching bird 149 | Performance car 150 | Personal water craft 151 | Pit bull 152 | Plant stem 153 | Pork chop 154 | Portrait photography 155 | Primate 156 | Procyonidae 157 | Prosciutto 158 | Public speaking 159 | Racewalking 160 | Ramen 161 | Rear-view mirror 162 | Residential area 163 | Ribs 164 | Rice ball 165 | Road cycling 166 | Roller skating 167 | Roman temple 168 | Rowing 169 | Rural area 170 | Sailboat racing 171 | Scaled reptile 172 | Scuba diving 173 | Senior citizen 174 | Shallot 175 | Shinto shrine 176 | Shooting range 177 | Siberian husky 178 | Sledding 179 | Soba 180 | Solar energy 181 | Sport climbing 182 | Sport utility vehicle 183 | Steamed rice 184 | Stemware 185 | Sumo 186 | Surfing Equipment 187 | Team sport 188 | Touring car 189 | Toy block 190 | Trampolining 191 | Underwater diving 192 | Vegetarian food 193 | Wallaby 194 | Water polo 195 | Watercolor paint 196 | Whiskers 197 | Wind wave 198 | Woodwind instrument 199 | Yakitori 200 | Zeppelin 201 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/1641173_2291260800.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/1641173_2291260800.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/demo/demo1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/demo/demo1.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/demo/demo2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/demo/demo2.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/demo/demo3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/demo/demo3.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/demo/demo4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/demo/demo4.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/experiment_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/experiment_comparison.png -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/localization_and_recognition.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/localization_and_recognition.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/openset_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/openset_example.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/ram_grounded_sam.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/ram_grounded_sam.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/tag2text_framework.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/tag2text_framework.png -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/tag2text_grounded_sam.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/tag2text_grounded_sam.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/tagging_results.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/tagging_results.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/inference_ram.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Recognize Anything Model (RAM) 3 | * Written by Xinyu Huang 4 | ''' 5 | import argparse 6 | import numpy as np 7 | import random 8 | import time 9 | 10 | import torch 11 | 12 | from PIL import Image 13 | from ram.models import ram 14 | from ram import inference_ram as inference 15 | from ram import get_transform 16 | 17 | import matplotlib.pyplot as plt 18 | 19 | parser = argparse.ArgumentParser( 20 | description='RAM inference for tagging') 21 | parser.add_argument('--image', 22 | metavar='DIR', 23 | help='path to dataset', 24 | default='images/1641173_2291260800.jpg') 25 | parser.add_argument('--pretrained', 26 | metavar='DIR', 27 | help='path to pretrained model', 28 | default='pretrained/ram_swin_large_14m.pth') 29 | parser.add_argument('--image-size', 30 | default=384, 31 | type=int, 32 | metavar='N', 33 | help='input image size (default: 384)') 34 | 35 | 36 | if __name__ == "__main__": 37 | 38 | args = parser.parse_args() 39 | 40 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 41 | 42 | transform = get_transform(image_size=args.image_size) 43 | 44 | #######load model 45 | model = ram(pretrained=args.pretrained, 46 | image_size=args.image_size, 47 | vit='swin_l') 48 | model.eval() 49 | 50 | model = model.to(device) 51 | 52 | image = transform(Image.open(args.image)).unsqueeze(0).to(device) 53 | 54 | print('image shape: ', image.shape) 55 | plt.imshow(image.squeeze().permute(1,2,0).cpu().numpy()) 56 | plt.show() 57 | 58 | start_inference_time = time.time() 59 | res = inference(image, model) 60 | print('Inference time: ', time.time() - start_inference_time) 61 | 62 | print("Image Tags: ", res[0]) 63 | print("Confidence: ", " | ".join(["{:.3f}".format(conf) for conf in res[1]])) 64 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/inference_ram_combined.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Recognize Anything Model (RAM) inference on seen AND unseen classes 3 | ''' 4 | import argparse 5 | import numpy as np 6 | import random 7 | 8 | import torch 9 | 10 | from PIL import Image 11 | from ram.models import ram 12 | from ram import inference_ram_openset as inference 13 | from ram import get_transform 14 | 15 | from ram.utils import build_openset_label_embedding 16 | from torch import nn 17 | 18 | parser = 
argparse.ArgumentParser( 19 | description='RAM inference for tagging') 20 | parser.add_argument('--image', 21 | metavar='DIR', 22 | help='path to dataset', 23 | default='images/openset_example.jpg') 24 | parser.add_argument('--pretrained', 25 | metavar='DIR', 26 | help='path to pretrained model', 27 | default='pretrained/ram_swin_large_14m.pth') 28 | parser.add_argument('--image-size', 29 | default=384, 30 | type=int, 31 | metavar='N', 32 | help='input image size (default: 384)') 33 | 34 | 35 | if __name__ == "__main__": 36 | 37 | args = parser.parse_args() 38 | 39 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 40 | 41 | transform = get_transform(image_size=args.image_size) 42 | 43 | #######load model 44 | model = ram(pretrained=args.pretrained, 45 | image_size=args.image_size, 46 | vit='swin_l') 47 | 48 | model.eval() 49 | 50 | model = model.to(device) 51 | 52 | #######set openset interference 53 | openset_label_embedding, openset_categories = build_openset_label_embedding() 54 | 55 | model.tag_list = np.concatenate( 56 | (model.tag_list, np.array(openset_categories))) 57 | 58 | model.label_embed = nn.Parameter(torch.cat( 59 | (model.label_embed, openset_label_embedding.float()))) 60 | 61 | model.num_class = len(model.tag_list) 62 | 63 | # the threshold for unseen categories is often lower 64 | openset_class_threshold = torch.ones(len(openset_categories)) * 0.5 65 | model.class_threshold = torch.cat( 66 | (model.class_threshold, openset_class_threshold)) 67 | ####### 68 | 69 | image = transform(Image.open(args.image)).unsqueeze(0).to(device) 70 | 71 | res = inference(image, model) 72 | print("Image Tags: ", res) 73 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/inference_ram_openset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Recognize Anything Model (RAM) inference on unseen classes 3 | * Written by Xinyu Huang 4 | ''' 5 | import argparse 6 | import numpy as np 7 | import random 8 | 9 | import torch 10 | 11 | from PIL import Image 12 | from ram.models import ram 13 | from ram import inference_ram_openset as inference 14 | from ram import get_transform 15 | 16 | from ram.utils import build_openset_label_embedding 17 | from torch import nn 18 | 19 | parser = argparse.ArgumentParser( 20 | description='RAM inference for tagging') 21 | parser.add_argument('--image', 22 | metavar='DIR', 23 | help='path to dataset', 24 | default='images/openset_example.jpg') 25 | parser.add_argument('--pretrained', 26 | metavar='DIR', 27 | help='path to pretrained model', 28 | default='pretrained/ram_swin_large_14m.pth') 29 | parser.add_argument('--image-size', 30 | default=384, 31 | type=int, 32 | metavar='N', 33 | help='input image size (default: 384)') 34 | 35 | 36 | if __name__ == "__main__": 37 | 38 | args = parser.parse_args() 39 | 40 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 41 | 42 | transform = get_transform(image_size=args.image_size) 43 | 44 | #######load model 45 | model = ram(pretrained=args.pretrained, 46 | image_size=args.image_size, 47 | vit='swin_l') 48 | 49 | #######set openset interference 50 | openset_label_embedding, openset_categories = build_openset_label_embedding() 51 | 52 | model.tag_list = np.array(openset_categories) 53 | 54 | model.label_embed = nn.Parameter(openset_label_embedding.float()) 55 | 56 | model.num_class = len(openset_categories) 57 | # the threshold for unseen categories is often lower 58 | 
model.class_threshold = torch.ones(model.num_class) * 0.5 59 | ####### 60 | 61 | model.eval() 62 | 63 | model = model.to(device) 64 | 65 | image = transform(Image.open(args.image)).unsqueeze(0).to(device) 66 | 67 | res = inference(image, model) 68 | print("Image Tags: ", res) 69 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/inference_tag2text.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Tag2Text Model 3 | * Written by Xinyu Huang 4 | ''' 5 | import argparse 6 | import numpy as np 7 | import random 8 | 9 | import torch 10 | 11 | from PIL import Image 12 | from ram.models import tag2text 13 | from ram import inference_tag2text as inference 14 | from ram import get_transform 15 | 16 | 17 | parser = argparse.ArgumentParser( 18 | description='Tag2Text inferece for tagging and captioning') 19 | parser.add_argument('--image', 20 | metavar='DIR', 21 | help='path to dataset', 22 | default='images/1641173_2291260800.jpg') 23 | parser.add_argument('--pretrained', 24 | metavar='DIR', 25 | help='path to pretrained model', 26 | default='pretrained/tag2text_swin_14m.pth') 27 | parser.add_argument('--image-size', 28 | default=384, 29 | type=int, 30 | metavar='N', 31 | help='input image size (default: 448)') 32 | parser.add_argument('--thre', 33 | default=0.68, 34 | type=float, 35 | metavar='N', 36 | help='threshold value') 37 | parser.add_argument('--specified-tags', 38 | default='None', 39 | help='User input specified tags') 40 | 41 | 42 | if __name__ == "__main__": 43 | 44 | args = parser.parse_args() 45 | 46 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 47 | 48 | transform = get_transform(image_size=args.image_size) 49 | 50 | # delete some tags that may disturb captioning 51 | # 127: "quarter"; 2961: "back", 3351: "two"; 3265: "three"; 3338: "four"; 3355: "five"; 3359: "one" 52 | delete_tag_index = [127,2961, 3351, 3265, 3338, 3355, 3359] 53 | 54 | #######load model 55 | model = tag2text(pretrained=args.pretrained, 56 | image_size=args.image_size, 57 | vit='swin_b', 58 | delete_tag_index=delete_tag_index) 59 | model.threshold = args.thre # threshold for tagging 60 | model.eval() 61 | 62 | model = model.to(device) 63 | 64 | image = transform(Image.open(args.image)).unsqueeze(0).to(device) 65 | 66 | res = inference(image, model, args.specified_tags) 67 | print("Model Identified Tags: ", res[0]) 68 | print("User Specified Tags: ", res[1]) 69 | print("Image Caption: ", res[2]) 70 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import inference_tag2text, inference_ram, inference_ram_openset 2 | from .transform import get_transform 3 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/configs/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | 
"type_vocab_size": 2, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/configs/q2l_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 4, 15 | "num_hidden_layers": 2, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "add_tag_cross_attention": false 22 | } -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/configs/swin/config_swinB_384.json: -------------------------------------------------------------------------------- 1 | { 2 | "ckpt": "pretrain_model/swin_base_patch4_window7_224_22k.pth", 3 | "vision_width": 1024, 4 | "image_res": 384, 5 | "window_size": 12, 6 | "embed_dim": 128, 7 | "depths": [ 2, 2, 18, 2 ], 8 | "num_heads": [ 4, 8, 16, 32 ] 9 | } -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/configs/swin/config_swinL_384.json: -------------------------------------------------------------------------------- 1 | { 2 | "ckpt": "pretrain_model/swin_large_patch4_window12_384_22k.pth", 3 | "vision_width": 1536, 4 | "image_res": 384, 5 | "window_size": 12, 6 | "embed_dim": 192, 7 | "depths": [ 2, 2, 18, 2 ], 8 | "num_heads": [ 6, 12, 24, 48 ] 9 | } -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/inference.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Inference of RAM and Tag2Text Models 3 | * Written by Xinyu Huang 4 | ''' 5 | import torch 6 | 7 | 8 | def inference_tag2text(image, model, input_tag="None"): 9 | 10 | with torch.no_grad(): 11 | caption, tag_predict = model.generate(image, 12 | tag_input=None, 13 | max_length=50, 14 | return_tag_predict=True) 15 | 16 | if input_tag == '' or input_tag == 'none' or input_tag == 'None': 17 | return tag_predict[0], None, caption[0] 18 | 19 | # If user input specified tags: 20 | else: 21 | input_tag_list = [] 22 | input_tag_list.append(input_tag.replace(',', ' | ')) 23 | 24 | with torch.no_grad(): 25 | caption, input_tag = model.generate(image, 26 | tag_input=input_tag_list, 27 | max_length=50, 28 | return_tag_predict=True) 29 | 30 | return tag_predict[0], input_tag[0], caption[0] 31 | 32 | 33 | def inference_ram(image, model): 34 | 35 | with torch.no_grad(): 36 | tags, confidences = model.generate_tag(image) 37 | 38 | return tags[0], confidences[0] 39 | 40 | 41 | def inference_ram_openset(image, model): 42 | 43 | with torch.no_grad(): 44 | tags = model.generate_tag_openset(image) 45 | 46 | return tags[0] 47 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .ram import ram, RAM 2 | from .ram_plus import ram_plus, RAM_plus 3 | from .tag2text import tag2text 4 | 
-------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/models/ram.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Recognize Anything Model (RAM) 3 | * Written by Xinyu Huang 4 | ''' 5 | import json 6 | import warnings 7 | 8 | import numpy as np 9 | import torch 10 | from torch import nn 11 | 12 | from .bert import BertConfig, BertModel 13 | from .swin_transformer import SwinTransformer 14 | from .utils import * 15 | 16 | warnings.filterwarnings("ignore") 17 | 18 | 19 | 20 | class RAM(nn.Module): 21 | def __init__(self, 22 | med_config=f'{CONFIG_PATH}/configs/med_config.json', 23 | image_size=384, 24 | vit='base', 25 | vit_grad_ckpt=False, 26 | vit_ckpt_layer=0, 27 | prompt='a picture of ', 28 | threshold=0.68, 29 | delete_tag_index=[], 30 | tag_list=f'{CONFIG_PATH}/data/ram_tag_list.txt'): 31 | r""" The Recognize Anything Model (RAM) inference module. 32 | RAM is a strong image tagging model, which can recognize any common category with high accuracy. 33 | Described in the paper " Recognize Anything: A Strong Image Tagging Model" https://recognize-anything.github.io/ 34 | 35 | Args: 36 | med_config (str): path for the mixture of encoder-decoder model's configuration file 37 | image_size (int): input image size 38 | vit (str): model size of vision transformer 39 | threshold (int): tagging threshold 40 | delete_tag_index (list): delete some tags that may disturb captioning 41 | """ 42 | super().__init__() 43 | 44 | # create image encoder 45 | self.image_size = image_size 46 | if vit == 'swin_b': 47 | if image_size == 224: 48 | vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_224.json' 49 | elif image_size == 384: 50 | vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_384.json' 51 | vision_config = read_json(vision_config_path) 52 | assert image_size == vision_config['image_res'] 53 | # assert config['patch_size'] == 32 54 | vision_width = vision_config['vision_width'] 55 | 56 | self.visual_encoder = SwinTransformer( 57 | img_size=vision_config['image_res'], 58 | patch_size=4, 59 | in_chans=3, 60 | embed_dim=vision_config['embed_dim'], 61 | depths=vision_config['depths'], 62 | num_heads=vision_config['num_heads'], 63 | window_size=vision_config['window_size'], 64 | mlp_ratio=4., 65 | qkv_bias=True, 66 | drop_rate=0.0, 67 | drop_path_rate=0.1, 68 | ape=False, 69 | patch_norm=True, 70 | use_checkpoint=False) 71 | 72 | elif vit == 'swin_l': 73 | if image_size == 224: 74 | vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_224.json' 75 | elif image_size == 384: 76 | vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_384.json' 77 | vision_config = read_json(vision_config_path) 78 | assert image_size == vision_config['image_res'] 79 | # assert config['patch_size'] == 32 80 | vision_width = vision_config['vision_width'] 81 | 82 | self.visual_encoder = SwinTransformer( 83 | img_size=vision_config['image_res'], 84 | patch_size=4, 85 | in_chans=3, 86 | embed_dim=vision_config['embed_dim'], 87 | depths=vision_config['depths'], 88 | num_heads=vision_config['num_heads'], 89 | window_size=vision_config['window_size'], 90 | mlp_ratio=4., 91 | qkv_bias=True, 92 | drop_rate=0.0, 93 | drop_path_rate=0.1, 94 | ape=False, 95 | patch_norm=True, 96 | use_checkpoint=False) 97 | 98 | else: 99 | self.visual_encoder, vision_width = create_vit( 100 | vit, image_size, vit_grad_ckpt, vit_ckpt_layer) 101 | 102 | # create tokenzier 103 | self.tokenizer = 
init_tokenizer() 104 | 105 | # Tag2Text employ encoder-decoder architecture for image-tag-text generation: image-tag interaction encoder and image-tag-text decoder 106 | # create image-tag interaction encoder 107 | encoder_config = BertConfig.from_json_file(med_config) 108 | encoder_config.encoder_width = 512 109 | self.tag_encoder = BertModel(config=encoder_config, 110 | add_pooling_layer=False) 111 | 112 | self.delete_tag_index = delete_tag_index 113 | self.prompt = prompt 114 | self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1 115 | 116 | # load tag list 117 | self.tag_list = self.load_tag_list(tag_list) 118 | 119 | # create image-tag recognition decoder 120 | self.threshold = threshold 121 | self.num_class = len(self.tag_list) 122 | q2l_config = BertConfig.from_json_file(f'{CONFIG_PATH}/configs/q2l_config.json') 123 | q2l_config.encoder_width = 512 124 | self.tagging_head = BertModel(config=q2l_config, 125 | add_pooling_layer=False) 126 | self.tagging_head.resize_token_embeddings(len(self.tokenizer)) 127 | # self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size) 128 | self.label_embed = nn.Parameter(torch.zeros(self.num_class, q2l_config.encoder_width)) 129 | 130 | if q2l_config.hidden_size != 512: 131 | self.wordvec_proj = nn.Linear(512, q2l_config.hidden_size) 132 | else: 133 | self.wordvec_proj = nn.Identity() 134 | 135 | self.fc = nn.Linear(q2l_config.hidden_size, 1) 136 | 137 | self.del_selfattention() 138 | 139 | # share weights of the lowest 2-layer of "image-tag interaction encoder" with the "image-tag recogntion decoder" 140 | tie_encoder_decoder_weights(self.tag_encoder, self.tagging_head, '', 141 | ' ') 142 | self.image_proj = nn.Linear(vision_width, 512) 143 | # self.label_embed = nn.Parameter(torch.load(f'{CONFIG_PATH}/data/textual_label_embedding.pth',map_location='cpu').float()) 144 | 145 | # adjust thresholds for some tags 146 | self.class_threshold = torch.ones(self.num_class) * self.threshold 147 | ram_class_threshold_path = f'{CONFIG_PATH}/data/ram_tag_list_threshold.txt' 148 | with open(ram_class_threshold_path, 'r', encoding='utf-8') as f: 149 | ram_class_threshold = [float(s.strip()) for s in f] 150 | for key,value in enumerate(ram_class_threshold): 151 | self.class_threshold[key] = value 152 | 153 | def override_class_threshold(self, cls, threshold): 154 | assert (type(cls) == str) and (type(threshold) == float) 155 | try: 156 | cls_idx = int(np.where(cls == self.tag_list)[0]) 157 | except: 158 | raise ValueError('{} not in the tag list'.format(cls)) 159 | assert ( 160 | threshold >= 0.0 and threshold <= 1.0 161 | ), "threshold must be between 0 and 1" 162 | self.class_threshold[cls_idx] = threshold 163 | 164 | def load_tag_list(self, tag_list_file): 165 | with open(tag_list_file, 'r', encoding="utf-8") as f: 166 | tag_list = f.read().splitlines() 167 | tag_list = np.array(tag_list) 168 | return tag_list 169 | 170 | # delete self-attention layer of image-tag recognition decoder to reduce computation, follower Query2Label 171 | def del_selfattention(self): 172 | del self.tagging_head.embeddings 173 | for layer in self.tagging_head.encoder.layer: 174 | del layer.attention 175 | 176 | def generate_tag(self, 177 | image, 178 | threshold=0.68, 179 | tag_input=None, 180 | ): 181 | 182 | label_embed = torch.nn.functional.relu(self.wordvec_proj(self.label_embed)) 183 | 184 | image_embeds = self.image_proj(self.visual_encoder(image)) 185 | image_atts = torch.ones(image_embeds.size()[:-1], 186 | dtype=torch.long).to(image.device) 187 | 
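# Query2Label-style decoding: every tag's label embedding acts as a query that
# cross-attends to the image embeddings through the tagging head (whose
# self-attention was removed in del_selfattention), and the linear head +
# sigmoid below turn each query's output into one score per tag.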
188 | # recognized image tags using image-tag recogntiion decoder 189 | image_cls_embeds = image_embeds[:, 0, :] 190 | image_spatial_embeds = image_embeds[:, 1:, :] 191 | 192 | bs = image_spatial_embeds.shape[0] 193 | label_embed = label_embed.unsqueeze(0).repeat(bs, 1, 1) 194 | tagging_embed = self.tagging_head( 195 | encoder_embeds=label_embed, 196 | encoder_hidden_states=image_embeds, 197 | encoder_attention_mask=image_atts, 198 | return_dict=False, 199 | mode='tagging', 200 | ) 201 | 202 | class_scores = torch.sigmoid( 203 | self.fc(tagging_embed[0]).squeeze(-1)) 204 | 205 | targets = torch.where( 206 | class_scores > self.class_threshold.to(image.device), 207 | torch.tensor(1.0).to(image.device), 208 | torch.zeros(self.num_class).to(image.device)) 209 | 210 | tag = targets.cpu().numpy() 211 | tag[:,self.delete_tag_index] = 0 212 | tag_output = [] 213 | tag_confidences = [] 214 | for b in range(bs): 215 | index = np.argwhere(tag[b] == 1) 216 | confidences = class_scores[b, index].cpu().numpy().reshape(-1) 217 | tag_confidences.append(confidences) 218 | token = self.tag_list[index].squeeze(axis=1) 219 | tag_output.append(' | '.join(token)) 220 | 221 | return tag_output, tag_confidences 222 | 223 | def generate_tag_openset(self, 224 | image, 225 | threshold=0.68, 226 | tag_input=None, 227 | ): 228 | 229 | label_embed = torch.nn.functional.relu(self.wordvec_proj(self.label_embed)) 230 | 231 | image_embeds = self.image_proj(self.visual_encoder(image)) 232 | image_atts = torch.ones(image_embeds.size()[:-1], 233 | dtype=torch.long).to(image.device) 234 | 235 | # recognized image tags using image-tag recogntiion decoder 236 | image_cls_embeds = image_embeds[:, 0, :] 237 | image_spatial_embeds = image_embeds[:, 1:, :] 238 | 239 | bs = image_spatial_embeds.shape[0] 240 | label_embed = label_embed.unsqueeze(0).repeat(bs, 1, 1) 241 | tagging_embed = self.tagging_head( 242 | encoder_embeds=label_embed, 243 | encoder_hidden_states=image_embeds, 244 | encoder_attention_mask=image_atts, 245 | return_dict=False, 246 | mode='tagging', 247 | ) 248 | 249 | class_scores = torch.sigmoid(self.fc(tagging_embed[0]).squeeze(-1)) 250 | 251 | targets = torch.where( 252 | class_scores > self.class_threshold.to(image.device), 253 | torch.tensor(1.0).to(image.device), 254 | torch.zeros(self.num_class).to(image.device)) 255 | 256 | tag = targets.cpu().numpy() 257 | tag[:,self.delete_tag_index] = 0 258 | tag_output = [] 259 | for b in range(bs): 260 | index = np.argwhere(tag[b] == 1) 261 | token = self.tag_list[index].squeeze(axis=1) 262 | tag_output.append(' | '.join(token)) 263 | 264 | # TODO also return tag confidences!!! 
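# One possible sketch for the TODO above (an assumption, mirroring generate_tag):
# initialize tag_confidences = [] before the loop, append
#   class_scores[b, index].cpu().numpy().reshape(-1)
# inside it, and return (tag_output, tag_confidences) instead of tag_output alone.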
265 | 266 | return tag_output 267 | 268 | 269 | # load RAM pretrained model parameters 270 | def ram(pretrained='', **kwargs): 271 | model = RAM(**kwargs) 272 | if pretrained: 273 | if kwargs['vit'] == 'swin_b': 274 | model, msg = load_checkpoint_swinbase(model, pretrained, kwargs) 275 | elif kwargs['vit'] == 'swin_l': 276 | model, msg = load_checkpoint_swinlarge(model, pretrained, kwargs) 277 | else: 278 | model, msg = load_checkpoint(model, pretrained) 279 | print('vit:', kwargs['vit']) 280 | # print('msg', msg) 281 | return model 282 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/transform.py: -------------------------------------------------------------------------------- 1 | from torchvision.transforms import Normalize, Compose, Resize, ToTensor 2 | 3 | 4 | def get_transform(image_size=384): 5 | return Compose([ 6 | lambda image: image.convert("RGB"), 7 | Resize((image_size, image_size)), 8 | ToTensor(), 9 | Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 10 | ]) 11 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .metrics import get_mAP, get_PR 2 | from .openset_utils import build_openset_label_embedding 3 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from numpy import ndarray 5 | 6 | 7 | def get_mAP( 8 | preds: ndarray, 9 | gt_file: str, 10 | taglist: List[str] 11 | ) -> Tuple[float, ndarray]: 12 | assert preds.shape[1] == len(taglist) 13 | 14 | # When mapping categories from test datasets to our system, there might be 15 | # multiple vs one situation due to different semantic definitions of tags. 16 | # So there can be duplicate tags in `taglist`. This special case is taken 17 | # into account. 
17 | # into account.
18 | tag2idxs = {} 19 | for idx, tag in enumerate(taglist): 20 | if tag not in tag2idxs: 21 | tag2idxs[tag] = [] 22 | tag2idxs[tag].append(idx) 23 | 24 | # build targets 25 | targets = np.zeros_like(preds) 26 | with open(gt_file, "r") as f: 27 | lines = [line.strip("\n").split(",") for line in f.readlines()] 28 | assert len(lines) == targets.shape[0] 29 | for i, line in enumerate(lines): 30 | for tag in line[1:]: 31 | targets[i, tag2idxs[tag]] = 1.0 32 | 33 | # compute average precision for each class 34 | APs = np.zeros(preds.shape[1]) 35 | for k in range(preds.shape[1]): 36 | APs[k] = _average_precision(preds[:, k], targets[:, k]) 37 | 38 | return APs.mean(), APs 39 | 40 | 41 | def _average_precision(output: ndarray, target: ndarray) -> float: 42 | epsilon = 1e-8 43 | 44 | # sort examples 45 | indices = output.argsort()[::-1] 46 | # Computes prec@i 47 | total_count_ = np.cumsum(np.ones((len(output), 1))) 48 | 49 | target_ = target[indices] 50 | ind = target_ == 1 51 | pos_count_ = np.cumsum(ind) 52 | total = pos_count_[-1] 53 | pos_count_[np.logical_not(ind)] = 0 54 | pp = pos_count_ / total_count_ 55 | precision_at_i_ = np.sum(pp) 56 | precision_at_i = precision_at_i_ / (total + epsilon) 57 | 58 | return precision_at_i 59 | 60 | 61 | def get_PR( 62 | pred_file: str, 63 | gt_file: str, 64 | taglist: List[str] 65 | ) -> Tuple[float, float, ndarray, ndarray]: 66 | # When mapping categories from test datasets to our system, there might be 67 | # multiple vs one situation due to different semantic definitions of tags. 68 | # So there can be duplicate tags in `taglist`. This special case is taken 69 | # into account. 70 | tag2idxs = {} 71 | for idx, tag in enumerate(taglist): 72 | if tag not in tag2idxs: 73 | tag2idxs[tag] = [] 74 | tag2idxs[tag].append(idx) 75 | 76 | # build preds 77 | with open(pred_file, "r", encoding="utf-8") as f: 78 | lines = [line.strip().split(",") for line in f.readlines()] 79 | preds = np.zeros((len(lines), len(tag2idxs)), dtype=bool) 80 | for i, line in enumerate(lines): 81 | for tag in line[1:]: 82 | preds[i, tag2idxs[tag]] = True 83 | 84 | # build targets 85 | with open(gt_file, "r", encoding="utf-8") as f: 86 | lines = [line.strip().split(",") for line in f.readlines()] 87 | targets = np.zeros((len(lines), len(tag2idxs)), dtype=bool) 88 | for i, line in enumerate(lines): 89 | for tag in line[1:]: 90 | targets[i, tag2idxs[tag]] = True 91 | 92 | assert preds.shape == targets.shape 93 | 94 | # calculate P and R 95 | TPs = ( preds & targets).sum(axis=0) # noqa: E201, E222 96 | FPs = ( preds & ~targets).sum(axis=0) # noqa: E201, E222 97 | FNs = (~preds & targets).sum(axis=0) # noqa: E201, E222 98 | eps = 1.e-9 99 | Ps = TPs / (TPs + FPs + eps) 100 | Rs = TPs / (TPs + FNs + eps) 101 | 102 | return Ps.mean(), Rs.mean(), Ps, Rs 103 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/utils/openset_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | from clip import clip 7 | 8 | 9 | def article(name): 10 | return "an" if name[0] in "aeiou" else "a" 11 | 12 | 13 | def processed_name(name, rm_dot=False): 14 | # _ for lvis 15 | # / for obj365 16 | res = name.replace("_", " ").replace("/", " or ").lower() 17 | if rm_dot: 18 | res = res.rstrip(".") 19 | return res 20 | 21 | 22 | single_template = ["a photo of a {}."] 23 | 24 | multiple_templates = [ 25 | "There is {article} {} in the scene.", 26 | "There is the {} 
in the scene.", 27 | "a photo of {article} {} in the scene.", 28 | "a photo of the {} in the scene.", 29 | "a photo of one {} in the scene.", 30 | "itap of {article} {}.", 31 | "itap of my {}.", # itap: I took a picture of 32 | "itap of the {}.", 33 | "a photo of {article} {}.", 34 | "a photo of my {}.", 35 | "a photo of the {}.", 36 | "a photo of one {}.", 37 | "a photo of many {}.", 38 | "a good photo of {article} {}.", 39 | "a good photo of the {}.", 40 | "a bad photo of {article} {}.", 41 | "a bad photo of the {}.", 42 | "a photo of a nice {}.", 43 | "a photo of the nice {}.", 44 | "a photo of a cool {}.", 45 | "a photo of the cool {}.", 46 | "a photo of a weird {}.", 47 | "a photo of the weird {}.", 48 | "a photo of a small {}.", 49 | "a photo of the small {}.", 50 | "a photo of a large {}.", 51 | "a photo of the large {}.", 52 | "a photo of a clean {}.", 53 | "a photo of the clean {}.", 54 | "a photo of a dirty {}.", 55 | "a photo of the dirty {}.", 56 | "a bright photo of {article} {}.", 57 | "a bright photo of the {}.", 58 | "a dark photo of {article} {}.", 59 | "a dark photo of the {}.", 60 | "a photo of a hard to see {}.", 61 | "a photo of the hard to see {}.", 62 | "a low resolution photo of {article} {}.", 63 | "a low resolution photo of the {}.", 64 | "a cropped photo of {article} {}.", 65 | "a cropped photo of the {}.", 66 | "a close-up photo of {article} {}.", 67 | "a close-up photo of the {}.", 68 | "a jpeg corrupted photo of {article} {}.", 69 | "a jpeg corrupted photo of the {}.", 70 | "a blurry photo of {article} {}.", 71 | "a blurry photo of the {}.", 72 | "a pixelated photo of {article} {}.", 73 | "a pixelated photo of the {}.", 74 | "a black and white photo of the {}.", 75 | "a black and white photo of {article} {}.", 76 | "a plastic {}.", 77 | "the plastic {}.", 78 | "a toy {}.", 79 | "the toy {}.", 80 | "a plushie {}.", 81 | "the plushie {}.", 82 | "a cartoon {}.", 83 | "the cartoon {}.", 84 | "an embroidered {}.", 85 | "the embroidered {}.", 86 | "a painting of the {}.", 87 | "a painting of a {}.", 88 | ] 89 | 90 | 91 | openimages_rare_unseen = ['Aerial photography', 92 | 'Aircraft engine', 93 | 'Ale', 94 | 'Aloe', 95 | 'Amphibian', 96 | 'Angling', 97 | 'Anole', 98 | 'Antique car', 99 | 'Arcade game', 100 | 'Arthropod', 101 | 'Assault rifle', 102 | 'Athletic shoe', 103 | 'Auto racing', 104 | 'Backlighting', 105 | 'Bagpipes', 106 | 'Ball game', 107 | 'Barbecue chicken', 108 | 'Barechested', 109 | 'Barquentine', 110 | 'Beef tenderloin', 111 | 'Billiard room', 112 | 'Billiards', 113 | 'Bird of prey', 114 | 'Black swan', 115 | 'Black-and-white', 116 | 'Blond', 117 | 'Boating', 118 | 'Bonbon', 119 | 'Bottled water', 120 | 'Bouldering', 121 | 'Bovine', 122 | 'Bratwurst', 123 | 'Breadboard', 124 | 'Briefs', 125 | 'Brisket', 126 | 'Brochette', 127 | 'Calabaza', 128 | 'Camera operator', 129 | 'Canola', 130 | 'Childbirth', 131 | 'Chordophone', 132 | 'Church bell', 133 | 'Classical sculpture', 134 | 'Close-up', 135 | 'Cobblestone', 136 | 'Coca-cola', 137 | 'Combat sport', 138 | 'Comics', 139 | 'Compact car', 140 | 'Computer speaker', 141 | 'Cookies and crackers', 142 | 'Coral reef fish', 143 | 'Corn on the cob', 144 | 'Cosmetics', 145 | 'Crocodilia', 146 | 'Digital camera', 147 | 'Dishware', 148 | 'Divemaster', 149 | 'Dobermann', 150 | 'Dog walking', 151 | 'Domestic rabbit', 152 | 'Domestic short-haired cat', 153 | 'Double-decker bus', 154 | 'Drums', 155 | 'Electric guitar', 156 | 'Electric piano', 157 | 'Electronic instrument', 158 | 'Equestrianism', 159 | 'Equitation', 160 
| 'Erinaceidae', 161 | 'Extreme sport', 162 | 'Falafel', 163 | 'Figure skating', 164 | 'Filling station', 165 | 'Fire apparatus', 166 | 'Firearm', 167 | 'Flatbread', 168 | 'Floristry', 169 | 'Forklift truck', 170 | 'Freight transport', 171 | 'Fried food', 172 | 'Fried noodles', 173 | 'Frigate', 174 | 'Frozen yogurt', 175 | 'Frying', 176 | 'Full moon', 177 | 'Galleon', 178 | 'Glacial landform', 179 | 'Gliding', 180 | 'Go-kart', 181 | 'Goats', 182 | 'Grappling', 183 | 'Great white shark', 184 | 'Gumbo', 185 | 'Gun turret', 186 | 'Hair coloring', 187 | 'Halter', 188 | 'Headphones', 189 | 'Heavy cruiser', 190 | 'Herding', 191 | 'High-speed rail', 192 | 'Holding hands', 193 | 'Horse and buggy', 194 | 'Horse racing', 195 | 'Hound', 196 | 'Hunting knife', 197 | 'Hurdling', 198 | 'Inflatable', 199 | 'Jackfruit', 200 | 'Jeans', 201 | 'Jiaozi', 202 | 'Junk food', 203 | 'Khinkali', 204 | 'Kitesurfing', 205 | 'Lawn game', 206 | 'Leaf vegetable', 207 | 'Lechon', 208 | 'Lifebuoy', 209 | 'Locust', 210 | 'Lumpia', 211 | 'Luxury vehicle', 212 | 'Machine tool', 213 | 'Medical imaging', 214 | 'Melee weapon', 215 | 'Microcontroller', 216 | 'Middle ages', 217 | 'Military person', 218 | 'Military vehicle', 219 | 'Milky way', 220 | 'Miniature Poodle', 221 | 'Modern dance', 222 | 'Molluscs', 223 | 'Monoplane', 224 | 'Motorcycling', 225 | 'Musical theatre', 226 | 'Narcissus', 227 | 'Nest box', 228 | 'Newsagent\'s shop', 229 | 'Nile crocodile', 230 | 'Nordic skiing', 231 | 'Nuclear power plant', 232 | 'Orator', 233 | 'Outdoor shoe', 234 | 'Parachuting', 235 | 'Pasta salad', 236 | 'Peafowl', 237 | 'Pelmeni', 238 | 'Perching bird', 239 | 'Performance car', 240 | 'Personal water craft', 241 | 'Pit bull', 242 | 'Plant stem', 243 | 'Pork chop', 244 | 'Portrait photography', 245 | 'Primate', 246 | 'Procyonidae', 247 | 'Prosciutto', 248 | 'Public speaking', 249 | 'Racewalking', 250 | 'Ramen', 251 | 'Rear-view mirror', 252 | 'Residential area', 253 | 'Ribs', 254 | 'Rice ball', 255 | 'Road cycling', 256 | 'Roller skating', 257 | 'Roman temple', 258 | 'Rowing', 259 | 'Rural area', 260 | 'Sailboat racing', 261 | 'Scaled reptile', 262 | 'Scuba diving', 263 | 'Senior citizen', 264 | 'Shallot', 265 | 'Shinto shrine', 266 | 'Shooting range', 267 | 'Siberian husky', 268 | 'Sledding', 269 | 'Soba', 270 | 'Solar energy', 271 | 'Sport climbing', 272 | 'Sport utility vehicle', 273 | 'Steamed rice', 274 | 'Stemware', 275 | 'Sumo', 276 | 'Surfing Equipment', 277 | 'Team sport', 278 | 'Touring car', 279 | 'Toy block', 280 | 'Trampolining', 281 | 'Underwater diving', 282 | 'Vegetarian food', 283 | 'Wallaby', 284 | 'Water polo', 285 | 'Watercolor paint', 286 | 'Whiskers', 287 | 'Wind wave', 288 | 'Woodwind instrument', 289 | 'Yakitori', 290 | 'Zeppelin'] 291 | 292 | 293 | def build_openset_label_embedding(categories=None): 294 | if categories is None: 295 | categories = openimages_rare_unseen 296 | model, _ = clip.load("ViT-B/16") 297 | templates = multiple_templates 298 | 299 | run_on_gpu = torch.cuda.is_available() 300 | 301 | with torch.no_grad(): 302 | openset_label_embedding = [] 303 | for category in categories: 304 | texts = [ 305 | template.format( 306 | processed_name(category, rm_dot=True), article=article(category) 307 | ) 308 | for template in templates 309 | ] 310 | texts = [ 311 | "This is " + text if text.startswith("a") or text.startswith("the") else text 312 | for text in texts 313 | ] 314 | texts = clip.tokenize(texts) # tokenize 315 | if run_on_gpu: 316 | texts = texts.cuda() 317 | model = model.cuda() 318 | 
text_embeddings = model.encode_text(texts) 319 | text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True) 320 | text_embedding = text_embeddings.mean(dim=0) 321 | text_embedding /= text_embedding.norm() 322 | openset_label_embedding.append(text_embedding) 323 | openset_label_embedding = torch.stack(openset_label_embedding, dim=1) 324 | if run_on_gpu: 325 | openset_label_embedding = openset_label_embedding.cuda() 326 | 327 | openset_label_embedding = openset_label_embedding.t() 328 | return openset_label_embedding, categories 329 | 330 | 331 | 332 | 333 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/requirements.txt: -------------------------------------------------------------------------------- 1 | timm==0.4.12 2 | transformers==4.15.0 3 | fairscale==0.4.4 4 | pycocoevalcap 5 | torch 6 | torchvision 7 | Pillow 8 | scipy 9 | git+https://github.com/openai/CLIP.git 10 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = recognize-anything 3 | version = 0.0.1 4 | description = Recognize Anything Model and Tag2Text Model 5 | 6 | [options] 7 | packages = find: 8 | include_package_data = True 9 | 10 | [options.packages.find] 11 | exclude = 12 | datasets 13 | images 14 | outputs 15 | pretrained 16 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | setuptools.setup() 3 | --------------------------------------------------------------------------------
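With the package installed through its standard setuptools files (e.g. an editable install of thirdparty/recognize-anything after installing the pinned requirements above), the open-set path can be exercised end to end: build_openset_label_embedding formats every template for each category, encodes the prompts with CLIP ViT-B/16, and averages the normalized text embeddings into one label embedding per category. A minimal sketch of pointing it at a custom category list and grafting the result onto a loaded RAM model, following the same steps as inference_ram_openset.py (the category names and checkpoint path are placeholders):

import numpy as np
import torch
from torch import nn

from ram.models import ram
from ram.utils import build_openset_label_embedding

# placeholder categories; called with no argument, the function falls back to
# the openimages_rare_unseen list defined in openset_utils.py
categories = ['standing desk', 'robot arm', 'charging station']
openset_label_embedding, openset_categories = build_openset_label_embedding(categories)

model = ram(pretrained='pretrained/ram_swin_large_14m.pth',
            image_size=384,
            vit='swin_l')

model.tag_list = np.array(openset_categories)
model.label_embed = nn.Parameter(openset_label_embedding.float())
model.num_class = len(openset_categories)
# one flat threshold: the per-tag thresholds shipped with RAM only cover the built-in tag list
model.class_threshold = torch.ones(model.num_class) * 0.5

model.eval()
# from here on, inference_ram_openset(image, model) behaves exactly as in the script above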