├── .gitignore ├── LICENSE ├── README.md ├── demos ├── README.md ├── build_tag_map.ipynb ├── download_demo_data.sh └── localization.ipynb ├── evaluation ├── README.md ├── config │ ├── evaluation │ │ ├── matterport_objects.yaml │ │ └── matterport_regions.yaml │ ├── lattice_graph_creation │ │ └── matterport.yaml │ └── tag_map_creation │ │ ├── matterport_ram.yaml │ │ └── matterport_ram_plus.yaml ├── notebooks │ ├── helpers.py │ └── visualize_eval_output_matterport.ipynb └── scripts │ ├── evaluate_localization_matterport.py │ ├── generate_lattice_graph_matterport.py │ ├── generate_tag_maps_matterport.py │ └── visualize_lattice_graph_matterport.py ├── tag_mapping ├── requirements.txt ├── setup.py └── tag_mapping │ ├── __init__.py │ ├── datasets │ └── matterport │ │ ├── __init__.py │ │ ├── category_index_mapping.yaml │ │ ├── category_mapping.py │ │ ├── evaluate_matterport_scan_object_localizations.py │ │ ├── evaluate_matterport_scan_region_localizations.py │ │ ├── file_utils.py │ │ ├── generate_tag_map_from_matterport_scan.py │ │ ├── matterport_object_bounding_box.py │ │ ├── matterport_region_bounding_box.py │ │ ├── mp_region_ram_tags_mapping.py │ │ └── mpcat40_ram_tags_mapping.py │ ├── evaluation │ ├── __init__.py │ ├── lattice_graph_utils.py │ └── lattice_navigation_graph.py │ ├── filtering │ ├── __init__.py │ ├── image_filters.py │ └── inference_filters.py │ ├── localization │ ├── __init__.py │ ├── clustering.py │ ├── pipeline.py │ ├── viewpoint.py │ └── voxel_voting.py │ ├── models │ ├── __init__.py │ ├── image_tagger.py │ ├── ram_plus_tagger.py │ └── ram_tagger.py │ ├── pose_graph.py │ ├── tag_map.py │ └── utils │ ├── __init__.py │ ├── collision_check.py │ ├── get_box_corners.py │ ├── line_mesh.py │ ├── load_yaml_params.py │ └── nearest_points_in_box.py └── thirdparty └── recognize-anything ├── LICENSE ├── MANIFEST.in ├── README.md ├── batch_inference.py ├── datasets ├── openimages_common_214 │ ├── imgs │ │ └── .gitkeep │ ├── openimages_common_214_ram_annots.txt │ ├── openimages_common_214_ram_taglist.txt │ ├── openimages_common_214_tag2text_idannots.txt │ └── openimages_common_214_tag2text_tagidlist.txt └── openimages_rare_200 │ ├── imgs │ └── .gitkeep │ ├── openimages_rare_200_ram_annots.txt │ └── openimages_rare_200_ram_taglist.txt ├── images ├── 1641173_2291260800.jpg ├── demo │ ├── demo1.jpg │ ├── demo2.jpg │ ├── demo3.jpg │ └── demo4.jpg ├── experiment_comparison.png ├── localization_and_recognition.jpg ├── openset_example.jpg ├── ram_grounded_sam.jpg ├── tag2text_framework.png ├── tag2text_grounded_sam.jpg └── tagging_results.jpg ├── inference_ram.py ├── inference_ram_combined.py ├── inference_ram_openset.py ├── inference_tag2text.py ├── ram ├── __init__.py ├── configs │ ├── med_config.json │ ├── q2l_config.json │ └── swin │ │ ├── config_swinB_384.json │ │ └── config_swinL_384.json ├── data │ ├── ram_tag_list.txt │ ├── ram_tag_list_chinese.txt │ ├── ram_tag_list_threshold.txt │ └── tag_list.txt ├── inference.py ├── models │ ├── __init__.py │ ├── bert.py │ ├── ram.py │ ├── ram_plus.py │ ├── swin_transformer.py │ ├── tag2text.py │ ├── utils.py │ └── vit.py ├── transform.py └── utils │ ├── __init__.py │ ├── metrics.py │ └── openset_utils.py ├── recognize_anything_demo.ipynb ├── requirements.txt ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | 
build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Pytorch checkpoints 163 | *.pth 164 | 165 | demo_data/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tag Map: A Text-Based Map for Spatial Reasoning and Navigation with Large Language Models
2 | 
3 | [Mike Zhang](https://mikez.xyz), [Kaixian Qu](https://www.linkedin.com/in/kaixian-qu-66a86215a), [Vaishakh Patil](https://www.linkedin.com/in/vaishakhpatil), [Cesar Cadena](https://n.ethz.ch/~cesarc), [Marco Hutter](https://rsl.ethz.ch/the-lab/people/person-detail.MTIxOTEx.TGlzdC8yNDQxLC0xNDI1MTk1NzM1.html)
4 | 
5 | 
6 | [[Project Page](https://tag-mapping.github.io/)] [[Paper](https://arxiv.org/abs/2409.15451)]
7 | 
8 | 
9 | ![overview](https://tag-mapping.github.io/media/images/method_overview.svg)
10 | 
11 | 
12 | ### Abstract
13 | Large Language Models (LLM) have emerged as a tool for robots to generate task plans using common sense reasoning. For the LLM to generate actionable plans, scene context must be provided, often through a map. Recent works have shifted from explicit maps with fixed semantic classes to implicit open vocabulary maps based on queryable embeddings capable of representing any semantic class. However, embeddings cannot directly report the scene context as they are implicit, requiring further processing for LLM integration. To address this, we propose an explicit text-based map that can represent thousands of semantic classes while easily integrating with LLMs due to their text-based nature by building upon large-scale image recognition models. We study how entities in our map can be localized and show through evaluations that our text-based map localizations perform comparably to those from open vocabulary maps while using two to four orders of magnitude less memory. Real-robot experiments demonstrate the grounding of an LLM with the text-based map to solve user tasks.
14 | 
15 | 
16 | ---
17 | ## Installation
18 | 
19 | Create a virtual environment.
20 | ```
21 | virtualenv -p python3.8 <path_to_venv>
22 | source <path_to_venv>/bin/activate
23 | pip install --upgrade pip
24 | ```
25 | 
26 | Install torch.
27 | ```
28 | pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
29 | ```
30 | 
31 | Install the image tagging model. Currently, this repo only supports the [Recognize Anything](https://github.com/xinyu1205/recognize-anything) set of image tagging models.
32 | ```
33 | pip install -r thirdparty/recognize-anything/requirements.txt
34 | pip install -e thirdparty/recognize-anything/.
35 | ```
36 | 
37 | Download the image tagging model checkpoints.
38 | ```
39 | # Recognize Anything Model (RAM)
40 | wget -P <checkpoint_dir> https://huggingface.co/spaces/xinyu1205/recognize-anything/resolve/main/ram_swin_large_14m.pth
41 | 
42 | # Recognize Anything Plus Model (RAM++)
43 | wget -P <checkpoint_dir> https://huggingface.co/xinyu1205/recognize-anything-plus-model/resolve/main/ram_plus_swin_large_14m.pth
44 | ```
45 | 
46 | 
47 | Install the `tag_mapping` package.
48 | ```
49 | pip install -r tag_mapping/requirements.txt
50 | pip install -e tag_mapping/.
51 | ```
52 | 
53 | 
54 | ---
55 | ## Demos
56 | 
57 | Notebooks demonstrating the Tag Map construction and localization pipelines can be found in the `demos` folder.
58 | 
59 | ---
60 | ## Evaluation
61 | The `evaluation` folder contains instructions and scripts for evaluating the Tag Map localizations.
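
---
## Minimal usage example

The sketch below condenses the localization demo (`demos/localization.ipynb`) into a single script: load a saved Tag Map, query a tag, convert the retrieved entries into viewpoint frustums, and run the coarse localization pipeline. The `demo_data/scene.tagmap` path assumes the demo data downloaded by `demos/download_demo_data.sh`, and `"couch"` is just an example tag — pick any tag listed in `tag_map.unique_objects`. Parameter values are copied from the demo notebook; treat this as a sketch rather than a fixed API reference.

```python
from tag_mapping import TagMap
from tag_mapping.localization import tagmap_entries_to_viewpoints, localization_pipeline

# Load a previously built Tag Map and its stored camera intrinsics.
tag_map = TagMap.load("demo_data/scene.tagmap")
intrinsics = tag_map.metadata["intrinsics"]

print(sorted(tag_map.unique_objects))  # tags recognized somewhere in the scene

# Retrieve the entries (viewpoints) in which the tag was recognized.
entries = tag_map.query("couch")

# Convert entries to 3D viewpoint frustums. The near plane sits at a constant distance,
# the far plane is set from the stored 80th-percentile depth of each viewpoint.
viewpoints = tagmap_entries_to_viewpoints(
    entries=entries,
    intrinsics=intrinsics,
    near_dist_fn=lambda entry: 0.2,
    far_dist_fn=lambda entry: entry.extras["depth_percentiles"]["0.8"],
)

# Voxel voting over the frustums, followed by clustering into bounding-box proposals.
voxel_size = 0.2
localization_params = {
    "voxel_voting": {
        "viewpoint_weight": None,
        "voxel_size": voxel_size,
        "scoring_method": "normalized_votes",
    },
    "clustering": {
        "algorithm": "dbscan",
        "dbscan_kwargs": {"eps": 2 * voxel_size, "min_points": 5, "print_progress": False},
        "clustering_levels": [0.0, 0.25, 0.5, 0.75],
        "bounding_box_type": "axis_aligned",
    },
}
outputs = localization_pipeline(viewpoints, localization_params, verbose=False)

# Each proposal is a (confidence_level, bounding_box) pair.
for level, box in outputs["level_bbxes"]:
    print(level, box)
```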
62 | 63 | 64 | --- 65 | ## Citation 66 | If you found our paper or code useful, please cite: 67 | ``` 68 | @inproceedings{zhang2024tagmap, 69 | author = {Zhang, Mike and Qu, Kaixian and Patil, Vaishakh and Cadena, Cesar and Hutter, Marco}, 70 | title = {Tag Map: A Text-Based Map for Spatial Reasoning and Navigation with Large Language Models}, 71 | journal = {Conference on Robot Learning (CoRL)}, 72 | year = {2024}, 73 | } 74 | ``` -------------------------------------------------------------------------------- /demos/README.md: -------------------------------------------------------------------------------- 1 | # Demos 2 | 3 | Please first run the scipt to download the demo data: 4 | ``` 5 | ./download_demo_data.sh 6 | ``` 7 | 8 | There are two demo notebooks provided: 9 | - `build_tag_map.ipynb`: Goes through the Tag Map construction process for the provided demo scene. 10 | - `localization.ipynb`: Runs the coarse localization pipeline over a Tag Map of the demo scene. -------------------------------------------------------------------------------- /demos/download_demo_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | wget https://huggingface.co/datasets/frozendonuts/tag-mapping/resolve/main/demo_data.zip 4 | echo "Unzipping demo data" 5 | unzip -q demo_data.zip 6 | rm demo_data.zip 7 | echo "Done downloading and unzipping demo data" 8 | -------------------------------------------------------------------------------- /demos/localization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "0fa25531", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "\n", 12 | "import numpy as np\n", 13 | "import open3d as o3d\n", 14 | "\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "from matplotlib import cm\n", 17 | "\n", 18 | "import ipywidgets as widgets\n", 19 | "from IPython.display import display" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "ea59c01a", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from tag_mapping.datasets.matterport import (\n", 30 | " read_matterport_image_file,\n", 31 | " read_matterport_depth_file,\n", 32 | " MatterportFilenameBridge\n", 33 | ")\n", 34 | "\n", 35 | "from tag_mapping import TagMap\n", 36 | "\n", 37 | "from tag_mapping.localization import tagmap_entries_to_viewpoints, localization_pipeline\n", 38 | "\n", 39 | "from tag_mapping.utils import box_to_linemesh" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "6f1b59f7", 45 | "metadata": {}, 46 | "source": [ 47 | "## Load scene data\n", 48 | "Please first download the demo data by running `download_demo_data.sh`." 
49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "15091f3d", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "scene_dir = 'demo_data'" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "2e42bcc5", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "tag_map = TagMap.load(f'{scene_dir}/scene.tagmap')\n", 69 | "intrinsics = tag_map.metadata[\"intrinsics\"]" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "54726254", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "images_dir = os.path.join(scene_dir, 'color')\n", 80 | "depths_dir = os.path.join(scene_dir, 'depth')\n", 81 | "poses_dir = os.path.join(scene_dir, 'poses')\n", 82 | "mesh_path = os.path.join(scene_dir, 'mesh.ply')" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "c3ba44ef", 88 | "metadata": {}, 89 | "source": [ 90 | "Load and visualize the mesh" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "48e91139", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "scene_mesh = o3d.io.read_triangle_mesh(mesh_path)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "3c5ce219", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "o3d.visualization.draw_geometries([scene_mesh])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "3f0609f7", 116 | "metadata": {}, 117 | "source": [ 118 | "## Localize a selected tag\n", 119 | "\n", 120 | "Select a tag recognized in the scene to localize" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "f27dc6c5", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "options = sorted(list(tag_map.unique_objects))\n", 131 | "query_dropdown = widgets.Dropdown(options=options, description='Select an tag:')\n", 132 | "display(query_dropdown)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "0822b061-1fa0-4710-8f72-0aa895711867", 138 | "metadata": {}, 139 | "source": [ 140 | "Retrieve corresponding viewpoints for the selected tag.\n", 141 | "\n", 142 | "__Rerun this block after changing the selection__" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "858838ca", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "query_entries = tag_map.query(query_dropdown.value)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "8f7d4677-406e-4010-83db-c52f85489fbb", 158 | "metadata": {}, 159 | "source": [ 160 | "Show the images for a few of the viewpoints corresponding to the tag" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "c7d7d528", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "max_show = 6\n", 171 | "num_show = min(len(query_entries), max_show)\n", 172 | "\n", 173 | "fig, axes = plt.subplots(1, num_show, figsize=(3*num_show, 6))\n", 174 | "\n", 175 | "for i in range(num_show):\n", 176 | " entry = query_entries[i]\n", 177 | " image_filename = entry.extras['image_filename']\n", 178 | " conf = entry.extras['confidence']\n", 179 | " \n", 180 | " image = read_matterport_image_file(\n", 181 | " os.path.join(images_dir, image_filename))\n", 182 | " \n", 183 | " try:\n", 184 | " ax = axes[i]\n", 185 | " except TypeError:\n", 186 | " ax = axes\n", 187 | " \n", 188 | " ax.imshow(image)\n", 189 | " 
ax.set_xticks([])\n", 190 | " ax.set_yticks([])\n", 191 | " ax.set_title(f'confidence: {conf:.2f}')\n", 192 | " ax.set_aspect(1)\n", 193 | "\n", 194 | "plt.show()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "id": "b5d9c2da", 200 | "metadata": {}, 201 | "source": [ 202 | "## Compute coarse-grained localizations in 3D for the selected tag\n", 203 | "For each viewpoint corresponding to the selected tag, we first get their frustums in 3D." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "id": "c79bc958", 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "viewpoints = tagmap_entries_to_viewpoints(\n", 214 | " entries=query_entries,\n", 215 | " intrinsics=intrinsics,\n", 216 | "\n", 217 | " # set the near plane of the viewpoint frustum to a constant distance away\n", 218 | " near_dist_fn=lambda x: 0.2,\n", 219 | " \n", 220 | " # the far plane of the viewpoint frustum is set as the 80th percentile depth value\n", 221 | " # of each viewpoint\n", 222 | " far_dist_fn=lambda entry: entry.extras['depth_percentiles']['0.8'],\n", 223 | ")" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "13dffe4c-cd21-4efb-bd51-829709ae22cd", 229 | "metadata": {}, 230 | "source": [ 231 | "Visualize the retrieved viewpoint frustums" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "id": "5423e3f7", 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "o3d.visualization.draw_geometries([scene_mesh] + [vp.o3d_lineset(color=np.random.rand(3)) for vp in viewpoints])" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "id": "fad8af37-b98c-4edc-b1ff-1fd53f74f8ba", 247 | "metadata": {}, 248 | "source": [ 249 | "### Localization pipeline\n", 250 | "The localization pipeline takes as input the frustums of the retrieved viewpoints and performs a voting procedure over voxels in the scene to generate localized regions for the selected tag.\n", 251 | "\n", 252 | "The final output is a set of proposed localizations for the tag, represented as bounding boxes, along with the confidence level (min number of votes) for each bounding box." 
253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "id": "e139603a", 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "voxel_size = 0.2\n", 263 | "\n", 264 | "localization_params = {\n", 265 | " 'voxel_voting': {\n", 266 | " 'viewpoint_weight': None, # [None, 'confidence']\n", 267 | " 'voxel_size': voxel_size,\n", 268 | " 'scoring_method': 'normalized_votes', # ['normalized_votes', 'votes']\n", 269 | " },\n", 270 | " \n", 271 | " 'clustering': {\n", 272 | " 'algorithm': 'dbscan', # ['dbscan', 'hdbscan']\n", 273 | " 'dbscan_kwargs': {\n", 274 | " 'eps': 2 * voxel_size,\n", 275 | " 'min_points': 5,\n", 276 | " 'print_progress': False,\n", 277 | " },\n", 278 | " \n", 279 | " 'clustering_levels': [0.0, 0.25, 0.5, 0.75], # only used if 'scoring_method' == 'normalized_votes'\n", 280 | " 'bounding_box_type': 'axis_aligned', # ['axis_aligned', 'oriented']\n", 281 | " },\n", 282 | "}" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "acd5d34f", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "loc_outputs = localization_pipeline(viewpoints, localization_params, verbose=False)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "7f4645df-28f4-40a7-8ed5-8f3047ae9bd8", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "voxel_center_points = loc_outputs[\"voxel_center_points\"]\n", 303 | "voxel_scores = loc_outputs[\"voxel_scores\"]\n", 304 | "level_bbxes = loc_outputs[\"level_bbxes\"]" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "id": "8ee7d6b3", 310 | "metadata": {}, 311 | "source": [ 312 | "## Visualize localizations" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "id": "18f71776-d212-4ed6-ae6a-bdfc879a54e0", 318 | "metadata": {}, 319 | "source": [ 320 | "Visualize the voxel voting results. Voxel points are colored by their corresponding number of votes." 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "id": "a5c4629f", 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "voxel_center_points_color = cm.viridis(voxel_scores / voxel_scores.max())[:, :3]\n", 331 | "\n", 332 | "voxel_pcd = o3d.geometry.PointCloud()\n", 333 | "voxel_pcd.points = o3d.utility.Vector3dVector(voxel_center_points)\n", 334 | "voxel_pcd.colors = o3d.utility.Vector3dVector(voxel_center_points_color)\n", 335 | "\n", 336 | "o3d.visualization.draw_geometries([scene_mesh, voxel_pcd])" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "id": "41a003d2", 342 | "metadata": {}, 343 | "source": [ 344 | "Visualize proposed localization bounding boxes. Bounding boxes are colored by their confidence levels corresponding to the minimum number of votes for voxels within the bounding box." 
345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "id": "0a833f66", 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "confidences = [l for l, _ in level_bbxes]\n", 355 | "boxes = [b for _, b in level_bbxes]\n", 356 | "max_conf = np.max(confidences)\n", 357 | "\n", 358 | "boxes_linemeshes = []\n", 359 | "for conf, box in zip(confidences, boxes):\n", 360 | " color = cm.viridis(conf / max_conf)[:3]\n", 361 | " \n", 362 | " boxes_linemeshes += box_to_linemesh(\n", 363 | " box, \n", 364 | " color=color, \n", 365 | " radius=0.02\n", 366 | " ).cylinder_segments\n", 367 | " \n", 368 | "o3d.visualization.draw_geometries([scene_mesh] + boxes_linemeshes)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "5c95556e", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": "Python 3 (ipykernel)", 383 | "language": "python", 384 | "name": "python3" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 3 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython3", 396 | "version": "3.8.10" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 5 401 | } 402 | -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | These instructions outline the pipeline for evaluating the Tag Map localizations against the coarse-localization metrics P2E and E2P as described in the paper. 3 | 4 | Currently the evaluation is only supported for the Matterport3D (MP3D) dataset which can be downloaded following the instructions [here](https://niessner.github.io/Matterport/). 5 | 6 | 7 | ## Setup 8 | The evaluation assumes that the MP3D data folder has the following structure: 9 | ``` 10 | 11 | ├── 12 | │ ├── undistorted_color_images 13 | │ ├── undistorted_depth_images 14 | │ ├── matterport_camera_poses 15 | │ ├── matterport_camera_intrinsics 16 | │ ├── house_segmentations 17 | | └── ... 18 | ├── 19 | │ ├── undistorted_color_images 20 | │ ├── undistorted_depth_images 21 | │ ├── matterport_camera_poses 22 | │ ├── matterport_camera_intrinsics 23 | │ ├── house_segmentations 24 | | └── ... 25 | └── ... 26 | ``` 27 | 28 | 29 | ## 1. Generate Tag Maps for all scenes 30 | The Tag Maps for all MP3D scenes can be generated using: 31 | 32 | ``` 33 | python scripts/generate_tag_maps_matterport.py \ 34 | --params config/tag_map_creation//matterport_ram.yaml \ 35 | --output_dir \ 36 | --matterport_dir 37 | ``` 38 | 39 | Alternatively, pre-generated Tag Maps can be downloaded [here](https://huggingface.co/datasets/frozendonuts/tag-mapping/resolve/main/mp3d_tag_maps.zip). Please read and agree to the [MP3D EULA](https://kaldir.vc.in.tum.de/matterport/MP_TOS.pdf) before downloading. 40 | 41 | 42 | 43 | ## 2. Generate scene lattice graphs 44 | Computing the coarse-localization metrics P2E and E2P requires computing the shortest paths between points in the scene. The shortest path computation is approximated using a lattice graph which spans the scene's free space while avoiding collisions with the scene geometry. Shortests paths are then computed and stored for each pair of nodes in the lattice graph. 
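
Once a lattice graph has been generated (see the command below), it can be loaded and queried directly. The following is a small sketch based on the `LatticeNavigationGraph` interface used by `notebooks/helpers.py` and the visualization script; the file path and node indices are placeholders.

```python
import numpy as np

from tag_mapping.evaluation import LatticeNavigationGraph

# Placeholder path: the generation script saves graphs as <scan_name>_lattice_graph.pkl.
lattice_graph = LatticeNavigationGraph.load("path/to/scan_lattice_graph.pkl")

# xyz positions of the lattice nodes spanning the scene's free space.
nodes_xyz = np.asarray(lattice_graph.nodes_xyz)
print(f"lattice graph has {len(nodes_xyz)} nodes")

# Precomputed shortest path between two lattice nodes (example indices).
# shortest_path_length returns None if the two nodes are not connected.
a_ind, b_ind = 0, 100
length = lattice_graph.shortest_path_length(a_ind, b_ind)
if length is not None:
    path_node_inds = lattice_graph.shortest_path(a_ind, b_ind)
    print(f"shortest path length {length:.2f} through {len(path_node_inds)} nodes")
```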
45 | 
46 | The lattice graphs and precomputed shortest paths for all MP3D scenes can be generated using:
47 | ```
48 | python scripts/generate_lattice_graph_matterport.py \
49 |     --params_path config/lattice_graph_creation/matterport.yaml \
50 |     --output_dir <output_dir> \
51 |     --matterport_dir <matterport_dir>
52 | ```
53 | 
54 | Alternatively, pre-generated lattice graphs can be downloaded [here](https://huggingface.co/datasets/frozendonuts/tag-mapping/resolve/main/mp3d_lattice_graphs.zip) (61 GB). Please read and agree to the [MP3D EULA](https://kaldir.vc.in.tum.de/matterport/MP_TOS.pdf) before downloading.
55 | 
56 | Lattice graphs can be visualized using the included script:
57 | ```
58 | python scripts/visualize_lattice_graph_matterport.py \
59 |     --lattice_graph_path <path_to_lattice_graph> \
60 |     --matterport_dir <matterport_dir>
61 | ```
62 | 
63 | 
64 | ## 3. Run the evaluation
65 | The evaluation is run with the following command:
66 | ```
67 | python scripts/evaluate_localization_matterport.py \
68 |     --params_path <params_file> \
69 |     --tag_maps_dir <tag_maps_dir> \
70 |     --lattice_graphs_dir <lattice_graphs_dir> \
71 |     --output_dir <output_dir> \
72 |     --matterport_dir <matterport_dir>
73 | ```
74 | 
75 | Evaluations are run separately for the labeled objects and the labeled regions/locations, depending on the params file. For the object and region evaluations, use the param files `config/evaluation/matterport_objects.yaml` and `config/evaluation/matterport_regions.yaml`, respectively.
76 | 
77 | For each scene, the evaluation outputs are saved as a pickled Python dictionary.
78 | 
79 | 
80 | ## 4. Visualizing evaluation results
81 | The evaluation saves an output file for every scene in the dataset. A notebook for visualizing the evaluation outputs for a scene can be found under the `notebooks` folder.
82 | 
--------------------------------------------------------------------------------
/evaluation/config/evaluation/matterport_objects.yaml:
--------------------------------------------------------------------------------
1 | label_params:
2 |   type: "object"  # ["object", "region"]
3 | 
4 |   blacklisted_labels: [
5 |     "misc", "objects", "void", "unlabeled",
6 |     "wall", "floor", "ceiling",
7 |   ]
8 | 
9 | viewpoint_kwargs:
10 |   far_dist_fn: !python/lambda "lambda entry: entry.extras['depth_percentiles']['0.8']"
11 |   near_dist_fn: null
12 | 
13 | localization_kwargs:
14 |   params:
15 |     voxel_voting:
16 |       viewpoint_weight: null  # [null, 'confidence']
17 |       voxel_size: 0.2
18 |       scoring_method: "normalized_votes"  # ['normalized_votes', 'votes']
19 | 
20 |     clustering:
21 |       algorithm: "dbscan"  # ['dbscan', 'hdbscan']
22 |       dbscan_kwargs:
23 |         eps: 0.4  # 2 * voxel_size
24 |         min_points: 5
25 |         print_progress: false
26 |       clustering_levels: [0.0, 0.25, 0.5, 0.75,]  # only used if 'scoring_method' == 'normalized_votes'
27 |       bounding_box_type: "axis_aligned"  # ['axis_aligned', 'oriented']
--------------------------------------------------------------------------------
/evaluation/config/evaluation/matterport_regions.yaml:
--------------------------------------------------------------------------------
1 | label_params:
2 |   type: "region"  # ["object", "region"]
3 | 
4 |   blacklisted_labels: [
5 |     "other room", "junk", "no label",
6 | 
7 |     # no appropriate tag
8 |     "dining booth",
9 |     "entryway/foyer/lobby",
10 |     "outdoor",
11 |   ]
12 | 
13 | viewpoint_kwargs:
14 |   far_dist_fn: !python/lambda "lambda entry: entry.extras['depth_percentiles']['0.8']"
15 |   near_dist_fn: null
16 | 
17 | localization_kwargs:
18 |   params:
19 |     voxel_voting:
20 |       viewpoint_weight: null  # [null, 'confidence']
21 |       voxel_size: 0.2
22 |       scoring_method: "normalized_votes" #
['normalized_votes', 'votes'] 23 | 24 | clustering: 25 | algorithm: "dbscan" # ['dbscan', 'hdbscan'] 26 | dbscan_kwargs: 27 | eps: 0.4 # 2 * voxel_size 28 | min_points: 5 29 | print_progress: false 30 | clustering_levels: [0.0, 0.25, 0.5, 0.75,] # only used if 'scoring_method' == 'normalized_votes' 31 | bounding_box_type: "axis_aligned" # ['axis_aligned', 'oriented'] -------------------------------------------------------------------------------- /evaluation/config/lattice_graph_creation/matterport.yaml: -------------------------------------------------------------------------------- 1 | lattice_graph_creation_params: 2 | lattice_grid_kwargs: 3 | grid_res: 0.5 4 | outer_pad: 0.1 5 | 6 | lattice_filter_kwargs: 7 | distance_threshold: 2.0 8 | within_mesh_threshold: 0.0 9 | kdtree_query_k: 10 10 | kdtree_query_num_workers: 8 11 | -------------------------------------------------------------------------------- /evaluation/config/tag_map_creation/matterport_ram.yaml: -------------------------------------------------------------------------------- 1 | model_params: 2 | model: ram 3 | model_config: 4 | ram_pretrained_path: ram_swin_large_14m.pth # set this to the path of the downloaded model checkpoint 5 | ram_image_size: 384 6 | vit: swin_l 7 | device: cuda 8 | 9 | tag_map_generation_params: 10 | matterport_viewpoint_near_dist: 0.2 11 | 12 | filtered_tagging_params: 13 | crop_border_proportions: [0.05, 0.1] 14 | 15 | depth_filtering_params: 16 | mean_threshold: 0.6 17 | quantile_thresholds: [ 18 | [0.5, 0.6], # [percentile, threshold] 19 | ] 20 | 21 | stored_depth_percentiles: [0.8] -------------------------------------------------------------------------------- /evaluation/config/tag_map_creation/matterport_ram_plus.yaml: -------------------------------------------------------------------------------- 1 | model_params: 2 | model: ram_plus 3 | model_config: 4 | ram_pretrained_path: ram_plus_swin_large_14m.pth # set this to the path of the downloaded model checkpoint 5 | ram_image_size: 384 6 | vit: swin_l 7 | device: cuda 8 | 9 | tag_map_generation_params: 10 | matterport_viewpoint_near_dist: 0.2 11 | 12 | filtered_tagging_params: 13 | crop_border_proportions: [0.05, 0.1] 14 | 15 | depth_filtering_params: 16 | mean_threshold: 0.6 17 | quantile_thresholds: [ 18 | [0.5, 0.6], # [percentile, threshold] 19 | ] 20 | 21 | stored_depth_percentiles: [0.8] -------------------------------------------------------------------------------- /evaluation/notebooks/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from typing import List 4 | 5 | from tag_mapping.evaluation import LatticeNavigationGraph 6 | from tag_mapping.utils import LineMesh 7 | 8 | 9 | def generate_lattice_graph_shortest_path_linemeshes( 10 | lattice_graph: LatticeNavigationGraph, node_inds_a: List, node_inds_b: List 11 | ): 12 | shortest_path_linemeshes = [] 13 | for a_ind in node_inds_a: 14 | spl = np.inf 15 | matched_l_ind = None 16 | for b_ind in node_inds_b: 17 | 18 | new_spl = lattice_graph.shortest_path_length(a_ind, b_ind) 19 | if new_spl == None: 20 | continue 21 | 22 | if new_spl < spl: 23 | spl = new_spl 24 | matched_l_ind = b_ind 25 | 26 | if matched_l_ind != None: 27 | sp_inds = lattice_graph.shortest_path(a_ind, matched_l_ind) 28 | 29 | sp_lines = np.zeros((len(sp_inds) - 1, 2)).astype(np.int32) 30 | sp_lines[:, 0] = np.arange(len(sp_inds) - 1) 31 | sp_lines[:, 1] = 1 + np.arange(len(sp_inds) - 1) 32 | 33 | sp_linemesh = LineMesh( 34 | 
points=lattice_graph.nodes_xyz[sp_inds], 35 | lines=sp_lines, 36 | colors=(0, 1, 1), 37 | radius=0.01, 38 | ) 39 | 40 | shortest_path_linemeshes += sp_linemesh.cylinder_segments 41 | 42 | return shortest_path_linemeshes 43 | -------------------------------------------------------------------------------- /evaluation/scripts/evaluate_localization_matterport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | from datetime import datetime 5 | 6 | from tag_mapping.datasets.matterport.evaluate_matterport_scan_object_localizations import ( 7 | evaluate_matterport_scan_object_localizations, 8 | ) 9 | from tag_mapping.datasets.matterport.evaluate_matterport_scan_region_localizations import ( 10 | evaluate_matterport_scan_region_localizations, 11 | ) 12 | from tag_mapping.utils import load_yaml_params 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser( 17 | description="Generate tag maps from Matterport scans" 18 | ) 19 | parser.add_argument("--params_path", type=str, help="Path to params file") 20 | parser.add_argument("--tag_maps_dir", type=str, help="Path to tag map file") 21 | parser.add_argument( 22 | "--lattice_graphs_dir", type=str, help="Path to lattice graph file" 23 | ) 24 | parser.add_argument("--output_dir", type=str, help="Path to output directory") 25 | parser.add_argument( 26 | "--output_name", type=str, help="Name of evaluation output directory" 27 | ) 28 | parser.add_argument( 29 | "--matterport_dir", type=str, help="Path to directory of matterport scans" 30 | ) 31 | parser.add_argument( 32 | "--scans", 33 | nargs="+", 34 | help="Scans to generate tag maps for. If not specified, all scans will in matterport_dir will be processed.", 35 | ) 36 | args = parser.parse_args() 37 | 38 | # Setup logger 39 | logger = logging.getLogger(__name__) 40 | logger.addHandler(logging.StreamHandler()) 41 | logger.setLevel(logging.INFO) 42 | 43 | # Read params 44 | params = load_yaml_params(args.params_path) 45 | 46 | label_type = params["label_params"]["type"] 47 | if label_type == "object": 48 | evaluate_matterport_scan_localization = ( 49 | evaluate_matterport_scan_object_localizations 50 | ) 51 | elif label_type == "region": 52 | evaluate_matterport_scan_localization = ( 53 | evaluate_matterport_scan_region_localizations 54 | ) 55 | else: 56 | raise ValueError(f"Invalid label type {params['label_params']['type']}") 57 | 58 | # Create output save directory 59 | output_name = ( 60 | f"matterport_{label_type}_evaluation" if args.output_name == None else args.output_name 61 | ) 62 | output_save_dir = os.path.join( 63 | args.output_dir, 64 | f"{output_name}-{datetime.now().strftime('%Y-%m-%d_%H-%M')}", 65 | ) 66 | os.makedirs(output_save_dir, exist_ok=True) 67 | logger.info( 68 | f"created matterport evaluation outputs save directory {output_save_dir}" 69 | ) 70 | 71 | # Copy param file to output save dir 72 | os.system(f"cp {args.params_path} {output_save_dir}/_evaluation_params.yaml") 73 | 74 | scan_names = ( 75 | args.scans if args.scans != None else sorted(os.listdir(args.matterport_dir)) 76 | ) 77 | for scan_name in scan_names: 78 | logger.info(f"\n\nrunning evaluation on scan {scan_name}") 79 | 80 | scan_dir = os.path.join(args.matterport_dir, f"{scan_name}") 81 | if not os.path.isdir(scan_dir): 82 | logger.warning(f"skipping due to non-existing scan directory {scan_dir}") 83 | continue 84 | 85 | tag_map_path = os.path.join(args.tag_maps_dir, f"{scan_name}.tagmap") 86 | if not 
os.path.isfile(tag_map_path): 87 | logger.warning(f"skipping due to non-existing tag map {tag_map_path}") 88 | continue 89 | 90 | lattice_graph_path = os.path.join( 91 | args.lattice_graphs_dir, f"{scan_name}_lattice_graph.pkl" 92 | ) 93 | if not os.path.isfile(lattice_graph_path): 94 | logger.warning( 95 | f"skipping due to non-existing lattice graph {lattice_graph_path}" 96 | ) 97 | continue 98 | 99 | try: 100 | evaluate_matterport_scan_localization( 101 | params=params, 102 | scan_dir=os.path.abspath(scan_dir), 103 | tag_map_path=os.path.abspath(tag_map_path), 104 | lattice_graph_path=os.path.abspath(lattice_graph_path), 105 | output_dir=output_save_dir, 106 | logger=logger, 107 | ) 108 | except Exception as e: 109 | logger.error(f"failed to generate tag map for scan {scan_name}") 110 | logger.error(e) 111 | continue 112 | -------------------------------------------------------------------------------- /evaluation/scripts/generate_lattice_graph_matterport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | from datetime import datetime 5 | 6 | import open3d as o3d 7 | 8 | from tag_mapping.evaluation import create_lattice_navigation_graph 9 | from tag_mapping.utils import load_yaml_params 10 | 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser( 14 | description="Generate lattice navigation graphs from Matterport scans" 15 | ) 16 | parser.add_argument("--params_path", type=str, help="Path to params file") 17 | parser.add_argument("--output_dir", type=str, help="Path to output directory") 18 | parser.add_argument( 19 | "--output_name", type=str, help="Name of evaluation output directory" 20 | ) 21 | parser.add_argument( 22 | "--matterport_dir", type=str, help="Path to directory of matterport scans" 23 | ) 24 | parser.add_argument( 25 | "--scans", 26 | nargs="+", 27 | help="Scans to generate lattice navigation graphs for. 
If not specified, all scans will in matterport_dir will be processed.", 28 | ) 29 | args = parser.parse_args() 30 | 31 | # Setup logger 32 | logger = logging.getLogger(__name__) 33 | logger.addHandler(logging.StreamHandler()) 34 | logger.setLevel(logging.INFO) 35 | 36 | # Read params 37 | params = load_yaml_params(args.params_path) 38 | 39 | # Create output save directory 40 | output_name = ( 41 | "matterport_lattice_graphs" if args.output_name == None else args.output_name 42 | ) 43 | output_save_dir = os.path.join( 44 | args.output_dir, 45 | f"{output_name}-{datetime.now().strftime('%Y-%m-%d_%H-%M')}", 46 | ) 47 | os.makedirs(output_save_dir, exist_ok=True) 48 | logger.info( 49 | f"created matterport lattice graphs output save directory {output_save_dir}" 50 | ) 51 | 52 | # Copy param file to output save dir 53 | os.system(f"cp {args.params_path} {output_save_dir}/_gen_params.yaml") 54 | 55 | # Generate lattice navigation graph for each scan 56 | scan_names = ( 57 | args.scans if args.scans != None else sorted(os.listdir(args.matterport_dir)) 58 | ) 59 | for scan_name in scan_names: 60 | logger.info(f"\n\ncreating lattice graph for scan {scan_name}") 61 | scan_dir = os.path.join(args.matterport_dir, f"{scan_name}") 62 | ply_file_path = os.path.join( 63 | scan_dir, "house_segmentations", f"{scan_name}.ply" 64 | ) 65 | mesh = o3d.io.read_triangle_mesh(ply_file_path) 66 | 67 | try: 68 | lattice_graph = create_lattice_navigation_graph( 69 | mesh, 70 | params=params["lattice_graph_creation_params"], 71 | print_progress=True, 72 | ) 73 | except Exception as e: 74 | logger.error(f"failed to generate tag map for scan {scan_name}") 75 | logger.error(e) 76 | continue 77 | 78 | save_path = os.path.join(output_save_dir, f"{scan_name}_lattice_graph.pkl") 79 | lattice_graph.save(save_path) 80 | logger.info(f"saved lattice graph to {save_path}") 81 | -------------------------------------------------------------------------------- /evaluation/scripts/generate_tag_maps_matterport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | from datetime import datetime 5 | 6 | from tag_mapping.models import RAMTagger, RAMPlusTagger 7 | 8 | from tag_mapping.datasets.matterport.generate_tag_map_from_matterport_scan import ( 9 | generate_tag_map_from_matterport_scan, 10 | ) 11 | from tag_mapping.utils import load_yaml_params 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser( 16 | description="Generate tag maps from Matterport scans" 17 | ) 18 | parser.add_argument("--params_path", type=str, help="Path to params file") 19 | parser.add_argument("--output_dir", type=str, help="Path to output directory") 20 | parser.add_argument( 21 | "--output_name", type=str, help="Name of evaluation output directory" 22 | ) 23 | parser.add_argument( 24 | "--matterport_dir", type=str, help="Path to directory of matterport scans" 25 | ) 26 | parser.add_argument( 27 | "--scans", 28 | nargs="+", 29 | help="Scans to generate tag maps for. 
If not specified, all scans will in matterport_dir will be processed.", 30 | ) 31 | args = parser.parse_args() 32 | 33 | # Setup logger 34 | logger = logging.getLogger(__name__) 35 | logger.addHandler(logging.StreamHandler()) 36 | logger.setLevel(logging.INFO) 37 | 38 | # Read params 39 | params = load_yaml_params(args.params_path) 40 | model_params = params["model_params"] 41 | tag_map_generation_params = params["tag_map_generation_params"] 42 | 43 | # Create output save directory 44 | output_name = ( 45 | "matterport_tag_maps" if args.output_name == None else args.output_name 46 | ) 47 | output_save_dir = os.path.join( 48 | args.output_dir, 49 | f"{output_name}-{datetime.now().strftime('%Y-%m-%d_%H-%M')}", 50 | ) 51 | os.makedirs(output_save_dir, exist_ok=True) 52 | logger.info(f"created matterport tag maps output save directory {output_save_dir}") 53 | 54 | # Load tagging model 55 | if model_params["model"] == "ram": 56 | tagging_model = RAMTagger( 57 | config=model_params["model_config"], 58 | ) 59 | elif model_params["model"] == "ram_plus": 60 | tagging_model = RAMPlusTagger( 61 | config=model_params["model_config"], 62 | ) 63 | else: 64 | raise ValueError(f"Unsupported model type {model_params['model']}") 65 | 66 | # Copy param file to output save dir 67 | os.system(f"cp {args.params_path} {output_save_dir}/_gen_params.yaml") 68 | 69 | # Generate tag maps for each scan 70 | scan_names = ( 71 | args.scans if args.scans != None else sorted(os.listdir(args.matterport_dir)) 72 | ) 73 | for scan_name in scan_names: 74 | scan_dir = os.path.join(args.matterport_dir, f"{scan_name}") 75 | 76 | if not os.path.isdir(scan_dir): 77 | logger.warning(f"skipping non-existing scan directory {scan_dir}") 78 | continue 79 | 80 | try: 81 | generate_tag_map_from_matterport_scan( 82 | params=tag_map_generation_params, 83 | tagging_model=tagging_model, 84 | scan_dir=scan_dir, 85 | output_dir=output_save_dir, 86 | logger=logger, 87 | ) 88 | except Exception as e: 89 | logger.error(f"failed to generate tag map for scan {scan_name}") 90 | logger.error(e) 91 | continue 92 | -------------------------------------------------------------------------------- /evaluation/scripts/visualize_lattice_graph_matterport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import open3d as o3d 5 | 6 | from tag_mapping.evaluation import LatticeNavigationGraph 7 | 8 | 9 | def load_and_visualize_lattice_graph( 10 | lattice_graph_path, 11 | matterport_scans_dir, 12 | ): 13 | scan_name = os.path.basename(lattice_graph_path).split('_')[0] 14 | ply_path = os.path.join( 15 | matterport_scans_dir, f"{scan_name}/house_segmentations/{scan_name}.ply" 16 | ) 17 | mesh = o3d.io.read_triangle_mesh(ply_path) 18 | lattice_graph = LatticeNavigationGraph.load(lattice_graph_path) 19 | o3d.visualization.draw_geometries( 20 | [ 21 | mesh, 22 | lattice_graph.o3d_nodes_pointcloud, 23 | lattice_graph.o3d_edges_lineset, 24 | ] 25 | ) 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument( 31 | "--lattice_graph_path", 32 | type=str, 33 | required=True, 34 | help="Path to the lattice graph file", 35 | ) 36 | parser.add_argument( 37 | "--matterport_dir", type=str, help="Path to directory of matterport scans" 38 | ) 39 | args = parser.parse_args() 40 | 41 | load_and_visualize_lattice_graph( 42 | args.lattice_graph_path, 43 | args.matterport_dir, 44 | ) 45 | 
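# Example invocation, run from the `evaluation` directory (paths are placeholders):
#   python scripts/visualize_lattice_graph_matterport.py \
#       --lattice_graph_path <lattice_graphs_dir>/<scan_name>_lattice_graph.pkl \
#       --matterport_dir <matterport_dir>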
-------------------------------------------------------------------------------- /tag_mapping/requirements.txt: -------------------------------------------------------------------------------- 1 | cvxpy==1.5.2 2 | hdbscan 3 | jupyterlab 4 | networkx 5 | open3d==0.17.0 6 | opencv-python 7 | pandas 8 | plyfile 9 | recognize-anything 10 | sentence-transformers -------------------------------------------------------------------------------- /tag_mapping/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='tag_mapping', 5 | version='0.1.0', 6 | packages=find_packages(), 7 | install_requires=[ 8 | 'recognize-anything', 9 | ], 10 | description='Package for building spatial maps from image tags', 11 | ) -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/__init__.py: -------------------------------------------------------------------------------- 1 | from .tag_map import TagMap, TagMapEntry 2 | 3 | from .pose_graph import PoseGraph 4 | 5 | import os 6 | 7 | TAG_MAPPING_ROOT_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 8 | """Absolute path to the tag mapping root dir.""" 9 | 10 | TAG_MAPPING_CONFIG_DIR = os.path.join(TAG_MAPPING_ROOT_DIR, 'config') -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/__init__.py: -------------------------------------------------------------------------------- 1 | from .file_utils import * 2 | 3 | from .mpcat40_ram_tags_mapping import MPCAT40_RAM_TAGS_MAPPING 4 | from .mp_region_ram_tags_mapping import MP_REGION_RAM_TAGS_MAPPING 5 | 6 | from .matterport_object_bounding_box import MatterportObjectBoundingBox 7 | from .matterport_region_bounding_box import MatterportRegionBoundingBox 8 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/category_mapping.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import yaml 6 | 7 | 8 | CATEGORY_MAPPING_LINK = "https://raw.githubusercontent.com/niessner/Matterport/master/metadata/category_mapping.tsv" 9 | CATEGORY_INDEX_MAPPING_PATH = ( 10 | Path(__file__).parent.absolute().joinpath("category_index_mapping.yaml") 11 | ) 12 | 13 | 14 | def get_category_index_mapping(): 15 | df = pd.read_csv(CATEGORY_MAPPING_LINK, sep="\t") 16 | df.replace(np.nan, "", inplace=True) # replace empty cells with empty string 17 | 18 | category_index_mapping = {} 19 | for _, row in df.iterrows(): 20 | index = ( 21 | row["index"] - 1 22 | ) # index actually starts from 0, but in the .tsv file it starts from 1 23 | 24 | mappings = {} 25 | for key, value in row.items(): 26 | if key == "index": 27 | continue 28 | mappings[key] = value 29 | 30 | category_index_mapping[index] = mappings 31 | 32 | with open(CATEGORY_INDEX_MAPPING_PATH, "w") as f: 33 | yaml.dump(category_index_mapping, f) 34 | 35 | 36 | def load_category_index_mapping(): 37 | with open(CATEGORY_INDEX_MAPPING_PATH, "r") as f: 38 | category_index_mapping = yaml.load(f, Loader=yaml.FullLoader) 39 | return category_index_mapping 40 | 41 | 42 | if __name__ == "__main__": 43 | get_category_index_mapping() 44 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/file_utils.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | from typing import Tuple, List 4 | 5 | 6 | def read_matterport_image_file(image_filepath) -> Image.Image: 7 | image = Image.open(image_filepath) 8 | return image 9 | 10 | 11 | def read_matterport_depth_file(depth_filepath) -> Tuple[np.ndarray, Image.Image]: 12 | depth_image = Image.open(depth_filepath) 13 | SCALE_FACTOR = 4000 # https://github.com/vsislab/Matterport3D-Layout/issues/4 14 | depth = np.asarray(depth_image) / SCALE_FACTOR 15 | return depth, depth_image 16 | 17 | 18 | def read_matterport_pose_file(pose_filepath) -> np.ndarray: 19 | # NOTE: returns the pose from camera frame to the world frame 20 | with open(pose_filepath, "r") as file: 21 | lines = file.readlines() 22 | T_cam_to_world = np.array( 23 | [list(map(float, line.split(" ")[:-1])) for line in lines] 24 | ) 25 | return T_cam_to_world 26 | 27 | 28 | def read_matterport_intrinsics_file(intrinsics_filepath): 29 | with open(intrinsics_filepath, "r") as file: 30 | lines = file.readlines() 31 | line = lines[0] 32 | intrinsics = line.split(" ") 33 | 34 | width = int(intrinsics[0]) 35 | height = int(intrinsics[1]) 36 | fx = float(intrinsics[2]) 37 | fy = float(intrinsics[3]) 38 | cx = float(intrinsics[4]) 39 | cy = float(intrinsics[5]) 40 | d = [float(i) for i in intrinsics[6:]] 41 | 42 | return width, height, fx, fy, cx, cy, d 43 | 44 | 45 | class MatterportFilenameBridge: 46 | def __init__(self, frame_identifiers): 47 | self._frame_identifiers = frame_identifiers 48 | 49 | @classmethod 50 | def from_image_filename(cls, image_filename): 51 | frame_identifiers = image_filename.split(".")[0].split("_") 52 | frame_identifiers[1] = frame_identifiers[1][1:] 53 | return cls(frame_identifiers) 54 | 55 | @classmethod 56 | def from_pose_filename(cls, pose_filename): 57 | frame_identifiers = pose_filename.split(".")[0].split("_") 58 | frame_identifiers.remove("pose") 59 | return cls(frame_identifiers) 60 | 61 | @property 62 | def image_filename(self): 63 | return ( 64 | self._frame_identifiers[0] 65 | + "_i" 66 | + self._frame_identifiers[1] 67 | + "_" 68 | + self._frame_identifiers[2] 69 | + ".jpg" 70 | ) 71 | 72 | @property 73 | def depth_filename(self): 74 | return ( 75 | self._frame_identifiers[0] 76 | + "_d" 77 | + self._frame_identifiers[1] 78 | + "_" 79 | + self._frame_identifiers[2] 80 | + ".png" 81 | ) 82 | 83 | @property 84 | def pose_filename(self): 85 | return ( 86 | self._frame_identifiers[0] 87 | + "_pose_" 88 | + self._frame_identifiers[1] 89 | + "_" 90 | + self._frame_identifiers[2] 91 | + ".txt" 92 | ) 93 | 94 | 95 | import re 96 | from .matterport_object_bounding_box import MatterportObjectBoundingBox 97 | 98 | from .category_mapping import load_category_index_mapping 99 | 100 | MATTERPORT_CATEGORY_INDEX_MAPPING = load_category_index_mapping() 101 | 102 | 103 | def read_matterport_object_bounding_boxes( 104 | house_filepath, 105 | category_taxonomies=("category", "mpcat40"), 106 | ): 107 | """ 108 | Reads the object bounding box labels in a .house file. 109 | 110 | Args: 111 | house_filepath: A string that specifies the path to the .house file. 112 | category_taxonomies: A tuple of strings that specifies the taxonomies to use for 113 | categorizing the objects. 114 | 115 | Returns: 116 | A dictionary that maps each taxonomy to a dictionary mapping that taxonomy's labels 117 | to a list of the corresponding bounding boxes. 
118 | """ 119 | for t in category_taxonomies: 120 | assert t in MATTERPORT_CATEGORY_INDEX_MAPPING[0], "Invalid taxonomy {}".format( 121 | t 122 | ) 123 | 124 | with open(house_filepath) as house_file: 125 | lines = house_file.readlines() 126 | 127 | boxes = [] 128 | for line in lines: 129 | if line[0] == "O": 130 | data = re.split(r" +", line) 131 | 132 | boxes.append( 133 | MatterportObjectBoundingBox( 134 | category_index=int(data[3]), 135 | center=np.array(data[4:7], dtype=float), 136 | a1=np.array(data[7:10], dtype=float), 137 | a2=np.array(data[10:13], dtype=float), 138 | r=np.array(data[13:16], dtype=float), 139 | ) 140 | ) 141 | 142 | out = {t: {} for t in category_taxonomies} 143 | for box in boxes: 144 | for t in category_taxonomies: 145 | try: 146 | t_label = MATTERPORT_CATEGORY_INDEX_MAPPING[box.category_index][t] 147 | if t_label not in out[t]: 148 | out[t][t_label] = [box] 149 | else: 150 | out[t][t_label].append(box) 151 | except KeyError: 152 | print( 153 | "[warning]: bounding box with invalid category_index {}".format( 154 | box.category_index 155 | ) 156 | ) 157 | 158 | return out 159 | 160 | 161 | from plyfile import PlyData 162 | 163 | 164 | def read_matterport_labeled_points( 165 | mesh_ply_filepath, 166 | category_taxonomies=("category", "mpcat40"), 167 | ): 168 | """ 169 | Gets labeled points from the .ply house segmentation file. 170 | Since the faces of the mesh are labeled, we compute the points as the mean of the vertices of the faces. 171 | 172 | Args: 173 | house_filepath: A string that specifies the path to the .house file. 174 | category_taxonomies: A tuple of strings that specifies the taxonomies to use for 175 | categorizing the objects. 176 | 177 | Returns: 178 | First returns a dictionary that maps each taxonomy to a dictionary mapping that taxonomy's labels 179 | to a list of the corresponding points. Second, returns the points as a numpy array. 
180 | """ 181 | for t in category_taxonomies: 182 | assert t in MATTERPORT_CATEGORY_INDEX_MAPPING[0], "Invalid taxonomy {}".format( 183 | t 184 | ) 185 | 186 | plydata = PlyData.read(mesh_ply_filepath) 187 | vertex_xyz = np.zeros((len(plydata["vertex"]), 3)) 188 | vertex_xyz[:, 0] = plydata["vertex"]["x"] 189 | vertex_xyz[:, 1] = plydata["vertex"]["y"] 190 | vertex_xyz[:, 2] = plydata["vertex"]["z"] 191 | 192 | face_vertex_inds = np.vstack( # NOTE: this computation is slow 193 | plydata["face"]["vertex_indices"] 194 | ) 195 | face_center_xyz = np.mean(vertex_xyz[face_vertex_inds], axis=1) 196 | face_category_ids = plydata["face"]["category_id"] 197 | 198 | unique_category_ids = np.unique(face_category_ids) 199 | 200 | out = {t: {} for t in category_taxonomies} 201 | for category_id in unique_category_ids: 202 | for t in category_taxonomies: 203 | try: 204 | # NOTE: use category_id - 1 because for some reason the ply file 205 | # has category ids that are 1-indexed 206 | t_label = MATTERPORT_CATEGORY_INDEX_MAPPING[category_id - 1][t] 207 | face_inds = np.where(face_category_ids == category_id)[0] 208 | 209 | if t_label not in out[t]: 210 | out[t][t_label] = face_inds 211 | else: 212 | out[t][t_label] = np.concatenate((out[t][t_label], face_inds)) 213 | except KeyError: 214 | print("[warning]: face with invalid category_id {}".format(category_id)) 215 | 216 | return out, face_center_xyz 217 | 218 | 219 | # https://github.com/niessner/Matterport/blob/master/data_organization.md 220 | # fmt: off 221 | MATTERPORT_REGION_NAME_MAPPING = { 222 | "a": "bathroom", # (should have a toilet and a sink) 223 | "b": "bedroom", 224 | "c": "closet", 225 | "d": "dining room", # (includes “breakfast rooms” other rooms people mainly eat in) 226 | "e": "entryway/foyer/lobby", # (should be the front door, not any door) 227 | 228 | # "f": "familyroom", # (should be a room that a family hangs out in, not any area with couches) 229 | "f": "living room", 230 | 231 | "g": "garage", 232 | "h": "hallway", 233 | "i": "library", # (should be room like a library at a university, not an individual study) 234 | "j": "laundryroom/mudroom", # (place where people do laundry, etc.) 235 | "k": "kitchen", 236 | "l": "living room", # (should be the main “showcase” living room in a house, not any area with couches) 237 | "m": "meetingroom/conferenceroom", 238 | 239 | # "n": "lounge", # (any area where people relax in comfy chairs/couches that is not the family room or living room 240 | "n": "living room", 241 | 242 | "o": "office", # (usually for an individual, or a small set of people) 243 | "p": "porch/terrace/deck/driveway", # (must be outdoors on ground level) 244 | "r": "rec/game", # (should have recreational objects, like pool table, etc.) 245 | "s": "stairs", 246 | 247 | # "t": "toilet", # (should be a small room with ONLY a toilet) 248 | "t": "bathroom", 249 | 250 | "u": "utilityroom/toolroom", 251 | "v": "tv", # (must have theater-style seating) 252 | "w": "workout/gym/exercise", 253 | "x": "outdoor", # areas containing grass, plants, bushes, trees, etc. 254 | "y": "balcony", # (must be outside and must not be on ground floor) 255 | "B": "bar", 256 | "C": "classroom", 257 | "D": "dining booth", 258 | "S": "spa/sauna", 259 | 260 | "z": "other room", # (it is clearly a room, but the function is not clear) 261 | "Z": "junk", # (reflections of mirrors, random points floating in space, etc.) 
262 | "-": "no label", 263 | } 264 | # fmt: on 265 | 266 | from .matterport_region_bounding_box import MatterportRegionBoundingBox 267 | 268 | 269 | def read_matterport_region_bounding_boxes( 270 | house_filepath, 271 | ): 272 | """ 273 | Reads the region bounding boxes in a .house file. 274 | 275 | Args: 276 | house_filepath: A string that specifies the path to the .house file. 277 | 278 | Returns: 279 | A dictionary that maps each region label to a list of the corresponding bounding boxes. 280 | """ 281 | with open(house_filepath) as house_file: 282 | lines = house_file.readlines() 283 | 284 | out = {} 285 | for line in lines: 286 | if line[0] == "R": 287 | data = re.split(r" +", line) 288 | 289 | min_bound = np.array(data[9:12], dtype=float) 290 | max_bound = np.array(data[12:15], dtype=float) 291 | if np.any(min_bound == max_bound): 292 | PAD = 1.0 293 | print( 294 | f"warning: region label box with zero volumn, padding each bound by {PAD}" 295 | ) 296 | min_bound = min_bound - PAD 297 | max_bound = max_bound + PAD 298 | 299 | box = MatterportRegionBoundingBox( 300 | label=MATTERPORT_REGION_NAME_MAPPING[data[5]], 301 | min_bound=min_bound, 302 | max_bound=max_bound, 303 | ) 304 | 305 | if box.label not in out: 306 | out[box.label] = [box] 307 | else: 308 | out[box.label].append(box) 309 | return out 310 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/generate_tag_map_from_matterport_scan.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import numpy as np 4 | from tqdm import tqdm 5 | from typing import Dict, Union, Optional 6 | 7 | import uuid 8 | from tag_mapping import TagMap, TagMapEntry 9 | from tag_mapping.models import ImageTagger 10 | from tag_mapping.filtering import valid_depth_frame 11 | from tag_mapping.datasets.matterport import ( 12 | read_matterport_image_file, 13 | read_matterport_depth_file, 14 | read_matterport_pose_file, 15 | read_matterport_intrinsics_file, 16 | MatterportFilenameBridge, 17 | ) 18 | 19 | 20 | def generate_tag_map_from_matterport_scan( 21 | params: Dict, 22 | tagging_model: ImageTagger, 23 | scan_dir: Union[str, os.PathLike], 24 | output_dir: Union[str, os.PathLike], 25 | logger: Optional[logging.Logger] = None, 26 | ) -> None: 27 | """ 28 | Generate a tag map from a matterport scan. 29 | 30 | Args: 31 | params: Dictionary of parameters for tag map generation. 32 | tagging_model: The image tagging model which defines the method filtered_tag_image(). 33 | scan_dir: Path to the matterport scan directory. 34 | output_dir: Path of the directory to save the tag map. 35 | logger: Logger to use, if None a logger will be created at debug level. 
36 | """ 37 | if logger is None: 38 | logger = logging.getLogger(__name__) 39 | logger.addHandler(logging.StreamHandler()) 40 | logger.setLevel(logging.DEBUG) 41 | 42 | scan_name = os.path.basename(scan_dir) 43 | logger.info(f"generating tag map from matterport scan {scan_name}") 44 | 45 | images_dir = os.path.join(scan_dir, "undistorted_color_images") 46 | depths_dir = os.path.join(scan_dir, "undistorted_depth_images") 47 | poses_dir = os.path.join(scan_dir, "matterport_camera_poses") 48 | intrinsics_dir = os.path.join(scan_dir, "matterport_camera_intrinsics") 49 | logger.info( 50 | f"reading images, depth images, and poses from: {images_dir}\n{depths_dir}\n{poses_dir}" 51 | ) 52 | 53 | # Averge intrinsics across all frames to get an estimate for the scan 54 | intrinsics = [] 55 | for filename in os.listdir(intrinsics_dir): 56 | intrinsics_filepath = os.path.join(intrinsics_dir, filename) 57 | width, height, fx, fy, cx, cy, d = read_matterport_intrinsics_file( 58 | intrinsics_filepath 59 | ) 60 | intrinsics.append([width, height, fx, fy]) 61 | intrinsics = np.array(intrinsics) 62 | intrinsics = np.mean(intrinsics, axis=0) 63 | width, height, fx, fy = intrinsics 64 | logger.info( 65 | f"mean intrinsics over the scan: {width:.0f} {height:.0f} {fx:.2f} {fy:.2f}" 66 | ) 67 | 68 | # Pack tag map metadata 69 | tag_map_metadata = { 70 | "scan_name": scan_name, 71 | "intrinsics": { 72 | "width": width, 73 | "height": height, 74 | "fx": fx, 75 | "fy": fy, 76 | "near_dist": params["matterport_viewpoint_near_dist"], 77 | }, 78 | "tagging_model": tagging_model.__class__.__name__, 79 | } 80 | 81 | # Start tag map generation 82 | logger.info("starting tag map generation") 83 | tag_map = TagMap(metadata=tag_map_metadata) 84 | 85 | skipped_frames = [] 86 | 87 | for image_filename in tqdm(os.listdir(images_dir)): 88 | filename_bridge = MatterportFilenameBridge.from_image_filename(image_filename) 89 | depth_filename = filename_bridge.depth_filename 90 | pose_filename = filename_bridge.pose_filename 91 | 92 | image = read_matterport_image_file(os.path.join(images_dir, image_filename)) 93 | depth, depth_image = read_matterport_depth_file( 94 | os.path.join(depths_dir, depth_filename) 95 | ) 96 | T_cam_to_world = read_matterport_pose_file( 97 | os.path.join(poses_dir, pose_filename) 98 | ) 99 | 100 | # skip frames with invalid depth values 101 | if not valid_depth_frame(depth, **params["depth_filtering_params"]): 102 | skipped_frames.append((image_filename, depth_filename)) 103 | continue 104 | 105 | # compute the tags and their confidences 106 | tags, confidences = tagging_model.filtered_tag_image( 107 | image, params=params["filtered_tagging_params"] 108 | ) 109 | 110 | # information to store about the depth frame 111 | depth_percentiles = { 112 | str(q): dq 113 | for q, dq in zip( 114 | params["stored_depth_percentiles"], 115 | np.quantile(depth, params["stored_depth_percentiles"]), 116 | ) 117 | } 118 | 119 | # pack data to store within a TagMapEntry and add it to the tag map 120 | entry_uuid = uuid.uuid4() 121 | entry = TagMapEntry( 122 | pose=T_cam_to_world, 123 | uuid=entry_uuid, 124 | extras={ 125 | "depth_percentiles": depth_percentiles, 126 | }, 127 | ) 128 | tag_map.add_entry(entry) 129 | 130 | # add associated tags to the database 131 | for tag, conf in zip(tags, confidences): 132 | tag_map.add_tag( 133 | tag, 134 | entry_uuid, 135 | extras={}, 136 | ) 137 | 138 | logger.info( 139 | f"finished tag map generation, skipped {len(skipped_frames)} frames with invalid depth values" 140 | ) 141 | 142 | # 
Save the tag map 143 | save_path = os.path.join(output_dir, f"{scan_name}.tagmap") 144 | tag_map.save(save_path) 145 | logger.info(f"saved tag map to {save_path}") 146 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/matterport_object_bounding_box.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import numpy as np 4 | import open3d as o3d 5 | 6 | 7 | @dataclass(frozen=True) 8 | class MatterportObjectBoundingBox: 9 | category_index: int 10 | center: np.ndarray 11 | a1: np.ndarray 12 | a2: np.ndarray 13 | r: np.ndarray 14 | 15 | def corners(self): 16 | """ 17 | Corners ordered following convention defined in Pytorch3D 18 | https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/ops/iou_box3d.py 19 | """ 20 | a1 = self.a1 / np.linalg.norm(self.a1) 21 | a2 = self.a2 / np.linalg.norm(self.a2) 22 | r1, r2, r3 = self.r 23 | a3 = np.cross(self.a1, self.a2) 24 | return np.array( 25 | [ 26 | self.center - r1 * a1 - r2 * a2 - r3 * a3, 27 | self.center + r1 * a1 - r2 * a2 - r3 * a3, 28 | self.center + r1 * a1 + r2 * a2 - r3 * a3, 29 | self.center - r1 * a1 + r2 * a2 - r3 * a3, 30 | 31 | self.center - r1 * a1 - r2 * a2 + r3 * a3, 32 | self.center + r1 * a1 - r2 * a2 + r3 * a3, 33 | self.center + r1 * a1 + r2 * a2 + r3 * a3, 34 | self.center - r1 * a1 + r2 * a2 + r3 * a3, 35 | ] 36 | ) 37 | 38 | def o3d_lineset(self, color=(0, 1, 0)): 39 | vertices = self.corners().astype(np.float64) 40 | 41 | lines = np.array( 42 | [ 43 | [0, 1], 44 | [0, 3], 45 | [1, 2], 46 | [2, 3], 47 | [0, 4], 48 | [1, 5], 49 | [2, 6], 50 | [3, 7], 51 | [4, 5], 52 | [4, 7], 53 | [5, 6], 54 | [6, 7], 55 | ] 56 | ).astype(np.int32) 57 | 58 | colors = np.tile(color, (12, 1)).astype(np.float64) 59 | 60 | lineset = o3d.geometry.LineSet() 61 | lineset.points = o3d.utility.Vector3dVector(vertices) 62 | lineset.lines = o3d.utility.Vector2iVector(lines) 63 | lineset.colors = o3d.utility.Vector3dVector(colors) 64 | 65 | return lineset 66 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/matterport_region_bounding_box.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import numpy as np 4 | import open3d as o3d 5 | 6 | 7 | @dataclass(frozen=True) 8 | class MatterportRegionBoundingBox: 9 | label: str 10 | min_bound: np.array 11 | max_bound: np.array 12 | 13 | def corners(self): 14 | """ 15 | Corners ordered following convention defined in Pytorch3D 16 | https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/ops/iou_box3d.py 17 | """ 18 | return np.array( 19 | [ 20 | self.min_bound, 21 | [self.max_bound[0], self.min_bound[1], self.min_bound[2]], 22 | [self.max_bound[0], self.max_bound[1], self.min_bound[2]], 23 | [self.min_bound[0], self.max_bound[1], self.min_bound[2]], 24 | [self.min_bound[0], self.min_bound[1], self.max_bound[2]], 25 | [self.max_bound[0], self.min_bound[1], self.max_bound[2]], 26 | self.max_bound, 27 | [self.min_bound[0], self.max_bound[1], self.max_bound[2]], 28 | ] 29 | ) 30 | 31 | def o3d_lineset(self, color=(0, 1, 0)): 32 | vertices = self.corners().astype(np.float64) 33 | 34 | lines = np.array( 35 | [ 36 | [0, 1], 37 | [0, 3], 38 | [1, 2], 39 | [2, 3], 40 | [0, 4], 41 | [1, 5], 42 | [2, 6], 43 | [3, 7], 44 | [4, 5], 45 | [4, 7], 46 | [5, 6], 47 | [6, 7], 48 | ] 49 | ).astype(np.int32) 
50 | 51 | colors = np.tile(color, (12, 1)).astype(np.float64) 52 | 53 | lineset = o3d.geometry.LineSet() 54 | lineset.points = o3d.utility.Vector3dVector(vertices) 55 | lineset.lines = o3d.utility.Vector2iVector(lines) 56 | lineset.colors = o3d.utility.Vector3dVector(colors) 57 | 58 | return lineset 59 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/mp_region_ram_tags_mapping.py: -------------------------------------------------------------------------------- 1 | MP_REGION_RAM_TAGS_MAPPING = { 2 | "bathroom": ["bathroom"], 3 | 4 | "bedroom": ["bedroom"], 5 | 6 | "closet": ["closet"], 7 | 8 | "dining room": ["dining room"], 9 | 10 | # (should be the front door, not any door) 11 | "entryway/foyer/lobby": [], 12 | 13 | # (should be a room that a family hangs out in, not any area with couches) 14 | # NOTE: mapped to "living room" 15 | "familyroom": [], 16 | 17 | "garage": ["garage"], 18 | 19 | "hallway": ["hallway"], 20 | 21 | # (should be room like a library at a university, not an individual study) 22 | "library": ["library"], 23 | 24 | # (place where people do laundry, etc.) 25 | "laundryroom/mudroom": ["laundry room"], 26 | 27 | "kitchen": ["kitchen"], 28 | 29 | # (should be the main “showcase” living room in a house, not any area with couches) 30 | "living room": ["living room"], 31 | 32 | "meetingroom/conferenceroom": ["meeting room"], 33 | 34 | # (any area where people relax in comfy chairs/couches that is not the family room or living room 35 | # NOTE: mapped to "living room" 36 | "lounge": [], 37 | 38 | # (usually for an individual, or a small set of people) 39 | "office": ["office", "home office"], 40 | 41 | # (must be outdoors on ground level) 42 | "porch/terrace/deck/driveway": ["porch", "terrace", "deck", "driveway"], 43 | 44 | # (should have recreational objects, like pool table, etc.) 45 | "rec/game": ["recreation room"], 46 | 47 | "stairs": ["stairs", "stairwell"], 48 | 49 | # (should be a small room with ONLY a toilet) 50 | # NOTE: mapped to "bathroom" 51 | "toilet": [], 52 | 53 | "utilityroom/toolroom": ["utility room"], 54 | 55 | # (must have theater-style seating) 56 | "tv": ["cinema", "home theater", "theater"], 57 | 58 | "workout/gym/exercise": ["gym"], 59 | 60 | # areas containing grass, plants, bushes, trees, etc. 
61 | "outdoor": ["outdoor"], 62 | 63 | # (must be outside and must not be on ground floor) 64 | "balcony": ["balcony"], 65 | 66 | "bar": ["bar"], 67 | 68 | "classroom": ["classroom"], 69 | 70 | "dining booth": [], 71 | 72 | "spa/sauna": ["sauna"], 73 | 74 | # (it is clearly a room, but the function is not clear) 75 | "other room": [], 76 | 77 | "junk": [], 78 | 79 | "no label": [], 80 | } 81 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/datasets/matterport/mpcat40_ram_tags_mapping.py: -------------------------------------------------------------------------------- 1 | MPCAT40_RAM_TAGS_MAPPING = { 2 | "wall": ["glass wall", "molding", "room divider", "tile wall", "wall", "wood wall"], 3 | "floor": ["bath mat", "carpet", "doormat", "floor", "floor mat", "landing", "mat"], 4 | "chair": [ 5 | "armchair", 6 | "beach chair", 7 | "bean bag chair", 8 | "beanbag", 9 | "chair", 10 | "computer chair", 11 | "feeding chair", 12 | "folding chair", 13 | "office chair", 14 | "rocking chair", 15 | "swivel chair", 16 | "throne", 17 | ], 18 | "door": [ 19 | "archway", 20 | "barn door", 21 | "bathroom door", 22 | "door", 23 | "doorway", 24 | "elevator door", 25 | "garage door", 26 | "glass door", 27 | "screen door", 28 | "shower door", 29 | ], 30 | "table": [ 31 | "altar", 32 | "billiard table", 33 | "changing table", 34 | "cocktail table", 35 | "computer desk", 36 | "dinning table", 37 | "foosball", 38 | "glass table", 39 | "kitchen table", 40 | "office desk", 41 | "picnic table", 42 | "poker table", 43 | "round table", 44 | "side table", 45 | "stand", 46 | "table", 47 | "vanity", 48 | "workbench", 49 | "writing desk", 50 | ], 51 | "picture": [ 52 | "art", 53 | "art print", 54 | "couple photo", 55 | "decorative picture", 56 | "drawing", 57 | "family photo", 58 | "group photo", 59 | "movie poster", 60 | "oil painting", 61 | "photo", 62 | "photo frame", 63 | "picture", 64 | "picture frame", 65 | "portrait", 66 | "poster", 67 | "publicity portrait", 68 | "reflection", 69 | "wedding photo", 70 | ], 71 | "cabinet": [ 72 | "armoire", 73 | "bathroom cabinet", 74 | "cabinet", 75 | "cabinetry", 76 | "closet", 77 | "file cabinet", 78 | "kitchen cabinet", 79 | "medicine cabinet", 80 | "side cabinet", 81 | "tv cabinet", 82 | "wine cabinet", 83 | ], 84 | "cushion": ["pillow", "throw pillow"], 85 | "window": [ 86 | "bathroom window", 87 | "bedroom window", 88 | "kitchen window", 89 | "office window", 90 | "shop window", 91 | "skylight", 92 | "window", 93 | "window frame", 94 | "window screen", 95 | ], 96 | "sofa": ["couch", "loveseat"], 97 | "bed": [ 98 | "bed", 99 | "bed frame", 100 | "bunk bed", 101 | "canopy bed", 102 | "cat bed", 103 | "dog bed", 104 | "futon", 105 | "hammock", 106 | "headboard", 107 | "hospital bed", 108 | "infant bed", 109 | "mattress", 110 | ], 111 | "curtain": ["curtain", "shower curtain"], 112 | "chest_of_drawers": ["bureau", "drawer", "dresser", "nightstand"], 113 | "plant": ["bush", "flower", "grass", "houseplant", "plant", "tree"], 114 | "sink": ["basin", "bathroom sink", "sink"], 115 | "stairs": ["ladder", "stair", "stairs", "stairwell"], 116 | "ceiling": ["ceiling", "roof"], 117 | "toilet": ["bidet", "toilet bowl", "toilet seat"], 118 | "stool": ["bar stool", "footrest", "music stool", "step stool", "stool"], 119 | "towel": [ 120 | "bath towel", 121 | "beach towel", 122 | "face towel", 123 | "hand towel", 124 | "paper towel", 125 | "towel", 126 | ], 127 | "mirror": [ 128 | "bathroom mirror", 129 | "car mirror", 130 | "cosmetics 
mirror", 131 | "mirror", 132 | "rearview mirror", 133 | "view mirror", 134 | ], 135 | "tv_monitor": [ 136 | "bulletin board", 137 | "computer monitor", 138 | "computer screen", 139 | "display", 140 | "monitor", 141 | "television", 142 | "whiteboard", 143 | ], 144 | "shower": ["shower", "shower door", "shower head"], 145 | "column": ["pillar", "post"], 146 | "bathtub": ["bath", "jacuzzi"], 147 | "counter": [ 148 | "bar", 149 | "buffet", 150 | "counter", 151 | "counter top", 152 | "island", 153 | "kitchen counter", 154 | "kitchen island", 155 | "wet bar", 156 | ], 157 | "fireplace": ["fireplace", "mantle"], 158 | "lighting": [ 159 | "bedside lamp", 160 | "chandelier", 161 | "christmas light", 162 | "lamp", 163 | "lamp shade", 164 | "lantern", 165 | "light", 166 | "light fixture", 167 | "lighting", 168 | "neon light", 169 | "oil lamp", 170 | "stage light", 171 | "table lamp", 172 | "wall lamp", 173 | ], 174 | "beam": ["beam"], 175 | "railing": ["balustrade", "rail"], 176 | "shelving": ["bookshelf", "easel", "shelf", "shelve", "spice rack"], 177 | "blinds": ["blind"], 178 | "gym_equipment": [ 179 | "barbell", 180 | "dumbbell", 181 | "stationary bicycle", 182 | "training bench", 183 | "treadmill", 184 | "weight", 185 | ], 186 | "seating": ["bench", "church bench", "park bench", "seat", "window seat"], 187 | "board_panel": ["board", "panel"], 188 | "furniture": ["armoire", "closet", "furniture"], 189 | "appliances": [ 190 | "appliance", 191 | "boiler", 192 | "coffee machine", 193 | "dish washer", 194 | "fridge", 195 | "gas stove", 196 | "hand dryer", 197 | "home appliance", 198 | "humidifier", 199 | "ice maker", 200 | "juicer", 201 | "microwave", 202 | "oven", 203 | "rice cooker", 204 | "sewing machine", 205 | "stove", 206 | "toaster", 207 | "vacuum", 208 | "waffle iron", 209 | "washer", 210 | "washing machine", 211 | ], 212 | "clothes": [ 213 | "baby clothe", 214 | "baseball hat", 215 | "bathrobe", 216 | "bathroom accessory", 217 | "bikini", 218 | "bikini top", 219 | "blouse", 220 | "christmas hat", 221 | "cloak", 222 | "clothing", 223 | "coat", 224 | "cocktail dress", 225 | "corset", 226 | "costume", 227 | "cowboy hat", 228 | "crop top", 229 | "denim jacket", 230 | "dress", 231 | "dress hat", 232 | "dress shirt", 233 | "dress shoe", 234 | "dress suit", 235 | "evening dress", 236 | "fur coat", 237 | "gown", 238 | "halter top", 239 | "hat", 240 | "headdress", 241 | "headscarf", 242 | "hoodie", 243 | "jacket", 244 | "jeans", 245 | "jockey cap", 246 | "kilt", 247 | "kimono", 248 | "lab coat", 249 | "lace dress", 250 | "laundry", 251 | "leather jacket", 252 | "maxi dress", 253 | "miniskirt", 254 | "overcoat", 255 | "pants", 256 | "pantyhose", 257 | "polo neck", 258 | "polo shirt", 259 | "raincoat", 260 | "robe", 261 | "safety vest", 262 | "scarf", 263 | "shirt", 264 | "ski jacket", 265 | "sports coat", 266 | "straw hat", 267 | "sun hat", 268 | "suspenders", 269 | "sweat pant", 270 | "sweater", 271 | "sweatshirt", 272 | "t shirt", 273 | "t-shirt", 274 | "trench coat", 275 | "underclothes", 276 | "vest", 277 | "visor", 278 | "waterproof jacket", 279 | "wedding dress", 280 | "wrap dress", 281 | ], 282 | "misc": [], 283 | "objects": [], 284 | "void": [], 285 | "unlabeled": [], 286 | } 287 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .lattice_navigation_graph import LatticeNavigationGraph, create_lattice_navigation_graph 2 | from 
.lattice_graph_utils import assign_label_box_lattice_graph_nodes, assign_proposal_box_lattice_graph_nodes -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/evaluation/lattice_graph_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import open3d as o3d 3 | 4 | from tag_mapping.utils import nearest_points_in_box, o3d_check_lines_collision 5 | from .lattice_navigation_graph import LatticeNavigationGraph 6 | 7 | 8 | def assign_label_box_lattice_graph_nodes( 9 | lattice_graph: LatticeNavigationGraph, 10 | rcs: o3d.t.geometry.RaycastingScene, 11 | box_corners: np.ndarray, 12 | enable_inflation: bool = True, 13 | ogn_dist_threshold: float = 1.0, 14 | ): 15 | """ 16 | Assigns to a labeled bounding box nodes of the lattice graph. 17 | The assignment includes nodes of the following types: 18 | - Nodes within the labeled bounding box 19 | - Nodes who's shortest straight line path to the labeled bounding box 20 | is within the object goal nav distance threshold and collision free 21 | 22 | The object goal nav distance threshold is nominally defined as 1m following 23 | the Habitat challenge evaluation criteria: 24 | https://aihabitat.org/challenge/2023/ 25 | 26 | Args: 27 | lattice_graph: LatticeNavigationGraph 28 | rcs: Open3d RaycastingScene 29 | box_corners: (8,3) array of labeled bounding box corners 30 | enable_inflation: If True, inflate the box to find additional points within 31 | the object goal nav distance threshold 32 | ogn_dist_threshold: Object goal nav distance threshold. 33 | Only used if enable_inflation is True. 34 | 35 | Returns: 36 | assigned_node_inds: List of node indices assigned to the labeled bounding box. 37 | Note that this list could be empty if no nodes are assigned 38 | """ 39 | nodes_xyz = lattice_graph.nodes_xyz 40 | 41 | # Construct the convex hull (i.e. 
minimum oriented bounding box) from the box_corners 42 | o3d_box_corners = o3d.geometry.PointCloud() 43 | o3d_box_corners.points = o3d.utility.Vector3dVector(box_corners) 44 | obb = o3d_box_corners.get_minimal_oriented_bounding_box() 45 | 46 | # Get the indicies of the nodes within the initial bounding box 47 | in_box_inds = obb.get_point_indices_within_bounding_box( 48 | o3d.utility.Vector3dVector(nodes_xyz) 49 | ) 50 | 51 | # Inflate the box to find additional points within the object goal nav distance 52 | if enable_inflation: 53 | inflated_obb_extent = obb.extent.copy() 54 | inflated_obb_extent += 2 * ogn_dist_threshold 55 | 56 | inflated_obb = o3d.geometry.OrientedBoundingBox() 57 | inflated_obb.center = obb.center.copy() 58 | inflated_obb.R = obb.R.copy() 59 | inflated_obb.extent = inflated_obb_extent 60 | 61 | in_inflated_box_inds = inflated_obb.get_point_indices_within_bounding_box( 62 | o3d.utility.Vector3dVector(nodes_xyz) 63 | ) 64 | 65 | # Consider now only the nodes in the inflated box but NOT in the original box 66 | near_box_inds = [ind for ind in in_inflated_box_inds if ind not in in_box_inds] 67 | 68 | if len(near_box_inds) > 0: 69 | near_box_nodes = nodes_xyz[near_box_inds].reshape(-1, 3) 70 | proj_box_nodes = nearest_points_in_box( 71 | box_corners, obb.center, near_box_nodes 72 | ) 73 | 74 | near_box_dists = np.linalg.norm(near_box_nodes - proj_box_nodes, axis=1) 75 | within_ogn_dist = near_box_dists <= ogn_dist_threshold 76 | 77 | collision_mask = o3d_check_lines_collision( 78 | rcs, near_box_nodes, proj_box_nodes 79 | ) 80 | 81 | valid_mask = np.logical_and(~collision_mask, within_ogn_dist) 82 | near_box_inds = np.array(near_box_inds)[valid_mask].tolist() 83 | 84 | assigned_node_inds = in_box_inds + near_box_inds 85 | else: 86 | assigned_node_inds = in_box_inds 87 | 88 | return assigned_node_inds 89 | 90 | 91 | def assign_proposal_box_lattice_graph_nodes( 92 | lattice_graph: LatticeNavigationGraph, 93 | rcs: o3d.t.geometry.RaycastingScene, 94 | box_corners: np.ndarray, 95 | ): 96 | """ 97 | Assigns to a proposed bounding box nodes of the lattice graph. 98 | 99 | First we check if the box already contains nodes, if it does then 100 | we return those nodes. Otherwise we inflate the box and find nodes 101 | nearby to the box. 102 | 103 | We only assign nearby nodes which are collision free to their projected 104 | point in the box and who's projected point is within that node's voxel. 105 | 106 | Args: 107 | lattice_graph: LatticeNavigationGraph 108 | rcs: Open3d RaycastingScene 109 | box_corners: (8,3) array of labeled bounding box corners 110 | 111 | Returns: 112 | assigned_node_inds: List of node indices assigned to the labeled bounding box. 113 | Note that this list could be empty if no nodes are assigned 114 | """ 115 | nodes_xyz = lattice_graph.nodes_xyz 116 | lattice_grid_res = lattice_graph.grid_resolution 117 | 118 | # Construct the convex hull (i.e. 
minimum oriented bounding box) from the box_corners 119 | o3d_box_corners = o3d.geometry.PointCloud() 120 | o3d_box_corners.points = o3d.utility.Vector3dVector(box_corners) 121 | obb = o3d_box_corners.get_minimal_oriented_bounding_box() 122 | 123 | # Get the indicies of the nodes within the initial bounding box 124 | in_box_inds = obb.get_point_indices_within_bounding_box( 125 | o3d.utility.Vector3dVector(nodes_xyz) 126 | ) 127 | 128 | # End if the box already contains nodes 129 | if len(in_box_inds) > 0: 130 | return in_box_inds 131 | 132 | # Find additional nodes by inflating the bounding box 133 | MAGIC_EXTENT_SCALING_CONSTANT = 2 * 1.414 # NOTE: 1.414 ~ sqrt(2) 134 | 135 | inflated_obb_extent = obb.extent.copy() 136 | inflated_obb_extent = np.maximum( 137 | inflated_obb_extent, MAGIC_EXTENT_SCALING_CONSTANT * lattice_grid_res 138 | ) 139 | 140 | inflated_obb = o3d.geometry.OrientedBoundingBox() 141 | inflated_obb.center = obb.center.copy() 142 | inflated_obb.R = obb.R.copy() 143 | inflated_obb.extent = inflated_obb_extent 144 | 145 | near_box_inds = inflated_obb.get_point_indices_within_bounding_box( 146 | o3d.utility.Vector3dVector(nodes_xyz) 147 | ) 148 | 149 | # Assign no nodes if inflating does not find any nodes 150 | if len(near_box_inds) == 0: 151 | return [] 152 | 153 | near_box_nodes = nodes_xyz[near_box_inds].reshape(-1, 3) 154 | proj_box_nodes = nearest_points_in_box(box_corners, obb.center, near_box_nodes) 155 | 156 | # Check if the projected points are within the node's voxel using infinity norm 157 | near_box_dists = np.linalg.norm(near_box_nodes - proj_box_nodes, axis=1, ord=np.inf) 158 | in_node_voxel = near_box_dists <= (lattice_grid_res / 2) 159 | 160 | collision_mask = o3d_check_lines_collision(rcs, near_box_nodes, proj_box_nodes) 161 | 162 | valid_mask = np.logical_and(~collision_mask, in_node_voxel) 163 | near_box_inds = np.array(near_box_inds)[valid_mask].tolist() 164 | 165 | return near_box_inds 166 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/filtering/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference_filters import * 2 | from .image_filters import * 3 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/filtering/image_filters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from typing import Iterable, Tuple, Optional 4 | 5 | 6 | def valid_depth_frame( 7 | depth_frame: np.ndarray, 8 | mean_threshold: Optional[float] = None, 9 | quantile_thresholds: Optional[Iterable[Tuple[float, float]]] = None, 10 | ) -> bool: 11 | """ 12 | Used to filter out frames of up-close views that are unlikely to be informative. 13 | 14 | Args: 15 | depth_frame: Depth frame to check. 16 | mean_threshold: minimum threshold on mean of the depth frame. 17 | Set to None to skip mean threshold check. 18 | quantile_thresholds: list of tuples of quantiles their minimum depth thresholds. 19 | Set to None to skip quantile threshold check. 20 | 21 | Returns: 22 | True if the depth frame is valid, False otherwise. 
23 | """ 24 | 25 | valid_depths_mask = np.logical_and(~np.isnan(depth_frame), ~np.isinf(depth_frame)) 26 | 27 | if not np.any(valid_depths_mask): 28 | return False 29 | 30 | if mean_threshold != None: 31 | if np.mean(depth_frame[valid_depths_mask]) < mean_threshold: 32 | return False 33 | 34 | if quantile_thresholds != None: 35 | quantiles = np.quantile( 36 | depth_frame[valid_depths_mask], [q for q, _ in quantile_thresholds] 37 | ) 38 | if np.any(quantiles < [thresh for _, thresh in quantile_thresholds]): 39 | return False 40 | 41 | return True 42 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/filtering/inference_filters.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | 3 | from PIL import Image 4 | 5 | 6 | def compute_unlikely_tags_center_crop_ensemble( 7 | image: Image.Image, 8 | image_tags: Iterable[str], 9 | cc_proportions: Iterable[float], 10 | tagging_model, 11 | ) -> Iterable[str]: 12 | """ 13 | Finds unlikely tags in a set of tags for an image by running the 14 | model on center cropped versions of the original image 15 | 16 | Args: 17 | image: original image 18 | image_tags: tags of the original image 19 | cc_proportions: list of border crop proportions 20 | tagging_model: tagging model 21 | 22 | Returns: 23 | set of unlikely tags 24 | """ 25 | 26 | def center_crop(img, crop_border_proportion): 27 | assert crop_border_proportion < 0.5 28 | width, height = img.size 29 | return img.crop( 30 | ( 31 | crop_border_proportion * width, 32 | crop_border_proportion * height, 33 | width * (1 - crop_border_proportion), 34 | height * (1 - crop_border_proportion), 35 | ) 36 | ) 37 | 38 | cc_images = [center_crop(image, ccp) for ccp in cc_proportions] 39 | 40 | unlikely_tags_set = set() 41 | for cc_image in cc_images: 42 | cc_image_tags = tagging_model.tag_image(cc_image)["tags"] 43 | 44 | unlikely_tags = [tag for tag in image_tags if tag not in cc_image_tags] 45 | unlikely_tags_set.update(unlikely_tags) 46 | 47 | return unlikely_tags_set 48 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/localization/__init__.py: -------------------------------------------------------------------------------- 1 | from .viewpoint import * 2 | from .clustering import * 3 | from .voxel_voting import * 4 | from .pipeline import * -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/localization/clustering.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import hdbscan 4 | import numpy as np 5 | import open3d as o3d 6 | 7 | 8 | def cluster_points_dbscan( 9 | points: Union[np.ndarray, o3d.geometry.PointCloud], **dbscan_kwargs 10 | ) -> np.ndarray: 11 | """ 12 | Cluster points using DBSCAN implementation from Open3D 13 | http://www.open3d.org/docs/release/python_api/open3d.geometry.PointCloud.html#open3d.geometry.PointCloud.cluster_dbscan 14 | 15 | Args: 16 | points: (N, 3) array of points or Open3D point cloud 17 | **dbscan_kwargs: keyword arguments to pass to dbscan 18 | 19 | Returns: 20 | (N,) array of cluster labels 21 | """ 22 | if isinstance(points, np.ndarray): 23 | pcd = o3d.geometry.PointCloud() 24 | pcd.points = o3d.utility.Vector3dVector(points) 25 | elif isinstance(points, o3d.geometry.PointCloud): 26 | pcd = points 27 | else: 28 | raise ValueError("points must be either an array or an Open3D 
point cloud") 29 | 30 | labels = np.array(pcd.cluster_dbscan(**dbscan_kwargs)).astype(np.int32) 31 | return labels 32 | 33 | 34 | def cluster_points_hdbscan( 35 | points: Union[np.ndarray, o3d.geometry.PointCloud], **hdbscan_kwargs 36 | ) -> np.ndarray: 37 | """ 38 | Cluster points using HDBSCAN implementation from hdbscan package 39 | https://github.com/scikit-learn-contrib/hdbscan 40 | 41 | Args: 42 | points: (N, 3) array of points or Open3D point cloud 43 | **hdbscan_kwargs: keyword arguments to pass to hdbscan 44 | 45 | Returns: 46 | (N,) array of cluster labels 47 | """ 48 | if isinstance(points, np.ndarray): 49 | X = points 50 | elif isinstance(points, o3d.geometry.PointCloud): 51 | X = np.asarray(points.points) 52 | else: 53 | raise ValueError("points must be either an array or an Open3D point cloud") 54 | 55 | hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs) 56 | 57 | labels = hdbscan_clusterer.fit_predict(X).astype(np.int32) 58 | return labels 59 | 60 | 61 | def cluster_points( 62 | points: Union[np.ndarray, o3d.geometry.PointCloud], 63 | algorithm: str, 64 | **algorithm_kwargs, 65 | ) -> np.ndarray: 66 | """ 67 | Cluster points using the specified algorithm 68 | 69 | Args: 70 | points: (N, 3) array of points or Open3D point cloud 71 | algorithm: algorithm to use for clustering 72 | one of 'dbscan' or 'hdbscan' 73 | **algorithm_kwargs: keyword arguments to pass to the clustering algorithm 74 | 75 | Returns: 76 | (N,) array of cluster labels 77 | """ 78 | if algorithm == "dbscan": 79 | labels = cluster_points_dbscan(points, **algorithm_kwargs) 80 | elif algorithm == "hdbscan": 81 | labels = cluster_points_hdbscan(points, **algorithm_kwargs) 82 | else: 83 | raise ValueError( 84 | "Invalid algorithm: {}. Must be dbscan or hdbscan".format(algorithm) 85 | ) 86 | return labels 87 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/localization/pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import open3d as o3d 3 | 4 | from typing import Any, Callable, Dict, Iterable, Optional 5 | 6 | from tag_mapping import TagMapEntry 7 | from .clustering import cluster_points 8 | from .viewpoint import Viewpoint 9 | from .voxel_voting import grid_voxel_voting 10 | 11 | 12 | def localization_pipeline( 13 | viewpoints: Iterable[Viewpoint], 14 | params: Dict[str, Any], 15 | verbose: bool = False, 16 | ) -> Dict[str, Any]: 17 | """ 18 | Tag map localization pipeline. 19 | 20 | Args: 21 | viewpoints: Iterable of viewpoints 22 | params: Dictionary of parameters for the pipeline. 23 | 24 | Returns: 25 | Dictionary of results from the pipeline. 26 | "voxel_center_points": (N,3) array of voxel center points 27 | "voxel_scores": (N,) array of scores for each voxel 28 | "level_bbxes": Dictionary mapping clustering level to the bounding boxes 29 | at that level. 30 | """ 31 | vv_params, cl_params = (params["voxel_voting"], params["clustering"]) 32 | 33 | ### Voxel voting 34 | if vv_params["viewpoint_weight"] == None: 35 | viewpoint_weight = None 36 | elif vv_params["viewpoint_weight"] == "confidence": 37 | viewpoint_weight = np.array([vp.extras["confidence"] for vp in viewpoints]) 38 | else: 39 | raise ValueError( 40 | 'Invalid viewpoint_weight {}. 
Must be None or "confidence"'.format( 41 | vv_params["viewpoint_weight"] 42 | ) 43 | ) 44 | 45 | voxel_center_points, votes = grid_voxel_voting( 46 | viewpoints, vv_params["voxel_size"], viewpoint_weight 47 | ) 48 | 49 | # handle case where voxel voting fills no voxels 50 | # then voxel_center_points and votes will be 51 | # (0,3) and (0,) respectively 52 | if voxel_center_points.shape[0] == 0: 53 | return { 54 | "voxel_center_points": voxel_center_points, 55 | "voxel_scores": votes, 56 | "level_bbxes": [], 57 | } 58 | 59 | if vv_params["scoring_method"] == "normalized_votes": 60 | voxel_scores = votes / np.max(votes) 61 | clustering_levels = cl_params["clustering_levels"] 62 | 63 | def score_to_votes(score): 64 | score = score * np.max(votes) 65 | if score == 0.0: 66 | return 1 67 | else: 68 | if score != int(score): 69 | return int(np.ceil(score)) 70 | else: 71 | return int(score) 72 | 73 | elif vv_params["scoring_method"] == "votes": 74 | voxel_scores = votes 75 | voxel_levels = np.unique(votes) 76 | clustering_levels = range(1, voxel_levels.max() + 1) 77 | score_to_votes = lambda score: score 78 | else: 79 | raise ValueError( 80 | 'Invalid scoring_method {}. Must be "normalized_votes" or "votes"'.format( 81 | vv_params["scoring_method"] 82 | ) 83 | ) 84 | 85 | ### Clustering 86 | if cl_params["algorithm"] == "dbscan": 87 | cluster_fn = lambda pcd: cluster_points( 88 | pcd, algorithm="dbscan", **cl_params["dbscan_kwargs"] 89 | ) 90 | elif cl_params["algorithm"] == "hdbscan": 91 | cluster_fn = lambda pcd: cluster_points( 92 | pcd, algorithm="hdbscan", **cl_params["hdbscan_kwargs"] 93 | ) 94 | else: 95 | raise ValueError( 96 | 'Invalid algorithm {}. Must be "dbscan" or "hdbscan"'.format( 97 | cl_params["algorithm"] 98 | ) 99 | ) 100 | 101 | def bb_fn(pcd, bb_type): 102 | if bb_type == "axis_aligned": 103 | box = pcd.get_axis_aligned_bounding_box() 104 | 105 | # Pad zero dimensions to avoid zero-volume bounding boxes 106 | if box.volume() == 0.0: 107 | if verbose: 108 | print( 109 | "[warning]: bounding box with zero volume, padding zero length dimensions to voxel_size." 110 | ) 111 | min_bound = box.get_min_bound() 112 | max_bound = box.get_max_bound() 113 | zero_dims = np.where(min_bound == max_bound)[0] 114 | min_bound[zero_dims] -= vv_params["voxel_size"] / 2 115 | max_bound[zero_dims] += vv_params["voxel_size"] / 2 116 | box = o3d.geometry.AxisAlignedBoundingBox(min_bound, max_bound) 117 | 118 | elif bb_type == "oriented": 119 | raise NotImplementedError( 120 | "Oriented bounding boxes not implemented yet." 121 | ) # TODO: implement? 122 | 123 | else: 124 | raise ValueError( 125 | 'Invalid bounding_box_type {}. 
Must be "axis_aligned" or "oriented"'.format( 126 | bb_type 127 | ) 128 | ) 129 | assert box.volume() > 0.0 130 | return box 131 | 132 | level_bbxes = [] 133 | for level in clustering_levels: 134 | level_pcd = o3d.geometry.PointCloud() 135 | level_pcd.points = o3d.utility.Vector3dVector( 136 | voxel_center_points[voxel_scores >= level] 137 | ) 138 | 139 | if len(level_pcd.points) == 0: 140 | if verbose: 141 | print( 142 | "[warning]: no more points past level {}, stopping.".format(level) 143 | ) 144 | break 145 | 146 | cluster_labels = cluster_fn(level_pcd) 147 | 148 | for i in range(cluster_labels.max() + 1): 149 | cluster_inds = np.where(cluster_labels == i)[0] 150 | cluster_pcd = level_pcd.select_by_index(cluster_inds) 151 | 152 | cluster_box = bb_fn(cluster_pcd, bb_type=cl_params["bounding_box_type"]) 153 | 154 | level_bbxes.append((level, cluster_box)) 155 | 156 | ## Apply non-max suppression to the clustered regions 157 | level_bbxes = sorted(level_bbxes, key=lambda x: x[0], reverse=True) 158 | remove_box = len(level_bbxes) * [False] 159 | for i in range(len(level_bbxes) - 1): 160 | l1, p1 = level_bbxes[i] 161 | for j in range(i + 1, len(level_bbxes)): 162 | # skip if p2 already marked for removal 163 | if remove_box[j]: 164 | continue 165 | 166 | l2, p2 = level_bbxes[j] 167 | 168 | if l1 == l2: 169 | continue 170 | 171 | p1_in_p2 = _box_contains_box(p2, p1) 172 | if p1_in_p2: 173 | remove_box[j] = True 174 | 175 | level_bbxes = [lb for i, lb in enumerate(level_bbxes) if not remove_box[i]] 176 | 177 | # Map scores used for clustering back to the more easily interpretable votes 178 | level_bbxes = [(score_to_votes(l), b) for l, b in level_bbxes] 179 | 180 | return { 181 | "voxel_center_points": voxel_center_points, 182 | "voxel_scores": voxel_scores, 183 | "level_bbxes": level_bbxes, # TODO return instead a list of scored bounding boxes 184 | } 185 | 186 | 187 | def tagmap_entries_to_viewpoints( 188 | entries: Iterable[TagMapEntry], 189 | intrinsics: Dict[str, Any], 190 | store_extras_keys: Iterable[str] = [], 191 | far_dist_fn: Optional[Callable[[TagMapEntry], float]] = None, 192 | near_dist_fn: Optional[Callable[[TagMapEntry], float]] = None, 193 | ) -> Iterable[Viewpoint]: 194 | """ 195 | Helper function to convert an iterable of tag map entries their corresponding 196 | viewpoints. 197 | 198 | Args: 199 | entries: Iterable of entries from a tag map. 200 | intrinsics: Dictionary of camera intrinsics, must define 201 | ["width", "height", "fx", "fy"] 202 | store_extras_keys: Iterable of keys of the entry extras to store in 203 | the viewpoint extras. 204 | far_dist_fn: Function to compute the viewpoint's far distance from a query entry. 
205 | If None, the viewpoint's far distance is set to intrinsics["far_dist"] 206 | near_dist_fn: Function to compute the viewpoint's near distance from a query entry 207 | If None, the viewpoint's near distance is set to intrinsics["near_dist"] 208 | """ 209 | 210 | if far_dist_fn == None: 211 | far_dist_fn = lambda entry: intrinsics["far_dist"] 212 | 213 | if near_dist_fn == None: 214 | near_dist_fn = lambda entry: intrinsics["near_dist"] 215 | 216 | viewpoints = [] 217 | for entry in entries: 218 | try: 219 | far_dist = far_dist_fn(entry) 220 | except Exception as e: 221 | print("Error in far_dist_fn, using value in intrinsics: {}.".format(e)) 222 | 223 | try: 224 | near_dist = near_dist_fn(entry) 225 | except Exception as e: 226 | print("Error in near_dist_fn, using value in intrinsics: {}.".format(e)) 227 | 228 | extras = {k: v for k, v in entry.extras.items() if k in store_extras_keys} 229 | 230 | if far_dist <= near_dist: 231 | # skip creating viewpoint if far_dist <= near_dist 232 | continue 233 | 234 | vp = Viewpoint.from_intrinsics( 235 | extrinsic_matrix=entry.pose, 236 | width=intrinsics["width"], 237 | height=intrinsics["height"], 238 | fx=intrinsics["fx"], 239 | fy=intrinsics["fy"], 240 | near_dist=near_dist, 241 | far_dist=far_dist, 242 | extras=extras, 243 | ) 244 | viewpoints.append(vp) 245 | 246 | return viewpoints 247 | 248 | 249 | def _box_contains_box(box1, box2): 250 | """ 251 | Helper function that returns True if box1 wholly contains box2. 252 | """ 253 | # TODO implement this for other box types 254 | if ( 255 | type(box1) != o3d.geometry.AxisAlignedBoundingBox 256 | or type(box2) != o3d.geometry.AxisAlignedBoundingBox 257 | ): 258 | raise NotImplementedError( 259 | "Unsupported box type, must be AxisAlignedBoundingBox" 260 | ) 261 | 262 | box1_min_bound, box1_max_bound = (box1.get_min_bound(), box1.get_max_bound()) 263 | box2_min_bound, box2_max_bound = (box2.get_min_bound(), box2.get_max_bound()) 264 | return np.all(box1_min_bound <= box2_min_bound) and np.all( 265 | box1_max_bound >= box2_max_bound 266 | ) 267 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/localization/viewpoint.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, Union 2 | 3 | import numpy as np 4 | import open3d as o3d 5 | 6 | 7 | class Viewpoint: 8 | """ 9 | A viewpoint where the local frame has Z forward 10 | following traditional camera frame conventions 11 | """ 12 | 13 | def __init__( 14 | self, 15 | extrinsic_matrix: np.ndarray, 16 | w_fov: float, 17 | h_fov: float, 18 | near_dist: Optional[float] = None, 19 | far_dist: Optional[float] = None, 20 | extras: Optional[dict] = None, 21 | ) -> None: 22 | """ 23 | Args: 24 | extrinsic_matrix: 4x4 transformation matrix from local to world frame 25 | w_fov: horizontal field of view in radians 26 | h_fov: vertical field of view in radians 27 | near_dist: distance to near plane 28 | far_dist: distance to far plane 29 | extras: optional dictionary of extra data 30 | """ 31 | self._extrinsic_matrix = extrinsic_matrix 32 | self._w_fov, self._h_fov = (w_fov, h_fov) 33 | 34 | self._near_dist, self._far_dist = (near_dist, far_dist) 35 | if near_dist != None and far_dist != None: 36 | assert near_dist < far_dist, "near_dist must be less than far_dist" 37 | 38 | self._extras = extras 39 | 40 | @classmethod 41 | def from_intrinsics( 42 | cls, 43 | extrinsic_matrix: np.ndarray, 44 | width: int, 45 | height: int, 46 | fx: float, 
47 | fy: float, 48 | near_dist: Optional[float] = None, 49 | far_dist: Optional[float] = None, 50 | extras: Optional[dict] = None, 51 | ) -> "Viewpoint": 52 | """ 53 | Construct a viewpoint for camera intrinsics parameters 54 | """ 55 | return cls( 56 | extrinsic_matrix, 57 | 2 * np.arctan(width / (2 * fx)), 58 | 2 * np.arctan(height / (2 * fy)), 59 | near_dist, 60 | far_dist, 61 | extras, 62 | ) 63 | 64 | def within_viewpoint(self, points: np.ndarray) -> np.ndarray: 65 | """ 66 | Returns a boolean array indicating whether each point is within the 67 | viewpoint's frustum 68 | 69 | Args: 70 | points: Nx3 array of points in the world frame 71 | 72 | Returns: 73 | (N,) boolean array 74 | """ 75 | # transform points to the viewpoint's local frame 76 | points_local = (points - self.origin) @ self.R 77 | 78 | # compute w and h ray angles for the grid points and filter out 79 | # points which are outside the field of view 80 | w = np.arctan(points_local[:, 0] / (points_local[:, 2] + 1e-6)) 81 | h = np.arctan(points_local[:, 1] / (points_local[:, 2] + 1e-6)) 82 | 83 | inside = np.logical_and( 84 | np.logical_and(w < (self._w_fov / 2), w > -(self._w_fov / 2)), 85 | np.logical_and(h < (self._h_fov / 2), h > -(self._h_fov / 2)), 86 | ) 87 | 88 | # check depth bounds 89 | d = points_local[:, 2] 90 | 91 | if self._near_dist != None: 92 | inside = np.logical_and(inside, d > self._near_dist) 93 | else: 94 | inside = np.logical_and(inside, d > 0) 95 | 96 | if self._far_dist != None: 97 | inside = np.logical_and(inside, d < self._far_dist) 98 | 99 | return inside 100 | 101 | @property 102 | def extras(self) -> Union[dict, None]: 103 | return self._extras 104 | 105 | @property 106 | def R(self) -> np.ndarray: 107 | """ 108 | Rotation matrix from the viewpoint's local frame to the world frame 109 | """ 110 | return self._extrinsic_matrix[:3, :3] 111 | 112 | @property 113 | def origin(self) -> np.ndarray: 114 | """ 115 | The origin of the viewpoint's local frame in the world frame 116 | """ 117 | return self._extrinsic_matrix[:3, 3] 118 | 119 | @property 120 | def fov(self) -> Tuple[float, float]: 121 | """ 122 | Returns the horizontal and vertical field of view in radians 123 | """ 124 | return self._w_fov, self._h_fov 125 | 126 | @property 127 | def bounding_rays(self) -> np.ndarray: 128 | """ 129 | Returns the four bounding rays expressed 130 | in the world frame 131 | 132 | Returns a 4x3 array of unit vectors with each row 133 | representing a ray in the order of: 134 | top-left corner 135 | bottom-left corner 136 | top-right corner 137 | bottom-right corner 138 | """ 139 | if not hasattr(self, "_bounding_rays"): 140 | dx, dy = (np.tan(self._w_fov / 2), np.tan(self._h_fov / 2)) 141 | 142 | rays = np.array( 143 | [ 144 | [-dx, -dy, 1.0], # top-left corner 145 | [-dx, dy, 1.0], # bottom-left corner 146 | [dx, -dy, 1.0], # top-right corner 147 | [dx, dy, 1.0], # bottom-right corner 148 | ] 149 | ) 150 | rays = rays / np.linalg.norm(rays, axis=1, keepdims=True) 151 | 152 | # transform rays to the world frame 153 | self._bounding_rays = rays @ self._extrinsic_matrix[:3, :3].T 154 | 155 | return self._bounding_rays 156 | 157 | @property 158 | def frustum_points(self) -> np.ndarray: 159 | """ 160 | Returns the eight points of the viewpoint's frustum 161 | 162 | Returns a 8x3 array of points in the order of: 163 | near top-left corner 164 | near bottom-left corner 165 | near top-right corner 166 | near bottom-right corner 167 | 168 | far top-left corner 169 | far bottom-left corner 170 | far top-right corner 171 
| far bottom-right corner 172 | 173 | NOTE: if near_dist or far_dist are None, the corresponding far and near 174 | dists are set to 0 and 1 respectively 175 | """ 176 | if not hasattr(self, "_frustum_points"): 177 | ez_world = self.R[:, -1] 178 | d = ez_world.T @ self.bounding_rays[0] 179 | near_factor = 0 if self._near_dist == None else self._near_dist / d 180 | far_factor = 1 if self._far_dist == None else self._far_dist / d 181 | 182 | self._frustum_points = np.array( 183 | [ 184 | self.origin + self.bounding_rays[0] * near_factor, 185 | self.origin + self.bounding_rays[1] * near_factor, 186 | self.origin + self.bounding_rays[2] * near_factor, 187 | self.origin + self.bounding_rays[3] * near_factor, 188 | self.origin + self.bounding_rays[0] * far_factor, 189 | self.origin + self.bounding_rays[1] * far_factor, 190 | self.origin + self.bounding_rays[2] * far_factor, 191 | self.origin + self.bounding_rays[3] * far_factor, 192 | ] 193 | ).astype(np.float64) 194 | 195 | return self._frustum_points 196 | 197 | @property 198 | def aabb(self) -> o3d.geometry.AxisAlignedBoundingBox: 199 | """ 200 | Returns the axis-aligned bounding box of the viewpoint's frustum 201 | """ 202 | if not hasattr(self, "_aabb"): 203 | self._aabb = o3d.geometry.AxisAlignedBoundingBox.create_from_points( 204 | o3d.utility.Vector3dVector(self.frustum_points) 205 | ) 206 | return self._aabb 207 | 208 | def o3d_lineset(self, color=(0, 0, 1)) -> o3d.geometry.LineSet: 209 | """ 210 | Return the an Open3D lineset of the viewpoint for visualization 211 | """ 212 | points = self.frustum_points 213 | 214 | lines = np.array( 215 | [ 216 | [0, 4], 217 | [1, 5], 218 | [2, 6], 219 | [3, 7], 220 | ] 221 | ).astype(np.int32) 222 | 223 | if self._near_dist != None: 224 | near_plane_lines = np.array( 225 | [ 226 | [0, 1], 227 | [0, 2], 228 | [1, 3], 229 | [2, 3], 230 | ] 231 | ).astype(np.int32) 232 | lines = np.concatenate([lines, near_plane_lines], axis=0) 233 | 234 | if self._far_dist != None: 235 | far_plane_lines = np.array( 236 | [ 237 | [4, 5], 238 | [4, 6], 239 | [5, 7], 240 | [6, 7], 241 | ] 242 | ).astype(np.int32) 243 | lines = np.concatenate([lines, far_plane_lines], axis=0) 244 | 245 | lineset = o3d.geometry.LineSet() 246 | lineset.points = o3d.utility.Vector3dVector(points) 247 | lineset.lines = o3d.utility.Vector2iVector(lines) 248 | lineset.colors = o3d.utility.Vector3dVector(np.tile(color, (lines.shape[0], 1))) 249 | 250 | return lineset 251 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/localization/voxel_voting.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Optional, Tuple 2 | from .viewpoint import Viewpoint 3 | 4 | import numpy as np 5 | 6 | 7 | def voxel_voting( 8 | points: np.ndarray, 9 | voxel_size: float, 10 | point_weights: Optional[np.ndarray] = None, 11 | ) -> Tuple[np.ndarray, np.ndarray]: 12 | """ 13 | Voxel voting based localization given a set of points. 14 | 15 | Args: 16 | points: (N,3) array of points 17 | voxel_size: Size of the voxels 18 | point_weights: (N,) array of weights for each point. If None, all points 19 | are weighted equally. 
20 | 21 | Returns: 22 | A point cloud of the voxel centers (N,3) and the votes for each voxel (N,) 23 | """ 24 | voxel_coords = np.floor(points / voxel_size) 25 | keys = voxel_coords.astype(np.int32) 26 | _, inds, inverse_inds, counts = np.unique( 27 | keys, axis=0, return_index=True, return_inverse=True, return_counts=True 28 | ) 29 | 30 | if point_weights is None: 31 | votes = counts 32 | else: 33 | votes = np.zeros(len(inds)) 34 | for i in range(len(inds)): 35 | votes[i] = np.sum(point_weights[inverse_inds == i]) 36 | 37 | voxel_centers = (voxel_coords[inds] + 0.5) * voxel_size 38 | 39 | return voxel_centers, votes 40 | 41 | 42 | def grid_voxel_voting( 43 | viewpoints: Iterable[Viewpoint], 44 | voxel_size: float, 45 | viewpoint_weight: Optional[Iterable[float]] = None, 46 | ) -> Tuple[np.ndarray, np.ndarray]: 47 | """ 48 | Voxel voting based localization given a set of viewpoints. 49 | 1. For each viewpoint, generate a set of grid points inside the viewpoint 50 | where the grid is aligned with the world frame 51 | 2. Merge grid points into a single point cloud 52 | 3. Voxelize the point cloud and count the number of points in each voxel 53 | where the count can be weighted by viewpoint_weight 54 | 55 | Args: 56 | viewpoints: Iterable of viewpoints 57 | voxel_size: Size of the voxels 58 | viewpoint_weight: Iterable of weights for each viewpoint. If None, all 59 | viewpoints are weighted equally. 60 | 61 | Returns: 62 | A point cloud of the voxel centers (N,3) and the votes for each voxel (N,) 63 | """ 64 | if viewpoint_weight is not None: 65 | assert len(viewpoint_weight) == len(viewpoints) 66 | 67 | grid_points = [] 68 | grid_point_weights = [] 69 | for i, vp in enumerate(viewpoints): 70 | aabb_max_bound = vp.aabb.get_max_bound() 71 | aabb_min_bound = vp.aabb.get_min_bound() 72 | 73 | range_N = np.ceil((aabb_max_bound - aabb_min_bound) / voxel_size) 74 | 75 | xx, yy, zz = np.meshgrid( 76 | voxel_size * np.arange(range_N[0]) + aabb_min_bound[0], 77 | voxel_size * np.arange(range_N[1]) + aabb_min_bound[1], 78 | voxel_size * np.arange(range_N[2]) + aabb_min_bound[2], 79 | ) 80 | 81 | vp_grid_points = np.concatenate( 82 | [c.reshape(-1, 1) for c in [xx, yy, zz]], axis=1 83 | ) 84 | 85 | # get only the points within the viewpoint 86 | inside = vp.within_viewpoint(vp_grid_points) 87 | vp_grid_points = vp_grid_points[inside] 88 | 89 | grid_points.append(vp_grid_points) 90 | 91 | if viewpoint_weight is not None: 92 | grid_point_weights.append( 93 | viewpoint_weight[i] * np.ones(vp_grid_points.shape[0]) 94 | ) 95 | 96 | if len(grid_points) == 0: 97 | return np.zeros((0, 3)), np.zeros((0,)) 98 | 99 | grid_points = np.concatenate(grid_points, axis=0) 100 | grid_point_weights = ( 101 | None if viewpoint_weight is None else np.concatenate(grid_point_weights, axis=0) 102 | ) 103 | 104 | return voxel_voting(grid_points, voxel_size, grid_point_weights) 105 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .image_tagger import ImageTagger 2 | from .ram_tagger import RAMTagger 3 | from .ram_plus_tagger import RAMPlusTagger 4 | 5 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/models/image_tagger.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, List, Tuple 3 | from PIL 
import Image 4 | 5 | 6 | class ImageTagger(ABC): 7 | """ 8 | Abstract base class for all image taggers 9 | """ 10 | 11 | @abstractmethod 12 | def tag_image(self, image: Image.Image) -> Tuple[List, List]: 13 | """ 14 | Forwards the tagging model and returns the tags and confidences 15 | """ 16 | raise NotImplementedError 17 | 18 | @abstractmethod 19 | def filtered_tag_image( 20 | self, image: Image.Image, params: Dict[str, Any] 21 | ) -> Tuple[List, List]: 22 | """ 23 | Forwards the model and applies additional inference filtering to remove unlikely tags 24 | """ 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/models/ram_plus_tagger.py: -------------------------------------------------------------------------------- 1 | from tag_mapping.models.ram_tagger import RAMTagger 2 | 3 | from ram import get_transform 4 | from ram.models import ram_plus 5 | 6 | 7 | class RAMPlusTagger(RAMTagger): 8 | def _init_model(self, config) -> None: 9 | # override RAM model to load RAM++ model 10 | self._device = config["device"] 11 | 12 | self._model = ram_plus( 13 | pretrained=config["ram_pretrained_path"], 14 | image_size=config["ram_image_size"], 15 | vit=config["vit"], 16 | ) 17 | self._model.to(self._device) 18 | self._model.eval() 19 | 20 | self._transform = get_transform(config["ram_image_size"]) 21 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/models/ram_tagger.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from typing import Dict, List, Tuple, Any 4 | from PIL import Image 5 | 6 | from ram import get_transform, inference_ram 7 | from ram.models import ram 8 | 9 | from tag_mapping.models.image_tagger import ImageTagger 10 | from tag_mapping.filtering import compute_unlikely_tags_center_crop_ensemble 11 | 12 | 13 | class RAMTagger(ImageTagger): 14 | """ 15 | Wrapper for the Recognize-Anything tagging model 16 | """ 17 | 18 | def __init__(self, config) -> None: 19 | self._init_model(config) 20 | 21 | def _init_model(self, config) -> None: 22 | self._device = config["device"] 23 | 24 | self._model = ram( 25 | pretrained=config["ram_pretrained_path"], 26 | image_size=config["ram_image_size"], 27 | vit=config["vit"], 28 | ) 29 | self._model.to(self._device) 30 | self._model.eval() 31 | 32 | self._transform = get_transform(config["ram_image_size"]) 33 | 34 | def tag_image(self, image: Image.Image) -> Tuple[List, List]: 35 | """ 36 | Forwards the tagging model and returns the tags and confidences 37 | """ 38 | tags, confidences = inference_ram( 39 | self._transform(image).unsqueeze(0).to(self._device), self._model 40 | ) 41 | tags = tags.split(" | ") 42 | return {"tags": tags, "confidences": confidences} 43 | 44 | def override_class_thresholds(self, thresholds: Dict[str, float]) -> None: 45 | for cls, t in thresholds.items(): 46 | try: 47 | self._model.override_class_threshold(cls, t) 48 | except Exception as e: 49 | print("Couldn't override threshold for {} because: {}".format(cls, e)) 50 | 51 | def filtered_tag_image( 52 | self, image: Image.Image, params: Dict[str, Any] 53 | ) -> Tuple[List, List]: 54 | """ 55 | Forwards the model and applies additional inference filtering to remove unlikely tags 56 | """ 57 | out = self.tag_image(image) 58 | tags, confidences = (out["tags"], out["confidences"]) 59 | 60 | # filter tags 61 | unlikely_tags = compute_unlikely_tags_center_crop_ensemble( 62 
| image, 63 | tags, 64 | params["crop_border_proportions"], 65 | self, 66 | ) 67 | 68 | filtered_tags = [tag for tag in tags if tag not in unlikely_tags] 69 | filtered_tag_confidences = [ 70 | conf for tag, conf in zip(tags, confidences) if tag not in unlikely_tags 71 | ] 72 | return filtered_tags, filtered_tag_confidences 73 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/pose_graph.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import networkx as nx 4 | 5 | 6 | class PoseGraph: 7 | """ 8 | Class implementing methods for working with a pose graph. 9 | 10 | Currently, this class is a wrapper providing additional functionality on top 11 | of a pose graph generated elsewhere. 12 | It does not support generating pose graphs or modification of the stored pose graph! 13 | """ 14 | 15 | def __init__(self, points, edges): 16 | """ 17 | Args: 18 | points: np.ndarray (N, m) of m dimensional points 19 | edges: np.ndarray (E, 2) of indices into points 20 | """ 21 | self._nodes = points 22 | 23 | self._graph = nx.Graph() 24 | self._graph.add_nodes_from(np.arange(points.shape[0])) 25 | 26 | edge_lengths = np.linalg.norm(points[edges[:, 0]] - points[edges[:, 1]], axis=1) 27 | for (i, j), length in zip(edges, edge_lengths): 28 | self._graph.add_edge(i, j, length=length) 29 | 30 | # make sure that the graph is connected 31 | if not nx.is_connected(self._graph): 32 | raise ValueError( 33 | "Cannot create pose graph with arguments representing a disconnected graph" 34 | ) 35 | 36 | def closest_node_idx(self, point): 37 | """ 38 | Args: 39 | point: np.ndarray (m,) of a m dimensional point 40 | 41 | Returns: 42 | index of the closest node 43 | """ 44 | dists = np.linalg.norm(self._nodes - point, axis=1) 45 | return np.argmin(dists) 46 | 47 | def closest_node(self, point): 48 | """ 49 | Args: 50 | point: np.ndarray (m,) of a m dimensional point 51 | 52 | Returns: 53 | coordinates of the closest node 54 | """ 55 | return self._nodes[self.closest_node_idx(point)] 56 | 57 | def shortest_path(self, start_point, end_point): 58 | """ 59 | Args: 60 | start_point: np.ndarray (m,) of a m dimensional point 61 | end_point: np.ndarray (m,) of a m dimensional point 62 | 63 | Returns: 64 | list of indices of nodes on the shortest path 65 | """ 66 | start_idx = self.closest_node_idx(start_point) 67 | end_idx = self.closest_node_idx(end_point) 68 | 69 | return nx.shortest_path(self._graph, start_idx, end_idx, weight="length") 70 | 71 | def shortest_path_length(self, start_point, end_point): 72 | """ 73 | Args: 74 | start_point: np.ndarray (m,) of a m dimensional point 75 | end_point: np.ndarray (m,) of a m dimensional point 76 | 77 | Returns: 78 | length of the shortest path 79 | """ 80 | start_idx = self.closest_node_idx(start_point) 81 | end_idx = self.closest_node_idx(end_point) 82 | 83 | return nx.shortest_path_length(self._graph, start_idx, end_idx, weight="length") 84 | 85 | def save(self, save_dir): 86 | """ 87 | Args: 88 | save_dir: path to save the graph to 89 | """ 90 | if not os.path.exists(save_dir): 91 | os.makedirs(save_dir) 92 | np.save(os.path.join(save_dir, "edges.npy"), np.array(self._graph.edges)) 93 | np.save(os.path.join(save_dir, "node_coords.npy"), self._nodes) 94 | 95 | @classmethod 96 | def load(cls, load_dir): 97 | """ 98 | Args: 99 | load_dir: path to load the graph from 100 | 101 | Returns: 102 | PoseGraph object 103 | """ 104 | edges = 
np.load(os.path.join(load_dir, "edges.npy")) 105 | nodes = np.load(os.path.join(load_dir, "node_coords.npy")) 106 | return cls(nodes, edges) 107 | 108 | @property 109 | def nodes(self): 110 | return self._nodes.copy() 111 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/tag_map.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import uuid 5 | 6 | from dataclasses import dataclass 7 | from typing import Dict, Any 8 | 9 | 10 | @dataclass(frozen=True) 11 | class TagMapEntry: 12 | pose: np.ndarray 13 | uuid: uuid.UUID 14 | extras: Dict[str, Any] = None 15 | 16 | 17 | @dataclass(frozen=True) 18 | class TagDBEntry: 19 | entry_uuid: uuid.UUID 20 | extras: Dict[str, Any] = None 21 | 22 | 23 | class TagMap: 24 | def __init__(self, metadata: Dict[str, Any]): 25 | self._metadata = metadata 26 | self._tags_db = {} 27 | self._entry_db = {} 28 | 29 | def add_entry(self, entry: TagMapEntry): 30 | """ 31 | Add entry (i.e. observed frame) to the tag map. 32 | 33 | NOTE: This method raises a ValueError if the entry's uuid is 34 | already in the entry database. 35 | """ 36 | if entry.uuid in self._entry_db: 37 | raise ValueError("uuid already in the entry database") 38 | self._entry_db[entry.uuid] = entry 39 | 40 | def add_tag(self, tag: str, entry_uuid: uuid.UUID, extras: Dict[str, Any] = None): 41 | """ 42 | Associates a tag with an entry in the tag map. 43 | 44 | Args: 45 | tag: The tag to associate with the entry. 46 | entry_uuid: The uuid of the entry to associate with the tag. 47 | extras: Any extra data to associate with the tag. 48 | """ 49 | if entry_uuid not in self._entry_db: 50 | raise ValueError("uuid not in the entry database") 51 | 52 | tag_db_entry = TagDBEntry(entry_uuid, extras) 53 | if tag not in self._tags_db: 54 | self._tags_db[tag] = [tag_db_entry] 55 | else: 56 | self._tags_db[tag].append(tag_db_entry) 57 | 58 | def add_extra(self, extra_name: str, extra_data: Any, overwrite: bool = False): 59 | """ 60 | Add extra data to the tag map (e.g. pose graph). 61 | 62 | Args: 63 | extra_name: The name of the extra data. 64 | extra_data: The extra data. 65 | overwrite: Whether to overwrite the extra data if it already exists. 66 | """ 67 | if not hasattr(self, "_extras"): 68 | self._extras = {extra_name: extra_data} 69 | else: 70 | if extra_name in self._extras and not overwrite: 71 | raise ValueError( 72 | "Extra {} already stored in tag map, set overwrite=True to overwrite".format( 73 | extra_name 74 | ) 75 | ) 76 | else: 77 | self._extras[extra_name] = extra_data 78 | 79 | def query(self, tag: str, return_uuids: bool = False): 80 | """ 81 | Query the tag map for all entries associated with a tag. 82 | 83 | Returns: 84 | A list of TagMapEntry objects associated with the tag 85 | or None if the tag is not in the tag map. 
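
        Example (a hypothetical sketch; assumes the map was built elsewhere
        and that the tagger emitted the tag "chair"):

            entries = tag_map.query("chair")
            if entries is not None:
                entry_poses = [entry.pose for entry in entries]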
86 | """ 87 | if tag not in self._tags_db: 88 | print("{} not in the tag map".format(tag)) 89 | return None 90 | 91 | entry_uuids = [e.entry_uuid for e in self._tags_db[tag]] 92 | entries = [self._entry_db[id] for id in entry_uuids] 93 | tag_extras = [e.extras for e in self._tags_db[tag]] 94 | 95 | # pack tag extras into entry.extras 96 | for entry, tag_extra in zip(entries, tag_extras): 97 | if entry.extras is None: 98 | entry.extras = {} 99 | 100 | if tag_extra is not None: 101 | for key, value in tag_extra.items(): 102 | entry.extras[key] = value 103 | 104 | if return_uuids: 105 | return entries, entry_uuids 106 | else: 107 | return entries 108 | 109 | def save(self, save_path): 110 | save_dir = os.path.dirname(save_path) 111 | if not os.path.exists(save_dir): 112 | os.makedirs(save_dir) 113 | with open(save_path, "wb") as file: 114 | pickle.dump(self, file) 115 | 116 | @classmethod 117 | def load(cls, save_path): 118 | with open(save_path, "rb") as file: 119 | obj = pickle.load(file) 120 | if isinstance(obj, cls): 121 | return obj 122 | else: 123 | raise ValueError("Loaded object is not an instance of TagMap") 124 | 125 | @property 126 | def metadata(self): 127 | return self._metadata 128 | 129 | @property 130 | def unique_objects(self): 131 | return self._tags_db.keys() 132 | 133 | @property 134 | def num_entries(self): 135 | return len(self._entry_db) 136 | 137 | @property 138 | def num_tags(self): 139 | return len(self._tags_db) 140 | 141 | @property 142 | def extras(self): 143 | return self._extras 144 | 145 | def __contains__(self, tag: str): 146 | return tag in self.unique_objects 147 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .nearest_points_in_box import nearest_points_in_box 2 | from .collision_check import o3d_check_lines_collision 3 | from .get_box_corners import get_box_corners 4 | 5 | from .load_yaml_params import load_yaml_params 6 | 7 | from .line_mesh import LineMesh, box_to_linemesh -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/collision_check.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import open3d as o3d 3 | 4 | 5 | def o3d_check_lines_collision( 6 | rcs: o3d.t.geometry.RaycastingScene, 7 | lines_start: np.ndarray, 8 | lines_end: np.ndarray, 9 | ) -> np.ndarray: 10 | """ 11 | Uses open3d.t.geometry.RaycastingScene to check for collisions between lines and a mesh. 
12 | 13 | Args: 14 | rcs: open3d.t.geometry.RaycastingScene of the mesh 15 | lines_start: (N,3) array of line start points 16 | lines_end: (N,3) array of line end points 17 | 18 | Returns: 19 | (N,) boolean array of whether each line collides with the mesh 20 | """ 21 | edge_vectors = lines_end - lines_start 22 | edge_lengths = np.linalg.norm(edge_vectors, axis=1) 23 | 24 | # IMPORTANT: normalize ray direction vector 25 | ray_directions = edge_vectors / edge_lengths[:, np.newaxis] 26 | 27 | rays = np.concatenate([lines_start, ray_directions], axis=1) 28 | rays = o3d.core.Tensor(rays, dtype=o3d.core.Dtype.Float32) 29 | res = rcs.cast_rays(rays) 30 | 31 | collision_mask = np.asarray(res["t_hit"]) < edge_lengths 32 | 33 | return collision_mask 34 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/get_box_corners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from open3d.geometry import AxisAlignedBoundingBox, OrientedBoundingBox 4 | from tag_mapping.datasets.matterport import ( 5 | MatterportObjectBoundingBox, 6 | MatterportRegionBoundingBox, 7 | ) 8 | 9 | 10 | def get_box_corners(box) -> np.ndarray: 11 | """ 12 | Helper function that takes in a box of a supported type and 13 | outputs its corners as an array of shape (8, 3) in the following order: 14 | 15 | (4) +---------+. (5) 16 | | ` . | ` . 17 | | (0) +---+-----+ (1) 18 | | | | | 19 | (7) +-----+---+. (6)| 20 | ` . | ` . | 21 | (3) ` +---------+ (2) 22 | 23 | Args: 24 | box: a box of a supported type 25 | 26 | Returns: 27 | corners: (8, 3) array of corners 28 | """ 29 | if type(box) == AxisAlignedBoundingBox: 30 | min_bound = box.get_min_bound() 31 | max_bound = box.get_max_bound() 32 | corners = np.array( 33 | [ 34 | min_bound, 35 | [max_bound[0], min_bound[1], min_bound[2]], 36 | [max_bound[0], max_bound[1], min_bound[2]], 37 | [min_bound[0], max_bound[1], min_bound[2]], 38 | [min_bound[0], min_bound[1], max_bound[2]], 39 | [max_bound[0], min_bound[1], max_bound[2]], 40 | max_bound, 41 | [min_bound[0], max_bound[1], max_bound[2]], 42 | ] 43 | ) 44 | 45 | elif type(box) == OrientedBoundingBox: 46 | raise NotImplementedError 47 | 48 | elif type(box) == MatterportObjectBoundingBox: 49 | corners = box.corners() 50 | 51 | elif type(box) == MatterportRegionBoundingBox: 52 | corners = box.corners() 53 | 54 | else: 55 | raise ValueError(f"Unsupported box type {type(box)}") 56 | 57 | return corners 58 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/line_mesh.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import open3d as o3d 3 | 4 | """ 5 | This file contains a workaround LineMesh class for Open3D which is a lineset with cylinders instead of lines. 6 | This is useful for visualizing lines of different thicknesses in Open3D. 
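
A minimal usage sketch (hypothetical; `points` is an Nx3 array and `lines` an
Ex2 array of endpoint indices into `points`):

    line_mesh = LineMesh(points, lines, colors=[1, 0, 0], radius=0.02)
    o3d.visualization.draw_geometries(line_mesh.cylinder_segments)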
7 | 8 | From: 9 | https://github.com/isl-org/Open3D/pull/738#issuecomment-564785941 10 | https://github.com/isl-org/Open3D/pull/738#issuecomment-697027818 11 | """ 12 | 13 | 14 | def align_vector_to_another(a=np.array([0, 0, 1]), b=np.array([1, 0, 0])): 15 | """ 16 | Aligns vector a to vector b with axis angle rotation 17 | """ 18 | if np.array_equal(a, b): 19 | return None, None 20 | axis_ = np.cross(a, b) 21 | axis_ = axis_ / np.linalg.norm(axis_) 22 | angle = np.arccos(np.dot(a, b)) 23 | 24 | return axis_, angle 25 | 26 | 27 | def normalized(a, axis=-1, order=2): 28 | """Normalizes a numpy array of points""" 29 | l2 = np.atleast_1d(np.linalg.norm(a, order, axis)) 30 | l2[l2 == 0] = 1 31 | return a / np.expand_dims(l2, axis), l2 32 | 33 | 34 | class LineMesh(object): 35 | def __init__(self, points, lines, colors=[0, 1, 0], radius=0.05): 36 | """Creates a line represented as sequence of cylinder triangular meshes 37 | 38 | Arguments: 39 | points {ndarray} -- Numpy array of ponts Nx3. 40 | 41 | Keyword Arguments: 42 | colors {list} -- list of colors, or single color of the line (default: {[0, 1, 0]}) 43 | radius {float} -- radius of cylinder (default: {0.15}) 44 | """ 45 | self.points = np.array(points) 46 | self.lines = np.array(lines) 47 | self.colors = np.array(colors) 48 | self.radius = radius 49 | self.cylinder_segments = [] 50 | 51 | self.create_line_mesh() 52 | 53 | def create_line_mesh(self): 54 | first_points = self.points[self.lines[:, 0], :] 55 | second_points = self.points[self.lines[:, 1], :] 56 | line_segments = second_points - first_points 57 | line_segments_unit, line_lengths = normalized(line_segments) 58 | 59 | z_axis = np.array([0, 0, 1]) 60 | # Create triangular mesh cylinder segments of line 61 | for i in range(line_segments_unit.shape[0]): 62 | line_segment = line_segments_unit[i, :] 63 | line_length = line_lengths[i] 64 | # get axis angle rotation to allign cylinder with line segment 65 | axis, angle = align_vector_to_another(z_axis, line_segment) 66 | # Get translation vector 67 | translation = first_points[i, :] + line_segment * line_length * 0.5 68 | # create cylinder and apply transformations 69 | cylinder_segment = o3d.geometry.TriangleMesh.create_cylinder( 70 | self.radius, line_length 71 | ) 72 | cylinder_segment = cylinder_segment.translate(translation, relative=False) 73 | if axis is not None: 74 | axis_a = axis * angle 75 | cylinder_segment = cylinder_segment.rotate( 76 | R=o3d.geometry.get_rotation_matrix_from_axis_angle(axis_a), 77 | center=cylinder_segment.get_center(), 78 | ) 79 | # color cylinder 80 | color = self.colors if self.colors.ndim == 1 else self.colors[i, :] 81 | cylinder_segment.paint_uniform_color(color) 82 | 83 | self.cylinder_segments.append(cylinder_segment) 84 | 85 | def add_line(self, vis): 86 | """Adds this line to the visualizer""" 87 | for cylinder in self.cylinder_segments: 88 | vis.add_geometry(cylinder) 89 | 90 | def remove_line(self, vis): 91 | """Removes this line from the visualizer""" 92 | for cylinder in self.cylinder_segments: 93 | vis.remove_geometry(cylinder) 94 | 95 | 96 | ########################################################################################## 97 | 98 | from tag_mapping.utils import get_box_corners 99 | 100 | 101 | def box_to_linemesh(box, color=(0, 1, 0), radius=0.02): 102 | """ 103 | Get a LineMesh from a box type. 
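
    Example (hypothetical): for an open3d.geometry.AxisAlignedBoundingBox `aabb`,
    `box_to_linemesh(aabb, color=(1, 0, 0), radius=0.02)` returns a LineMesh whose
    cylinder segments can be added to a visualizer with `add_line(vis)`.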
104 | 105 | The box type must be supported by get_box_corners() as we assume that 106 | get_box_corners() will return the corners in the expected order 107 | """ 108 | box_points = get_box_corners(box) 109 | 110 | box_lines = np.array( 111 | [ 112 | [0, 1], 113 | [0, 3], 114 | [1, 2], 115 | [2, 3], 116 | [0, 4], 117 | [1, 5], 118 | [2, 6], 119 | [3, 7], 120 | [4, 5], 121 | [4, 7], 122 | [5, 6], 123 | [6, 7], 124 | ] 125 | ) 126 | 127 | return LineMesh( 128 | points=box_points, 129 | lines=box_lines, 130 | colors=color, 131 | radius=radius, 132 | ) 133 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/load_yaml_params.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | 4 | def load_yaml_params(params_path): 5 | """ 6 | Modified yaml safe loading to allow for yaml to load python lambdas given by !python/lambda 7 | """ 8 | 9 | def yaml_lambda_constructor(loader, node): 10 | value = loader.construct_scalar(node) 11 | return eval(value) 12 | 13 | yaml.SafeLoader.add_constructor("!python/lambda", yaml_lambda_constructor) 14 | 15 | with open(params_path, "r") as f: 16 | params = yaml.safe_load(f) 17 | return params 18 | -------------------------------------------------------------------------------- /tag_mapping/tag_mapping/utils/nearest_points_in_box.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cvxpy as cp 3 | 4 | from typing import Tuple 5 | 6 | 7 | def nearest_points_in_box( 8 | box_corners: np.ndarray, 9 | box_center: np.ndarray, 10 | points: np.ndarray, 11 | solve_kwargs=None, 12 | ) -> np.ndarray: 13 | """ 14 | Computes the points that are closests to the given points, bounded 15 | within the box by solving a QP. 16 | 17 | Args: 18 | box_corners: (8,3) array of box corners in order defined in _box_hrep() 19 | box_center: (3,) array of center coordinate of the box 20 | points: (N,3) array of the given points 21 | solve_kwargs: Keyword arguments to pass to cp.Problem.solve(), 22 | e.g. the solver, verbose, etc. 23 | 24 | Returns: 25 | (N,3) array of the closest points 26 | """ 27 | if solve_kwargs is None: 28 | solve_kwargs = {"verbose": False, "solver": cp.ECOS} 29 | 30 | box_A, box_b = _box_hrep(box_corners, box_center) 31 | 32 | N = points.shape[0] 33 | X = cp.Variable((3, N)) 34 | objective = cp.Minimize(cp.sum_squares(X - points.T)) 35 | 36 | # NOTE: use reshape to allow broadcasting of the inequality 37 | constraints = [box_A @ X <= (box_b + box_A @ box_center).reshape(-1, 1)] 38 | 39 | prob = cp.Problem(objective, constraints) 40 | prob.solve(**solve_kwargs) 41 | if prob.status not in ["optimal", "optimal_inaccurate"]: 42 | raise RuntimeError("Closest point QP did not reach optimal solution") 43 | 44 | return X.value.T 45 | 46 | 47 | def _box_hrep(corners: np.ndarray, center: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 48 | """ 49 | Computes H-rep arrays A and b such that a point p is in 50 | the box if A(p - center) <= b 51 | 52 | Args: 53 | corners: (8, 3) corners in the relative order outlined as follows: 54 | (4) +---------+. (5) 55 | | ` . | ` . 56 | | (0) +---+-----+ (1) 57 | | | | | 58 | (7) +-----+---+. (6)| 59 | ` . | ` . 
| 60 | (3) ` +---------+ (2) 61 | 62 | center: (3,) center of the box 63 | 64 | Returns: 65 | A: (6, 3) 66 | b: (6,) 67 | """ 68 | 69 | def face_hrep(c, vo, vx, vy): 70 | ex = vx - vo 71 | ey = vy - vo 72 | n = np.cross(ex, ey) 73 | n /= np.linalg.norm(n) 74 | d = np.dot(n, vo - c) 75 | return n, d 76 | 77 | face_hreps = [ 78 | face_hrep(center, corners[1], corners[0], corners[2]), 79 | face_hrep(center, corners[4], corners[5], corners[7]), 80 | face_hrep(center, corners[2], corners[3], corners[6]), 81 | face_hrep(center, corners[3], corners[0], corners[7]), 82 | face_hrep(center, corners[0], corners[1], corners[4]), 83 | face_hrep(center, corners[1], corners[2], corners[5]), 84 | ] 85 | 86 | A = np.concatenate([n.reshape(1, -1) for n, _ in face_hreps], axis=0) 87 | b = np.array([d for _, d in face_hreps]) 88 | 89 | return A, b 90 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | 190 | Copyright (c) 2022 OPPO 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | https://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include ram/configs/*.json 2 | include ram/configs/swin/*.json 3 | include ram/data/*.txt 4 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/datasets/openimages_common_214/imgs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/datasets/openimages_common_214/imgs/.gitkeep -------------------------------------------------------------------------------- /thirdparty/recognize-anything/datasets/openimages_common_214/openimages_common_214_ram_taglist.txt: -------------------------------------------------------------------------------- 1 | accident 2 | accordion 3 | plane 4 | airport 5 | antelope 6 | apple 7 | art gallery 8 | eggplant 9 | auditorium 10 | autumn 11 | baboon 12 | backpack 13 | bakery 14 | bamboo 15 | banana 16 | barbecue 17 | bed 18 | bedroom 19 | clock 20 | bicycle 21 | bikini 22 | birthday cake 23 | blackberry 24 | blueberry 25 | pig 26 | bookcase 27 | bridge 28 | broccoli 29 | bus 30 | butterfly 31 | calculator 32 | calendar 33 | camping 34 | candle 35 | candy 36 | cannon 37 | canyon 38 | car 39 | carousel 40 | cat 41 | cave 42 | ceiling 43 | cheese 44 | cheetah 45 | chef 46 | chicken 47 | christmas 48 | christmas tree 49 | clover 50 | coral 51 | corn 52 | courtyard 53 | crab 54 | lobster 55 | crocodile 56 | crosswalk 57 | crow 58 | cucumber 59 | cup 60 | currency 61 | dachshund 62 | deer 63 | desert 64 | die 65 | dinosaur 66 | dog 67 | dolphin 68 | doodle 69 | dragonfly 70 | drum 71 | duck 72 | dumbbell 73 | easter egg 74 | egg 75 | elephant 76 | faucet 77 | ferris wheel 78 | fire 79 | fireman 80 | firework 81 | flamingo 82 | flower 83 | football 84 | fountain 85 | fox 86 | fridge 87 | frog 88 | ham 89 | gas stove 90 | giraffe 91 | glacier 92 | glove 93 | goat 94 | goose 95 | gorilla 96 | grape 97 | guitar 98 | gull 99 | gym 100 | halloween 101 | hamburger 102 | hamster 103 | handbag 104 | hedgehog 105 | helicopter 106 | horse 107 | hummingbird 108 | jellyfish 109 | kangaroo 110 | kimono 111 | kite 112 | ladybird 113 | laptop 114 | leg 115 | mailbox 116 | library 117 | lightning 118 | lily 119 | lion 120 | lizard 121 | luggage 122 | mannequin 123 | map 124 | mask 125 | mattress 126 | microphone 127 | microwave 128 | monkey 129 | moon 130 | mosque 131 | mouse 132 | mushroom 133 | nebula 134 | sea 135 | ostrich 136 | palm tree 137 | paper 138 | pasta 139 | patient 140 | pavilion 141 | pear 142 | pebble 143 | penguin 144 | pet 145 | piano 146 | picture frame 147 | pine 148 | pineapple 149 | pizza 150 | police car 151 | pomegranate 152 | poodle 153 | popcorn 154 | stamp 155 | power station 156 | printer 157 | pumpkin 158 | raccoon 159 | rainbow 160 | rat 161 | restroom 162 | ring 163 | run 164 | salad 165 | sandwich 166 | sausage 167 | shark 168 | sheet music 169 | shrine 170 | snowboard 171 | snake 172 | sparrow 173 | squirrel 174 | stage 175 | starfish 176 | statue 177 | steering wheel 178 | stream 179 | street art 180 | street light 181 | submarine 182 | suite 183 | surfboard 184 | sushi 185 | swan 186 | tattoo 187 | teddy 188 | tennis court 189 | tennis racket 190 | tiger 191 | toast 192 | toilet bowl 193 | toy 194 | tractor 195 | train 
196 | trampoline 197 | treadmill 198 | truck 199 | tunnel 200 | turkey 201 | vending machine 202 | waffle 203 | walnut 204 | washing machine 205 | water buffalo 206 | waterfall 207 | watermelon 208 | wheat 209 | wheelchair 210 | windmill 211 | winter 212 | wolf 213 | woodpecker 214 | zebra 215 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/datasets/openimages_common_214/openimages_common_214_tag2text_tagidlist.txt: -------------------------------------------------------------------------------- 1 | 3 2 | 8 3 | 16 4 | 19 5 | 21 6 | 33 7 | 44 8 | 50 9 | 58 10 | 61 11 | 71 12 | 77 13 | 84 14 | 96 15 | 117 16 | 139 17 | 142 18 | 147 19 | 180 20 | 200 21 | 202 22 | 206 23 | 244 24 | 267 25 | 317 26 | 321 27 | 347 28 | 361 29 | 380 30 | 387 31 | 398 32 | 407 33 | 471 34 | 486 35 | 489 36 | 509 37 | 514 38 | 530 39 | 568 40 | 590 41 | 595 42 | 612 43 | 622 44 | 626 45 | 654 46 | 658 47 | 664 48 | 684 49 | 699 50 | 704 51 | 717 52 | 720 53 | 727 54 | 760 55 | 773 56 | 786 57 | 787 58 | 812 59 | 814 60 | 817 61 | 843 62 | 855 63 | 856 64 | 907 65 | 950 66 | 955 67 | 957 68 | 1023 69 | 1042 70 | 1056 71 | 1066 72 | 1091 73 | 1094 74 | 1108 75 | 1141 76 | 1148 77 | 1152 78 | 1168 79 | 1174 80 | 1187 81 | 1231 82 | 1235 83 | 1246 84 | 1276 85 | 1277 86 | 1305 87 | 1308 88 | 1344 89 | 1359 90 | 1362 91 | 1393 92 | 1394 93 | 1410 94 | 1411 95 | 1468 96 | 1504 97 | 1524 98 | 1536 99 | 1540 100 | 1542 101 | 1546 102 | 1553 103 | 1572 104 | 1574 105 | 1606 106 | 1610 107 | 1615 108 | 1655 109 | 1672 110 | 1680 111 | 1682 112 | 1687 113 | 1691 114 | 1692 115 | 1711 116 | 1712 117 | 1713 118 | 1719 119 | 1727 120 | 1733 121 | 1761 122 | 1770 123 | 1782 124 | 1784 125 | 1786 126 | 1803 127 | 1812 128 | 1816 129 | 1820 130 | 1829 131 | 1831 132 | 1841 133 | 1845 134 | 1878 135 | 1882 136 | 1931 137 | 1940 138 | 1944 139 | 1947 140 | 1974 141 | 1975 142 | 1977 143 | 2009 144 | 2031 145 | 2035 146 | 2052 147 | 2065 148 | 2110 149 | 2113 150 | 2138 151 | 2149 152 | 2154 153 | 2157 154 | 2174 155 | 2178 156 | 2184 157 | 2185 158 | 2202 159 | 2222 160 | 2233 161 | 2291 162 | 2301 163 | 2302 164 | 2317 165 | 2320 166 | 2351 167 | 2354 168 | 2373 169 | 2383 170 | 2393 171 | 2403 172 | 2413 173 | 2415 174 | 2417 175 | 2423 176 | 2449 177 | 2454 178 | 2455 179 | 2472 180 | 2494 181 | 2495 182 | 2528 183 | 2541 184 | 2543 185 | 2553 186 | 2563 187 | 2589 188 | 2603 189 | 2654 190 | 2656 191 | 2658 192 | 2676 193 | 2690 194 | 2693 195 | 2700 196 | 2708 197 | 2720 198 | 2721 199 | 2729 200 | 2732 201 | 2734 202 | 2756 203 | 2786 204 | 2792 205 | 2801 206 | 2821 207 | 2851 208 | 2887 209 | 2906 210 | 2909 211 | 2924 212 | 2929 213 | 2966 214 | 2980 215 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/datasets/openimages_rare_200/imgs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/datasets/openimages_rare_200/imgs/.gitkeep -------------------------------------------------------------------------------- /thirdparty/recognize-anything/datasets/openimages_rare_200/openimages_rare_200_ram_taglist.txt: -------------------------------------------------------------------------------- 1 | Aerial photography 2 | Aircraft engine 3 | Ale 4 | Aloe 5 | Amphibian 6 | Angling 7 | Anole 8 | Antique car 9 | Arcade game 10 | Arthropod 11 | 
Assault rifle 12 | Athletic shoe 13 | Auto racing 14 | Backlighting 15 | Bagpipes 16 | Ball game 17 | Barbecue chicken 18 | Barechested 19 | Barquentine 20 | Beef tenderloin 21 | Billiard room 22 | Billiards 23 | Bird of prey 24 | Black swan 25 | Black-and-white 26 | Blond 27 | Boating 28 | Bonbon 29 | Bottled water 30 | Bouldering 31 | Bovine 32 | Bratwurst 33 | Breadboard 34 | Briefs 35 | Brisket 36 | Brochette 37 | Calabaza 38 | Camera operator 39 | Canola 40 | Childbirth 41 | Chordophone 42 | Church bell 43 | Classical sculpture 44 | Close-up 45 | Cobblestone 46 | Coca-cola 47 | Combat sport 48 | Comics 49 | Compact car 50 | Computer speaker 51 | Cookies and crackers 52 | Coral reef fish 53 | Corn on the cob 54 | Cosmetics 55 | Crocodilia 56 | Digital camera 57 | Dishware 58 | Divemaster 59 | Dobermann 60 | Dog walking 61 | Domestic rabbit 62 | Domestic short-haired cat 63 | Double-decker bus 64 | Drums 65 | Electric guitar 66 | Electric piano 67 | Electronic instrument 68 | Equestrianism 69 | Equitation 70 | Erinaceidae 71 | Extreme sport 72 | Falafel 73 | Figure skating 74 | Filling station 75 | Fire apparatus 76 | Firearm 77 | Flatbread 78 | Floristry 79 | Forklift truck 80 | Freight transport 81 | Fried food 82 | Fried noodles 83 | Frigate 84 | Frozen yogurt 85 | Frying 86 | Full moon 87 | Galleon 88 | Glacial landform 89 | Gliding 90 | Go-kart 91 | Goats 92 | Grappling 93 | Great white shark 94 | Gumbo 95 | Gun turret 96 | Hair coloring 97 | Halter 98 | Headphones 99 | Heavy cruiser 100 | Herding 101 | High-speed rail 102 | Holding hands 103 | Horse and buggy 104 | Horse racing 105 | Hound 106 | Hunting knife 107 | Hurdling 108 | Inflatable 109 | Jackfruit 110 | Jeans 111 | Jiaozi 112 | Junk food 113 | Khinkali 114 | Kitesurfing 115 | Lawn game 116 | Leaf vegetable 117 | Lechon 118 | Lifebuoy 119 | Locust 120 | Lumpia 121 | Luxury vehicle 122 | Machine tool 123 | Medical imaging 124 | Melee weapon 125 | Microcontroller 126 | Middle ages 127 | Military person 128 | Military vehicle 129 | Milky way 130 | Miniature Poodle 131 | Modern dance 132 | Molluscs 133 | Monoplane 134 | Motorcycling 135 | Musical theatre 136 | Narcissus 137 | Nest box 138 | Newsagent's shop 139 | Nile crocodile 140 | Nordic skiing 141 | Nuclear power plant 142 | Orator 143 | Outdoor shoe 144 | Parachuting 145 | Pasta salad 146 | Peafowl 147 | Pelmeni 148 | Perching bird 149 | Performance car 150 | Personal water craft 151 | Pit bull 152 | Plant stem 153 | Pork chop 154 | Portrait photography 155 | Primate 156 | Procyonidae 157 | Prosciutto 158 | Public speaking 159 | Racewalking 160 | Ramen 161 | Rear-view mirror 162 | Residential area 163 | Ribs 164 | Rice ball 165 | Road cycling 166 | Roller skating 167 | Roman temple 168 | Rowing 169 | Rural area 170 | Sailboat racing 171 | Scaled reptile 172 | Scuba diving 173 | Senior citizen 174 | Shallot 175 | Shinto shrine 176 | Shooting range 177 | Siberian husky 178 | Sledding 179 | Soba 180 | Solar energy 181 | Sport climbing 182 | Sport utility vehicle 183 | Steamed rice 184 | Stemware 185 | Sumo 186 | Surfing Equipment 187 | Team sport 188 | Touring car 189 | Toy block 190 | Trampolining 191 | Underwater diving 192 | Vegetarian food 193 | Wallaby 194 | Water polo 195 | Watercolor paint 196 | Whiskers 197 | Wind wave 198 | Woodwind instrument 199 | Yakitori 200 | Zeppelin 201 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/1641173_2291260800.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/1641173_2291260800.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/demo/demo1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/demo/demo1.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/demo/demo2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/demo/demo2.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/demo/demo3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/demo/demo3.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/demo/demo4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/demo/demo4.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/experiment_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/experiment_comparison.png -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/localization_and_recognition.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/localization_and_recognition.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/openset_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/openset_example.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/ram_grounded_sam.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/ram_grounded_sam.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/tag2text_framework.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/tag2text_framework.png -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/tag2text_grounded_sam.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/tag2text_grounded_sam.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/images/tagging_results.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leggedrobotics/tagmap/ded3eb728f645e6ba7756e559c0bebdd777d3958/thirdparty/recognize-anything/images/tagging_results.jpg -------------------------------------------------------------------------------- /thirdparty/recognize-anything/inference_ram.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Recognize Anything Model (RAM) 3 | * Written by Xinyu Huang 4 | ''' 5 | import argparse 6 | import numpy as np 7 | import random 8 | import time 9 | 10 | import torch 11 | 12 | from PIL import Image 13 | from ram.models import ram 14 | from ram import inference_ram as inference 15 | from ram import get_transform 16 | 17 | import matplotlib.pyplot as plt 18 | 19 | parser = argparse.ArgumentParser( 20 | description='RAM inference for tagging') 21 | parser.add_argument('--image', 22 | metavar='DIR', 23 | help='path to dataset', 24 | default='images/1641173_2291260800.jpg') 25 | parser.add_argument('--pretrained', 26 | metavar='DIR', 27 | help='path to pretrained model', 28 | default='pretrained/ram_swin_large_14m.pth') 29 | parser.add_argument('--image-size', 30 | default=384, 31 | type=int, 32 | metavar='N', 33 | help='input image size (default: 384)') 34 | 35 | 36 | if __name__ == "__main__": 37 | 38 | args = parser.parse_args() 39 | 40 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 41 | 42 | transform = get_transform(image_size=args.image_size) 43 | 44 | #######load model 45 | model = ram(pretrained=args.pretrained, 46 | image_size=args.image_size, 47 | vit='swin_l') 48 | model.eval() 49 | 50 | model = model.to(device) 51 | 52 | image = transform(Image.open(args.image)).unsqueeze(0).to(device) 53 | 54 | print('image shape: ', image.shape) 55 | plt.imshow(image.squeeze().permute(1,2,0).cpu().numpy()) 56 | plt.show() 57 | 58 | start_inference_time = time.time() 59 | res = inference(image, model) 60 | print('Inference time: ', time.time() - start_inference_time) 61 | 62 | print("Image Tags: ", res[0]) 63 | print("Confidence: ", " | ".join(["{:.3f}".format(conf) for conf in res[1]])) 64 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/inference_ram_combined.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Recognize Anything Model (RAM) inference on seen AND unseen classes 3 | ''' 4 | import argparse 5 | import numpy as np 6 | import random 7 | 8 | import torch 9 | 10 | from PIL import Image 11 | from ram.models import ram 12 | from ram import inference_ram_openset as inference 13 | from ram import get_transform 14 | 15 | from ram.utils import build_openset_label_embedding 16 | from torch import nn 17 | 18 | parser = 
argparse.ArgumentParser( 19 | description='RAM inference for tagging') 20 | parser.add_argument('--image', 21 | metavar='DIR', 22 | help='path to dataset', 23 | default='images/openset_example.jpg') 24 | parser.add_argument('--pretrained', 25 | metavar='DIR', 26 | help='path to pretrained model', 27 | default='pretrained/ram_swin_large_14m.pth') 28 | parser.add_argument('--image-size', 29 | default=384, 30 | type=int, 31 | metavar='N', 32 | help='input image size (default: 384)') 33 | 34 | 35 | if __name__ == "__main__": 36 | 37 | args = parser.parse_args() 38 | 39 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 40 | 41 | transform = get_transform(image_size=args.image_size) 42 | 43 | #######load model 44 | model = ram(pretrained=args.pretrained, 45 | image_size=args.image_size, 46 | vit='swin_l') 47 | 48 | model.eval() 49 | 50 | model = model.to(device) 51 | 52 | #######set openset interference 53 | openset_label_embedding, openset_categories = build_openset_label_embedding() 54 | 55 | model.tag_list = np.concatenate( 56 | (model.tag_list, np.array(openset_categories))) 57 | 58 | model.label_embed = nn.Parameter(torch.cat( 59 | (model.label_embed, openset_label_embedding.float()))) 60 | 61 | model.num_class = len(model.tag_list) 62 | 63 | # the threshold for unseen categories is often lower 64 | openset_class_threshold = torch.ones(len(openset_categories)) * 0.5 65 | model.class_threshold = torch.cat( 66 | (model.class_threshold, openset_class_threshold)) 67 | ####### 68 | 69 | image = transform(Image.open(args.image)).unsqueeze(0).to(device) 70 | 71 | res = inference(image, model) 72 | print("Image Tags: ", res) 73 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/inference_ram_openset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Recognize Anything Model (RAM) inference on unseen classes 3 | * Written by Xinyu Huang 4 | ''' 5 | import argparse 6 | import numpy as np 7 | import random 8 | 9 | import torch 10 | 11 | from PIL import Image 12 | from ram.models import ram 13 | from ram import inference_ram_openset as inference 14 | from ram import get_transform 15 | 16 | from ram.utils import build_openset_label_embedding 17 | from torch import nn 18 | 19 | parser = argparse.ArgumentParser( 20 | description='RAM inference for tagging') 21 | parser.add_argument('--image', 22 | metavar='DIR', 23 | help='path to dataset', 24 | default='images/openset_example.jpg') 25 | parser.add_argument('--pretrained', 26 | metavar='DIR', 27 | help='path to pretrained model', 28 | default='pretrained/ram_swin_large_14m.pth') 29 | parser.add_argument('--image-size', 30 | default=384, 31 | type=int, 32 | metavar='N', 33 | help='input image size (default: 384)') 34 | 35 | 36 | if __name__ == "__main__": 37 | 38 | args = parser.parse_args() 39 | 40 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 41 | 42 | transform = get_transform(image_size=args.image_size) 43 | 44 | #######load model 45 | model = ram(pretrained=args.pretrained, 46 | image_size=args.image_size, 47 | vit='swin_l') 48 | 49 | #######set openset interference 50 | openset_label_embedding, openset_categories = build_openset_label_embedding() 51 | 52 | model.tag_list = np.array(openset_categories) 53 | 54 | model.label_embed = nn.Parameter(openset_label_embedding.float()) 55 | 56 | model.num_class = len(openset_categories) 57 | # the threshold for unseen categories is often lower 58 | 
model.class_threshold = torch.ones(model.num_class) * 0.5 59 | ####### 60 | 61 | model.eval() 62 | 63 | model = model.to(device) 64 | 65 | image = transform(Image.open(args.image)).unsqueeze(0).to(device) 66 | 67 | res = inference(image, model) 68 | print("Image Tags: ", res) 69 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/inference_tag2text.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Tag2Text Model 3 | * Written by Xinyu Huang 4 | ''' 5 | import argparse 6 | import numpy as np 7 | import random 8 | 9 | import torch 10 | 11 | from PIL import Image 12 | from ram.models import tag2text 13 | from ram import inference_tag2text as inference 14 | from ram import get_transform 15 | 16 | 17 | parser = argparse.ArgumentParser( 18 | description='Tag2Text inferece for tagging and captioning') 19 | parser.add_argument('--image', 20 | metavar='DIR', 21 | help='path to dataset', 22 | default='images/1641173_2291260800.jpg') 23 | parser.add_argument('--pretrained', 24 | metavar='DIR', 25 | help='path to pretrained model', 26 | default='pretrained/tag2text_swin_14m.pth') 27 | parser.add_argument('--image-size', 28 | default=384, 29 | type=int, 30 | metavar='N', 31 | help='input image size (default: 448)') 32 | parser.add_argument('--thre', 33 | default=0.68, 34 | type=float, 35 | metavar='N', 36 | help='threshold value') 37 | parser.add_argument('--specified-tags', 38 | default='None', 39 | help='User input specified tags') 40 | 41 | 42 | if __name__ == "__main__": 43 | 44 | args = parser.parse_args() 45 | 46 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 47 | 48 | transform = get_transform(image_size=args.image_size) 49 | 50 | # delete some tags that may disturb captioning 51 | # 127: "quarter"; 2961: "back", 3351: "two"; 3265: "three"; 3338: "four"; 3355: "five"; 3359: "one" 52 | delete_tag_index = [127,2961, 3351, 3265, 3338, 3355, 3359] 53 | 54 | #######load model 55 | model = tag2text(pretrained=args.pretrained, 56 | image_size=args.image_size, 57 | vit='swin_b', 58 | delete_tag_index=delete_tag_index) 59 | model.threshold = args.thre # threshold for tagging 60 | model.eval() 61 | 62 | model = model.to(device) 63 | 64 | image = transform(Image.open(args.image)).unsqueeze(0).to(device) 65 | 66 | res = inference(image, model, args.specified_tags) 67 | print("Model Identified Tags: ", res[0]) 68 | print("User Specified Tags: ", res[1]) 69 | print("Image Caption: ", res[2]) 70 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import inference_tag2text, inference_ram, inference_ram_openset 2 | from .transform import get_transform 3 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/configs/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | 
"type_vocab_size": 2, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/configs/q2l_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 4, 15 | "num_hidden_layers": 2, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "add_tag_cross_attention": false 22 | } -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/configs/swin/config_swinB_384.json: -------------------------------------------------------------------------------- 1 | { 2 | "ckpt": "pretrain_model/swin_base_patch4_window7_224_22k.pth", 3 | "vision_width": 1024, 4 | "image_res": 384, 5 | "window_size": 12, 6 | "embed_dim": 128, 7 | "depths": [ 2, 2, 18, 2 ], 8 | "num_heads": [ 4, 8, 16, 32 ] 9 | } -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/configs/swin/config_swinL_384.json: -------------------------------------------------------------------------------- 1 | { 2 | "ckpt": "pretrain_model/swin_large_patch4_window12_384_22k.pth", 3 | "vision_width": 1536, 4 | "image_res": 384, 5 | "window_size": 12, 6 | "embed_dim": 192, 7 | "depths": [ 2, 2, 18, 2 ], 8 | "num_heads": [ 6, 12, 24, 48 ] 9 | } -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/inference.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Inference of RAM and Tag2Text Models 3 | * Written by Xinyu Huang 4 | ''' 5 | import torch 6 | 7 | 8 | def inference_tag2text(image, model, input_tag="None"): 9 | 10 | with torch.no_grad(): 11 | caption, tag_predict = model.generate(image, 12 | tag_input=None, 13 | max_length=50, 14 | return_tag_predict=True) 15 | 16 | if input_tag == '' or input_tag == 'none' or input_tag == 'None': 17 | return tag_predict[0], None, caption[0] 18 | 19 | # If user input specified tags: 20 | else: 21 | input_tag_list = [] 22 | input_tag_list.append(input_tag.replace(',', ' | ')) 23 | 24 | with torch.no_grad(): 25 | caption, input_tag = model.generate(image, 26 | tag_input=input_tag_list, 27 | max_length=50, 28 | return_tag_predict=True) 29 | 30 | return tag_predict[0], input_tag[0], caption[0] 31 | 32 | 33 | def inference_ram(image, model): 34 | 35 | with torch.no_grad(): 36 | tags, confidences = model.generate_tag(image) 37 | 38 | return tags[0], confidences[0] 39 | 40 | 41 | def inference_ram_openset(image, model): 42 | 43 | with torch.no_grad(): 44 | tags = model.generate_tag_openset(image) 45 | 46 | return tags[0] 47 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .ram import ram, RAM 2 | from .ram_plus import ram_plus, RAM_plus 3 | from .tag2text import tag2text 4 | 
-------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/models/ram.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * The Recognize Anything Model (RAM) 3 | * Written by Xinyu Huang 4 | ''' 5 | import json 6 | import warnings 7 | 8 | import numpy as np 9 | import torch 10 | from torch import nn 11 | 12 | from .bert import BertConfig, BertModel 13 | from .swin_transformer import SwinTransformer 14 | from .utils import * 15 | 16 | warnings.filterwarnings("ignore") 17 | 18 | 19 | 20 | class RAM(nn.Module): 21 | def __init__(self, 22 | med_config=f'{CONFIG_PATH}/configs/med_config.json', 23 | image_size=384, 24 | vit='base', 25 | vit_grad_ckpt=False, 26 | vit_ckpt_layer=0, 27 | prompt='a picture of ', 28 | threshold=0.68, 29 | delete_tag_index=[], 30 | tag_list=f'{CONFIG_PATH}/data/ram_tag_list.txt'): 31 | r""" The Recognize Anything Model (RAM) inference module. 32 | RAM is a strong image tagging model, which can recognize any common category with high accuracy. 33 | Described in the paper " Recognize Anything: A Strong Image Tagging Model" https://recognize-anything.github.io/ 34 | 35 | Args: 36 | med_config (str): path for the mixture of encoder-decoder model's configuration file 37 | image_size (int): input image size 38 | vit (str): model size of vision transformer 39 | threshold (int): tagging threshold 40 | delete_tag_index (list): delete some tags that may disturb captioning 41 | """ 42 | super().__init__() 43 | 44 | # create image encoder 45 | self.image_size = image_size 46 | if vit == 'swin_b': 47 | if image_size == 224: 48 | vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_224.json' 49 | elif image_size == 384: 50 | vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_384.json' 51 | vision_config = read_json(vision_config_path) 52 | assert image_size == vision_config['image_res'] 53 | # assert config['patch_size'] == 32 54 | vision_width = vision_config['vision_width'] 55 | 56 | self.visual_encoder = SwinTransformer( 57 | img_size=vision_config['image_res'], 58 | patch_size=4, 59 | in_chans=3, 60 | embed_dim=vision_config['embed_dim'], 61 | depths=vision_config['depths'], 62 | num_heads=vision_config['num_heads'], 63 | window_size=vision_config['window_size'], 64 | mlp_ratio=4., 65 | qkv_bias=True, 66 | drop_rate=0.0, 67 | drop_path_rate=0.1, 68 | ape=False, 69 | patch_norm=True, 70 | use_checkpoint=False) 71 | 72 | elif vit == 'swin_l': 73 | if image_size == 224: 74 | vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_224.json' 75 | elif image_size == 384: 76 | vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_384.json' 77 | vision_config = read_json(vision_config_path) 78 | assert image_size == vision_config['image_res'] 79 | # assert config['patch_size'] == 32 80 | vision_width = vision_config['vision_width'] 81 | 82 | self.visual_encoder = SwinTransformer( 83 | img_size=vision_config['image_res'], 84 | patch_size=4, 85 | in_chans=3, 86 | embed_dim=vision_config['embed_dim'], 87 | depths=vision_config['depths'], 88 | num_heads=vision_config['num_heads'], 89 | window_size=vision_config['window_size'], 90 | mlp_ratio=4., 91 | qkv_bias=True, 92 | drop_rate=0.0, 93 | drop_path_rate=0.1, 94 | ape=False, 95 | patch_norm=True, 96 | use_checkpoint=False) 97 | 98 | else: 99 | self.visual_encoder, vision_width = create_vit( 100 | vit, image_size, vit_grad_ckpt, vit_ckpt_layer) 101 | 102 | # create tokenzier 103 | self.tokenizer = 
init_tokenizer() 104 | 105 | # Tag2Text employ encoder-decoder architecture for image-tag-text generation: image-tag interaction encoder and image-tag-text decoder 106 | # create image-tag interaction encoder 107 | encoder_config = BertConfig.from_json_file(med_config) 108 | encoder_config.encoder_width = 512 109 | self.tag_encoder = BertModel(config=encoder_config, 110 | add_pooling_layer=False) 111 | 112 | self.delete_tag_index = delete_tag_index 113 | self.prompt = prompt 114 | self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1 115 | 116 | # load tag list 117 | self.tag_list = self.load_tag_list(tag_list) 118 | 119 | # create image-tag recognition decoder 120 | self.threshold = threshold 121 | self.num_class = len(self.tag_list) 122 | q2l_config = BertConfig.from_json_file(f'{CONFIG_PATH}/configs/q2l_config.json') 123 | q2l_config.encoder_width = 512 124 | self.tagging_head = BertModel(config=q2l_config, 125 | add_pooling_layer=False) 126 | self.tagging_head.resize_token_embeddings(len(self.tokenizer)) 127 | # self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size) 128 | self.label_embed = nn.Parameter(torch.zeros(self.num_class, q2l_config.encoder_width)) 129 | 130 | if q2l_config.hidden_size != 512: 131 | self.wordvec_proj = nn.Linear(512, q2l_config.hidden_size) 132 | else: 133 | self.wordvec_proj = nn.Identity() 134 | 135 | self.fc = nn.Linear(q2l_config.hidden_size, 1) 136 | 137 | self.del_selfattention() 138 | 139 | # share weights of the lowest 2-layer of "image-tag interaction encoder" with the "image-tag recogntion decoder" 140 | tie_encoder_decoder_weights(self.tag_encoder, self.tagging_head, '', 141 | ' ') 142 | self.image_proj = nn.Linear(vision_width, 512) 143 | # self.label_embed = nn.Parameter(torch.load(f'{CONFIG_PATH}/data/textual_label_embedding.pth',map_location='cpu').float()) 144 | 145 | # adjust thresholds for some tags 146 | self.class_threshold = torch.ones(self.num_class) * self.threshold 147 | ram_class_threshold_path = f'{CONFIG_PATH}/data/ram_tag_list_threshold.txt' 148 | with open(ram_class_threshold_path, 'r', encoding='utf-8') as f: 149 | ram_class_threshold = [float(s.strip()) for s in f] 150 | for key,value in enumerate(ram_class_threshold): 151 | self.class_threshold[key] = value 152 | 153 | def override_class_threshold(self, cls, threshold): 154 | assert (type(cls) == str) and (type(threshold) == float) 155 | try: 156 | cls_idx = int(np.where(cls == self.tag_list)[0]) 157 | except: 158 | raise ValueError('{} not in the tag list'.format(cls)) 159 | assert ( 160 | threshold >= 0.0 and threshold <= 1.0 161 | ), "threshold must be between 0 and 1" 162 | self.class_threshold[cls_idx] = threshold 163 | 164 | def load_tag_list(self, tag_list_file): 165 | with open(tag_list_file, 'r', encoding="utf-8") as f: 166 | tag_list = f.read().splitlines() 167 | tag_list = np.array(tag_list) 168 | return tag_list 169 | 170 | # delete self-attention layer of image-tag recognition decoder to reduce computation, follower Query2Label 171 | def del_selfattention(self): 172 | del self.tagging_head.embeddings 173 | for layer in self.tagging_head.encoder.layer: 174 | del layer.attention 175 | 176 | def generate_tag(self, 177 | image, 178 | threshold=0.68, 179 | tag_input=None, 180 | ): 181 | 182 | label_embed = torch.nn.functional.relu(self.wordvec_proj(self.label_embed)) 183 | 184 | image_embeds = self.image_proj(self.visual_encoder(image)) 185 | image_atts = torch.ones(image_embeds.size()[:-1], 186 | dtype=torch.long).to(image.device) 187 | 
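# Query2Label-style decoding: every tag's label embedding acts as a query that
# cross-attends to the image embeddings through the tagging head (whose
# self-attention was removed in del_selfattention), and the linear head +
# sigmoid below turn each query's output into one score per tag.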
188 | # recognized image tags using image-tag recogntiion decoder 189 | image_cls_embeds = image_embeds[:, 0, :] 190 | image_spatial_embeds = image_embeds[:, 1:, :] 191 | 192 | bs = image_spatial_embeds.shape[0] 193 | label_embed = label_embed.unsqueeze(0).repeat(bs, 1, 1) 194 | tagging_embed = self.tagging_head( 195 | encoder_embeds=label_embed, 196 | encoder_hidden_states=image_embeds, 197 | encoder_attention_mask=image_atts, 198 | return_dict=False, 199 | mode='tagging', 200 | ) 201 | 202 | class_scores = torch.sigmoid( 203 | self.fc(tagging_embed[0]).squeeze(-1)) 204 | 205 | targets = torch.where( 206 | class_scores > self.class_threshold.to(image.device), 207 | torch.tensor(1.0).to(image.device), 208 | torch.zeros(self.num_class).to(image.device)) 209 | 210 | tag = targets.cpu().numpy() 211 | tag[:,self.delete_tag_index] = 0 212 | tag_output = [] 213 | tag_confidences = [] 214 | for b in range(bs): 215 | index = np.argwhere(tag[b] == 1) 216 | confidences = class_scores[b, index].cpu().numpy().reshape(-1) 217 | tag_confidences.append(confidences) 218 | token = self.tag_list[index].squeeze(axis=1) 219 | tag_output.append(' | '.join(token)) 220 | 221 | return tag_output, tag_confidences 222 | 223 | def generate_tag_openset(self, 224 | image, 225 | threshold=0.68, 226 | tag_input=None, 227 | ): 228 | 229 | label_embed = torch.nn.functional.relu(self.wordvec_proj(self.label_embed)) 230 | 231 | image_embeds = self.image_proj(self.visual_encoder(image)) 232 | image_atts = torch.ones(image_embeds.size()[:-1], 233 | dtype=torch.long).to(image.device) 234 | 235 | # recognized image tags using image-tag recogntiion decoder 236 | image_cls_embeds = image_embeds[:, 0, :] 237 | image_spatial_embeds = image_embeds[:, 1:, :] 238 | 239 | bs = image_spatial_embeds.shape[0] 240 | label_embed = label_embed.unsqueeze(0).repeat(bs, 1, 1) 241 | tagging_embed = self.tagging_head( 242 | encoder_embeds=label_embed, 243 | encoder_hidden_states=image_embeds, 244 | encoder_attention_mask=image_atts, 245 | return_dict=False, 246 | mode='tagging', 247 | ) 248 | 249 | class_scores = torch.sigmoid(self.fc(tagging_embed[0]).squeeze(-1)) 250 | 251 | targets = torch.where( 252 | class_scores > self.class_threshold.to(image.device), 253 | torch.tensor(1.0).to(image.device), 254 | torch.zeros(self.num_class).to(image.device)) 255 | 256 | tag = targets.cpu().numpy() 257 | tag[:,self.delete_tag_index] = 0 258 | tag_output = [] 259 | for b in range(bs): 260 | index = np.argwhere(tag[b] == 1) 261 | token = self.tag_list[index].squeeze(axis=1) 262 | tag_output.append(' | '.join(token)) 263 | 264 | # TODO also return tag confidences!!! 
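# One possible sketch for the TODO above (an assumption, mirroring generate_tag):
# initialize tag_confidences = [] before the loop, append
#   class_scores[b, index].cpu().numpy().reshape(-1)
# inside it, and return (tag_output, tag_confidences) instead of tag_output alone.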
265 | 266 | return tag_output 267 | 268 | 269 | # load RAM pretrained model parameters 270 | def ram(pretrained='', **kwargs): 271 | model = RAM(**kwargs) 272 | if pretrained: 273 | if kwargs['vit'] == 'swin_b': 274 | model, msg = load_checkpoint_swinbase(model, pretrained, kwargs) 275 | elif kwargs['vit'] == 'swin_l': 276 | model, msg = load_checkpoint_swinlarge(model, pretrained, kwargs) 277 | else: 278 | model, msg = load_checkpoint(model, pretrained) 279 | print('vit:', kwargs['vit']) 280 | # print('msg', msg) 281 | return model 282 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/transform.py: -------------------------------------------------------------------------------- 1 | from torchvision.transforms import Normalize, Compose, Resize, ToTensor 2 | 3 | 4 | def get_transform(image_size=384): 5 | return Compose([ 6 | lambda image: image.convert("RGB"), 7 | Resize((image_size, image_size)), 8 | ToTensor(), 9 | Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 10 | ]) 11 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .metrics import get_mAP, get_PR 2 | from .openset_utils import build_openset_label_embedding 3 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from numpy import ndarray 5 | 6 | 7 | def get_mAP( 8 | preds: ndarray, 9 | gt_file: str, 10 | taglist: List[str] 11 | ) -> Tuple[float, ndarray]: 12 | assert preds.shape[1] == len(taglist) 13 | 14 | # When mapping categories from test datasets to our system, there might be 15 | # multiple vs one situation due to different semantic definitions of tags. 16 | # So there can be duplicate tags in `taglist`. This special case is taken 17 | # into account. 
17 | # into account.
18 | tag2idxs = {} 19 | for idx, tag in enumerate(taglist): 20 | if tag not in tag2idxs: 21 | tag2idxs[tag] = [] 22 | tag2idxs[tag].append(idx) 23 | 24 | # build targets 25 | targets = np.zeros_like(preds) 26 | with open(gt_file, "r") as f: 27 | lines = [line.strip("\n").split(",") for line in f.readlines()] 28 | assert len(lines) == targets.shape[0] 29 | for i, line in enumerate(lines): 30 | for tag in line[1:]: 31 | targets[i, tag2idxs[tag]] = 1.0 32 | 33 | # compute average precision for each class 34 | APs = np.zeros(preds.shape[1]) 35 | for k in range(preds.shape[1]): 36 | APs[k] = _average_precision(preds[:, k], targets[:, k]) 37 | 38 | return APs.mean(), APs 39 | 40 | 41 | def _average_precision(output: ndarray, target: ndarray) -> float: 42 | epsilon = 1e-8 43 | 44 | # sort examples 45 | indices = output.argsort()[::-1] 46 | # Computes prec@i 47 | total_count_ = np.cumsum(np.ones((len(output), 1))) 48 | 49 | target_ = target[indices] 50 | ind = target_ == 1 51 | pos_count_ = np.cumsum(ind) 52 | total = pos_count_[-1] 53 | pos_count_[np.logical_not(ind)] = 0 54 | pp = pos_count_ / total_count_ 55 | precision_at_i_ = np.sum(pp) 56 | precision_at_i = precision_at_i_ / (total + epsilon) 57 | 58 | return precision_at_i 59 | 60 | 61 | def get_PR( 62 | pred_file: str, 63 | gt_file: str, 64 | taglist: List[str] 65 | ) -> Tuple[float, float, ndarray, ndarray]: 66 | # When mapping categories from test datasets to our system, there might be 67 | # multiple vs one situation due to different semantic definitions of tags. 68 | # So there can be duplicate tags in `taglist`. This special case is taken 69 | # into account. 70 | tag2idxs = {} 71 | for idx, tag in enumerate(taglist): 72 | if tag not in tag2idxs: 73 | tag2idxs[tag] = [] 74 | tag2idxs[tag].append(idx) 75 | 76 | # build preds 77 | with open(pred_file, "r", encoding="utf-8") as f: 78 | lines = [line.strip().split(",") for line in f.readlines()] 79 | preds = np.zeros((len(lines), len(tag2idxs)), dtype=bool) 80 | for i, line in enumerate(lines): 81 | for tag in line[1:]: 82 | preds[i, tag2idxs[tag]] = True 83 | 84 | # build targets 85 | with open(gt_file, "r", encoding="utf-8") as f: 86 | lines = [line.strip().split(",") for line in f.readlines()] 87 | targets = np.zeros((len(lines), len(tag2idxs)), dtype=bool) 88 | for i, line in enumerate(lines): 89 | for tag in line[1:]: 90 | targets[i, tag2idxs[tag]] = True 91 | 92 | assert preds.shape == targets.shape 93 | 94 | # calculate P and R 95 | TPs = ( preds & targets).sum(axis=0) # noqa: E201, E222 96 | FPs = ( preds & ~targets).sum(axis=0) # noqa: E201, E222 97 | FNs = (~preds & targets).sum(axis=0) # noqa: E201, E222 98 | eps = 1.e-9 99 | Ps = TPs / (TPs + FPs + eps) 100 | Rs = TPs / (TPs + FNs + eps) 101 | 102 | return Ps.mean(), Rs.mean(), Ps, Rs 103 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/ram/utils/openset_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | from clip import clip 7 | 8 | 9 | def article(name): 10 | return "an" if name[0] in "aeiou" else "a" 11 | 12 | 13 | def processed_name(name, rm_dot=False): 14 | # _ for lvis 15 | # / for obj365 16 | res = name.replace("_", " ").replace("/", " or ").lower() 17 | if rm_dot: 18 | res = res.rstrip(".") 19 | return res 20 | 21 | 22 | single_template = ["a photo of a {}."] 23 | 24 | multiple_templates = [ 25 | "There is {article} {} in the scene.", 26 | "There is the {} 
in the scene.", 27 | "a photo of {article} {} in the scene.", 28 | "a photo of the {} in the scene.", 29 | "a photo of one {} in the scene.", 30 | "itap of {article} {}.", 31 | "itap of my {}.", # itap: I took a picture of 32 | "itap of the {}.", 33 | "a photo of {article} {}.", 34 | "a photo of my {}.", 35 | "a photo of the {}.", 36 | "a photo of one {}.", 37 | "a photo of many {}.", 38 | "a good photo of {article} {}.", 39 | "a good photo of the {}.", 40 | "a bad photo of {article} {}.", 41 | "a bad photo of the {}.", 42 | "a photo of a nice {}.", 43 | "a photo of the nice {}.", 44 | "a photo of a cool {}.", 45 | "a photo of the cool {}.", 46 | "a photo of a weird {}.", 47 | "a photo of the weird {}.", 48 | "a photo of a small {}.", 49 | "a photo of the small {}.", 50 | "a photo of a large {}.", 51 | "a photo of the large {}.", 52 | "a photo of a clean {}.", 53 | "a photo of the clean {}.", 54 | "a photo of a dirty {}.", 55 | "a photo of the dirty {}.", 56 | "a bright photo of {article} {}.", 57 | "a bright photo of the {}.", 58 | "a dark photo of {article} {}.", 59 | "a dark photo of the {}.", 60 | "a photo of a hard to see {}.", 61 | "a photo of the hard to see {}.", 62 | "a low resolution photo of {article} {}.", 63 | "a low resolution photo of the {}.", 64 | "a cropped photo of {article} {}.", 65 | "a cropped photo of the {}.", 66 | "a close-up photo of {article} {}.", 67 | "a close-up photo of the {}.", 68 | "a jpeg corrupted photo of {article} {}.", 69 | "a jpeg corrupted photo of the {}.", 70 | "a blurry photo of {article} {}.", 71 | "a blurry photo of the {}.", 72 | "a pixelated photo of {article} {}.", 73 | "a pixelated photo of the {}.", 74 | "a black and white photo of the {}.", 75 | "a black and white photo of {article} {}.", 76 | "a plastic {}.", 77 | "the plastic {}.", 78 | "a toy {}.", 79 | "the toy {}.", 80 | "a plushie {}.", 81 | "the plushie {}.", 82 | "a cartoon {}.", 83 | "the cartoon {}.", 84 | "an embroidered {}.", 85 | "the embroidered {}.", 86 | "a painting of the {}.", 87 | "a painting of a {}.", 88 | ] 89 | 90 | 91 | openimages_rare_unseen = ['Aerial photography', 92 | 'Aircraft engine', 93 | 'Ale', 94 | 'Aloe', 95 | 'Amphibian', 96 | 'Angling', 97 | 'Anole', 98 | 'Antique car', 99 | 'Arcade game', 100 | 'Arthropod', 101 | 'Assault rifle', 102 | 'Athletic shoe', 103 | 'Auto racing', 104 | 'Backlighting', 105 | 'Bagpipes', 106 | 'Ball game', 107 | 'Barbecue chicken', 108 | 'Barechested', 109 | 'Barquentine', 110 | 'Beef tenderloin', 111 | 'Billiard room', 112 | 'Billiards', 113 | 'Bird of prey', 114 | 'Black swan', 115 | 'Black-and-white', 116 | 'Blond', 117 | 'Boating', 118 | 'Bonbon', 119 | 'Bottled water', 120 | 'Bouldering', 121 | 'Bovine', 122 | 'Bratwurst', 123 | 'Breadboard', 124 | 'Briefs', 125 | 'Brisket', 126 | 'Brochette', 127 | 'Calabaza', 128 | 'Camera operator', 129 | 'Canola', 130 | 'Childbirth', 131 | 'Chordophone', 132 | 'Church bell', 133 | 'Classical sculpture', 134 | 'Close-up', 135 | 'Cobblestone', 136 | 'Coca-cola', 137 | 'Combat sport', 138 | 'Comics', 139 | 'Compact car', 140 | 'Computer speaker', 141 | 'Cookies and crackers', 142 | 'Coral reef fish', 143 | 'Corn on the cob', 144 | 'Cosmetics', 145 | 'Crocodilia', 146 | 'Digital camera', 147 | 'Dishware', 148 | 'Divemaster', 149 | 'Dobermann', 150 | 'Dog walking', 151 | 'Domestic rabbit', 152 | 'Domestic short-haired cat', 153 | 'Double-decker bus', 154 | 'Drums', 155 | 'Electric guitar', 156 | 'Electric piano', 157 | 'Electronic instrument', 158 | 'Equestrianism', 159 | 'Equitation', 160 
| 'Erinaceidae', 161 | 'Extreme sport', 162 | 'Falafel', 163 | 'Figure skating', 164 | 'Filling station', 165 | 'Fire apparatus', 166 | 'Firearm', 167 | 'Flatbread', 168 | 'Floristry', 169 | 'Forklift truck', 170 | 'Freight transport', 171 | 'Fried food', 172 | 'Fried noodles', 173 | 'Frigate', 174 | 'Frozen yogurt', 175 | 'Frying', 176 | 'Full moon', 177 | 'Galleon', 178 | 'Glacial landform', 179 | 'Gliding', 180 | 'Go-kart', 181 | 'Goats', 182 | 'Grappling', 183 | 'Great white shark', 184 | 'Gumbo', 185 | 'Gun turret', 186 | 'Hair coloring', 187 | 'Halter', 188 | 'Headphones', 189 | 'Heavy cruiser', 190 | 'Herding', 191 | 'High-speed rail', 192 | 'Holding hands', 193 | 'Horse and buggy', 194 | 'Horse racing', 195 | 'Hound', 196 | 'Hunting knife', 197 | 'Hurdling', 198 | 'Inflatable', 199 | 'Jackfruit', 200 | 'Jeans', 201 | 'Jiaozi', 202 | 'Junk food', 203 | 'Khinkali', 204 | 'Kitesurfing', 205 | 'Lawn game', 206 | 'Leaf vegetable', 207 | 'Lechon', 208 | 'Lifebuoy', 209 | 'Locust', 210 | 'Lumpia', 211 | 'Luxury vehicle', 212 | 'Machine tool', 213 | 'Medical imaging', 214 | 'Melee weapon', 215 | 'Microcontroller', 216 | 'Middle ages', 217 | 'Military person', 218 | 'Military vehicle', 219 | 'Milky way', 220 | 'Miniature Poodle', 221 | 'Modern dance', 222 | 'Molluscs', 223 | 'Monoplane', 224 | 'Motorcycling', 225 | 'Musical theatre', 226 | 'Narcissus', 227 | 'Nest box', 228 | 'Newsagent\'s shop', 229 | 'Nile crocodile', 230 | 'Nordic skiing', 231 | 'Nuclear power plant', 232 | 'Orator', 233 | 'Outdoor shoe', 234 | 'Parachuting', 235 | 'Pasta salad', 236 | 'Peafowl', 237 | 'Pelmeni', 238 | 'Perching bird', 239 | 'Performance car', 240 | 'Personal water craft', 241 | 'Pit bull', 242 | 'Plant stem', 243 | 'Pork chop', 244 | 'Portrait photography', 245 | 'Primate', 246 | 'Procyonidae', 247 | 'Prosciutto', 248 | 'Public speaking', 249 | 'Racewalking', 250 | 'Ramen', 251 | 'Rear-view mirror', 252 | 'Residential area', 253 | 'Ribs', 254 | 'Rice ball', 255 | 'Road cycling', 256 | 'Roller skating', 257 | 'Roman temple', 258 | 'Rowing', 259 | 'Rural area', 260 | 'Sailboat racing', 261 | 'Scaled reptile', 262 | 'Scuba diving', 263 | 'Senior citizen', 264 | 'Shallot', 265 | 'Shinto shrine', 266 | 'Shooting range', 267 | 'Siberian husky', 268 | 'Sledding', 269 | 'Soba', 270 | 'Solar energy', 271 | 'Sport climbing', 272 | 'Sport utility vehicle', 273 | 'Steamed rice', 274 | 'Stemware', 275 | 'Sumo', 276 | 'Surfing Equipment', 277 | 'Team sport', 278 | 'Touring car', 279 | 'Toy block', 280 | 'Trampolining', 281 | 'Underwater diving', 282 | 'Vegetarian food', 283 | 'Wallaby', 284 | 'Water polo', 285 | 'Watercolor paint', 286 | 'Whiskers', 287 | 'Wind wave', 288 | 'Woodwind instrument', 289 | 'Yakitori', 290 | 'Zeppelin'] 291 | 292 | 293 | def build_openset_label_embedding(categories=None): 294 | if categories is None: 295 | categories = openimages_rare_unseen 296 | model, _ = clip.load("ViT-B/16") 297 | templates = multiple_templates 298 | 299 | run_on_gpu = torch.cuda.is_available() 300 | 301 | with torch.no_grad(): 302 | openset_label_embedding = [] 303 | for category in categories: 304 | texts = [ 305 | template.format( 306 | processed_name(category, rm_dot=True), article=article(category) 307 | ) 308 | for template in templates 309 | ] 310 | texts = [ 311 | "This is " + text if text.startswith("a") or text.startswith("the") else text 312 | for text in texts 313 | ] 314 | texts = clip.tokenize(texts) # tokenize 315 | if run_on_gpu: 316 | texts = texts.cuda() 317 | model = model.cuda() 318 | 
text_embeddings = model.encode_text(texts) 319 | text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True) 320 | text_embedding = text_embeddings.mean(dim=0) 321 | text_embedding /= text_embedding.norm() 322 | openset_label_embedding.append(text_embedding) 323 | openset_label_embedding = torch.stack(openset_label_embedding, dim=1) 324 | if run_on_gpu: 325 | openset_label_embedding = openset_label_embedding.cuda() 326 | 327 | openset_label_embedding = openset_label_embedding.t() 328 | return openset_label_embedding, categories 329 | 330 | 331 | 332 | 333 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/requirements.txt: -------------------------------------------------------------------------------- 1 | timm==0.4.12 2 | transformers==4.15.0 3 | fairscale==0.4.4 4 | pycocoevalcap 5 | torch 6 | torchvision 7 | Pillow 8 | scipy 9 | git+https://github.com/openai/CLIP.git 10 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = recognize-anything 3 | version = 0.0.1 4 | description = Recognize Anything Model and Tag2Text Model 5 | 6 | [options] 7 | packages = find: 8 | include_package_data = True 9 | 10 | [options.packages.find] 11 | exclude = 12 | datasets 13 | images 14 | outputs 15 | pretrained 16 | -------------------------------------------------------------------------------- /thirdparty/recognize-anything/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | setuptools.setup() 3 | --------------------------------------------------------------------------------
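With the package installed through its standard setuptools files (e.g. an editable install of thirdparty/recognize-anything after installing the pinned requirements above), the open-set path can be exercised end to end: build_openset_label_embedding formats every template for each category, encodes the prompts with CLIP ViT-B/16, and averages the normalized text embeddings into one label embedding per category. A minimal sketch of pointing it at a custom category list and grafting the result onto a loaded RAM model, following the same steps as inference_ram_openset.py (the category names and checkpoint path are placeholders):

import numpy as np
import torch
from torch import nn

from ram.models import ram
from ram.utils import build_openset_label_embedding

# placeholder categories; called with no argument, the function falls back to
# the openimages_rare_unseen list defined in openset_utils.py
categories = ['standing desk', 'robot arm', 'charging station']
openset_label_embedding, openset_categories = build_openset_label_embedding(categories)

model = ram(pretrained='pretrained/ram_swin_large_14m.pth',
            image_size=384,
            vit='swin_l')

model.tag_list = np.array(openset_categories)
model.label_embed = nn.Parameter(openset_label_embedding.float())
model.num_class = len(openset_categories)
# one flat threshold: the per-tag thresholds shipped with RAM only cover the built-in tag list
model.class_threshold = torch.ones(model.num_class) * 0.5

model.eval()
# from here on, inference_ram_openset(image, model) behaves exactly as in the script above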