├── .gitignore ├── LICENSE ├── README.md ├── ace_encoder_pretrained.pt ├── ace_loss.py ├── ace_network.py ├── ace_trainer.py ├── ace_util.py ├── ace_vis_util.py ├── ace_visualizer.py ├── dataset.py ├── datasets ├── dataset_util.py ├── setup_12scenes.py ├── setup_7scenes.py ├── setup_cambridge.py └── setup_wayspots.py ├── dsacstar ├── dsacstar.cpp ├── dsacstar_derivative.h ├── dsacstar_loss.h ├── dsacstar_types.h ├── dsacstar_util.h ├── dsacstar_util_rgbd.h ├── setup.py ├── stop_watch.h ├── thread_rand.cpp └── thread_rand.h ├── environment.yml ├── eval_poses.py ├── merge_ensemble_results.py ├── scripts ├── train_12scenes.sh ├── train_12scenes_pgt.sh ├── train_7scenes.sh ├── train_7scenes_pgt.sh ├── train_all.sh ├── train_cambridge.sh ├── train_cambridge_ensemble.sh ├── train_wayspots.sh ├── viz_12scenes.sh ├── viz_7scenes.sh ├── viz_cambridge.sh └── viz_wayspots.sh ├── test_ace.py └── train_ace.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .idea/ 3 | 4 | # Dataset files. 5 | datasets/7scenes_* 6 | datasets/12scenes_* 7 | datasets/Cambridge_* 8 | datasets/wayspots_* 9 | datasets/pgt_* 10 | datasets/visloc_pseudo_gt_limitations 11 | 12 | # Outputs. 13 | output/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | - Main ACE Files: 2 | - ace_encoder_pretrained.pt 3 | - ace_loss.py 4 | - ace_network.py 5 | - ace_trainer.py 6 | - ace_util.py 7 | - ace_visualizer.py 8 | - ace_vis_util.py 9 | - eval_poses.py 10 | - merge_ensemble_results.py 11 | - test_ace.py 12 | - train_ace.py 13 | - Contents of the scripts/ folder. 14 | 15 | Copyright © Niantic, Inc. 2022. Patent Pending. 16 | 17 | All rights reserved. 18 | 19 | 20 | 21 | ================================================================================ 22 | 23 | 24 | 25 | This Software is licensed under the terms of the following ACE license 26 | which allows for non-commercial use only. For any other use of the software not 27 | covered by the terms of this license, please contact partnerships@nianticlabs.com 28 | 29 | 30 | 31 | ================================================================================ 32 | 33 | 34 | 35 | ACE License 36 | 37 | 38 | This Agreement is made by and between the Licensor and the Licensee as 39 | defined and identified below. 40 | 41 | 42 | 1. Definitions. 43 | 44 | In this Agreement (“the Agreement”) the following words shall have the 45 | following meanings: 46 | 47 | "Authors" shall mean E. Brachmann, T. Cavallari, V. Prisacariu 48 | "Licensee" Shall mean the person or organization agreeing to use the 49 | Software in accordance with these terms and conditions. 50 | "Licensor" shall mean Niantic Inc., a company organized and existing under 51 | the laws of Delaware, whose principal place of business is at 1 Ferry Building, 52 | Suite 200, San Francisco, 94111. 53 | "Software" shall mean the ACE Software uploaded by Licensor to the 54 | GitHub repository at https://github.com/nianticlabs/ace 55 | on May 23rd, 2023, in source code or object code form and any 56 | accompanying documentation as well as any modifications or additions uploaded 57 | to the same GitHub repository by Licensor. 58 | 59 | 60 | 2. License. 
61 | 62 | 2.1 The Licensor has all necessary rights to grant a license under: (i) 63 | copyright and rights in the nature of copyright subsisting in the Software; and 64 | (ii) certain patent rights resulting from a patent application(s) filed by the 65 | Licensor in the United States and/or other jurisdictions in connection with the 66 | Software. The Licensor grants the Licensee for the duration of this Agreement, 67 | a free of charge, non-sublicenseable, non-exclusive, non-transferable copyright 68 | and patent license (in consequence of said patent application(s)) to use the 69 | Software for non-commercial purpose only, including teaching and research at 70 | educational institutions and research at not-for-profit research institutions 71 | in accordance with the provisions of this Agreement. Non-commercial use 72 | expressly excludes any profit-making or commercial activities, including without 73 | limitation sale, license, manufacture or development of commercial products, use in 74 | commercially-sponsored research, use at a laboratory or other facility owned or 75 | controlled (whether in whole or in part) by a commercial entity, provision of 76 | consulting service, use for or on behalf of any commercial entity, use in 77 | research where a commercial party obtains rights to research results or any 78 | other benefit, and use of the code in any models, model weights or code 79 | resulting from such procedure in any commercial product. Notwithstanding the 80 | foregoing restrictions, you can use this code for publishing comparison results 81 | for academic papers, including retraining on your own data. Any use of the 82 | Software for any purpose other than pursuant to the license grant set forth 83 | above shall automatically terminate this License. 84 | 85 | 86 | 2.2 The Licensee is permitted to make modifications to the Software 87 | provided that any distribution of such modifications is in accordance with 88 | Clause 3. 89 | 90 | 2.3 Except as expressly permitted by this Agreement and save to the 91 | extent and in the circumstances expressly required to be permitted by law, the 92 | Licensee is not permitted to rent, lease, sell, offer to sell, or loan the 93 | Software or its associated documentation. 94 | 95 | 96 | 3. Redistribution and modifications 97 | 98 | 3.1 The Licensee may reproduce and distribute copies of the Software, with 99 | or without modifications, in source format only and provided that any and every 100 | distribution is accompanied by an unmodified copy of this License, and that the 101 | following copyright notice is always displayed in an obvious manner: 102 | "Copyright © Niantic, Inc. 2023. All rights reserved." 103 | 104 | 3.2 In the case where the Software has been modified, any distribution must 105 | include prominent notices indicating which files have been changed. 106 | 107 | 3.3 The Licensee shall cause any work that it distributes or publishes, 108 | that in whole or in part contains or is derived from the Software or any part 109 | thereof (“Work based on the Software”), to be licensed as a whole at no charge 110 | to all third parties entitled to a license to the Software under the terms of 111 | this License and on the same terms provided in this License. 112 | 113 | 114 | 4. Duration. 115 | 116 | This Agreement is effective until the Licensee terminates it by destroying 117 | the Software, any Work based on the Software, and its documentation together 118 | with all copies. 
It will also terminate automatically if the Licensee fails to 119 | abide by its terms. Upon automatic termination the Licensee agrees to destroy 120 | all copies of the Software, Work based on the Software, and its documentation. 121 | 122 | 123 | 5. Disclaimer of Warranties. 124 | 125 | The Software is provided as is. To the maximum extent permitted by law, 126 | Licensor provides no warranties or conditions of any kind, either express or 127 | implied, including without limitation, any warranties or condition of title, 128 | non-infringement or fitness for a particular purpose. 129 | 130 | 131 | 6. LIMITATION OF LIABILITY. 132 | 133 | IN NO EVENT SHALL THE LICENSOR AND/OR AUTHORS BE LIABLE FOR ANY DIRECT, 134 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING 135 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 136 | DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 137 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 138 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 139 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 140 | 141 | 142 | 7. Indemnity. 143 | 144 | The Licensee shall indemnify the Licensor and/or Authors against all third 145 | party claims that may be asserted against or suffered by the Licensor and/or 146 | Authors and which relate to use of the Software by the Licensee. 147 | 148 | 149 | 8. Intellectual Property. 150 | 151 | 8.1 As between the Licensee and Licensor, copyright and all other 152 | intellectual property rights subsisting in or in connection with the Software 153 | and supporting information shall remain at all times the property of the 154 | Licensor. The Licensee shall acquire no rights in any such material except as 155 | expressly provided in this Agreement. 156 | 157 | 8.2 No permission is granted to use the trademarks or product names of the 158 | Licensor except as required for reasonable and customary use in describing the 159 | origin of the Software and for the purposes of abiding by the terms of Clause 160 | 3.1. 161 | 162 | 8.3 The Licensee shall promptly notify the Licensor of any improvement or 163 | new use of the Software (“Improvements”) in sufficient detail for Licensor to 164 | evaluate the Improvements. The Licensee hereby grants the Licensor and its 165 | affiliates a non-exclusive, fully paid-up, royalty-free, irrevocable and 166 | perpetual license to all Improvements for non-commercial academic research and 167 | teaching purposes upon creation of such improvements. 168 | 169 | 8.4 The Licensee grants an exclusive first option to the Licensor to be 170 | exercised by the Licensor within three (3) years of the date of notification of 171 | an Improvement under Clause 8.3 to use any the Improvement for commercial 172 | purposes on terms to be negotiated and agreed by Licensee and Licensor in good 173 | faith within a period of six (6) months from the date of exercise of the said 174 | option (including without limitation any royalty share in net income from such 175 | commercialization payable to the Licensee, as the case may be). 176 | 177 | 178 | 9. Acknowledgements. 179 | 180 | The Licensee shall acknowledge the Authors and use of the Software in the 181 | publication of any work that uses, or results that are achieved through, the 182 | use of the Software. 
The following citation shall be included in the 183 | acknowledgement: "Accelerated Coordinate Encoding: Learning to Relocalize in 184 | Minutes using RGB and Poses"; by Eric Brachmann, Tommaso Cavallari, and Victor 185 | Adrian Prisacariu; CVPR 2023. 186 | 187 | 188 | 10. Governing Law. 189 | 190 | This Agreement shall be governed by, construed and interpreted in 191 | accordance with English law and the parties submit to the exclusive 192 | jurisdiction of the English courts. 193 | 194 | 195 | 11. Termination. 196 | 197 | Upon termination of this Agreement, the licenses granted hereunder will 198 | terminate and Sections 5, 6, 7, 8, 9, 10 and 11 shall survive any termination 199 | of this Agreement. 200 | 201 | 202 | -------------------------------------------------------------------------------- 203 | - dataset.py 204 | - Contents of the dsacstar/ folder. 205 | 206 | BSD 3-Clause License 207 | 208 | Copyright (c) 2020, ebrach 209 | All rights reserved. 210 | 211 | Redistribution and use in source and binary forms, with or without 212 | modification, are permitted provided that the following conditions are met: 213 | 214 | 1. Redistributions of source code must retain the above copyright notice, this 215 | list of conditions and the following disclaimer. 216 | 217 | 2. Redistributions in binary form must reproduce the above copyright notice, 218 | this list of conditions and the following disclaimer in the documentation 219 | and/or other materials provided with the distribution. 220 | 221 | 3. Neither the name of the copyright holder nor the names of its 222 | contributors may be used to endorse or promote products derived from 223 | this software without specific prior written permission. 224 | 225 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 226 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 227 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 228 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 229 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 230 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 231 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 232 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 233 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 234 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
235 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Accelerated Coordinate Encoding: Learning to Relocalize in Minutes using RGB and Poses 2 | 3 | ---------------------------------------------------------------------------------------- 4 | 5 | This repository contains the code associated to the ACE paper: 6 | > **Accelerated Coordinate Encoding: Learning to Relocalize in Minutes using RGB and Poses** 7 | > 8 | > [Eric Brachmann](https://ebrach.github.io/), [Tommaso Cavallari](https://scholar.google.it/citations?user=r7osSm0AAAAJ&hl=en), and [Victor Adrian Prisacariu](https://www.robots.ox.ac.uk/~victor/) 9 | > 10 | > [CVPR 2023, Highlight](https://openaccess.thecvf.com/content/CVPR2023/papers/Brachmann_Accelerated_Coordinate_Encoding_Learning_to_Relocalize_in_Minutes_Using_RGB_CVPR_2023_paper.pdf) 11 | 12 | For further information please visit: 13 | 14 | - [Project page (with videos, method explanations, dataset details)](https://nianticlabs.github.io/ace) 15 | - [Arxiv](https://arxiv.org/abs/2305.14059) 16 | 17 | Table of contents: 18 | 19 | - [Installation](#installation) 20 | - [Dataset Setup](#datasets) 21 | - [Usage](#usage) 22 | - [ACE Training](#ace-training) 23 | - [ACE Evaluation](#ace-evaluation) 24 | - [Training Scripts](#complete-training-and-evaluation-scripts) 25 | - [Pretrained ACE Networks](#pretrained-ace-networks) 26 | - [Note on the Encoder Training](#encoder-training) 27 | - [References](#publications) 28 | 29 | ## Installation 30 | 31 | This code uses PyTorch to train and evaluate the scene-specific coordinate prediction head networks. It has been tested 32 | on Ubuntu 20.04 with a T4 Nvidia GPU, although it should reasonably run with other Linux distributions and GPUs as well. 33 | 34 | We provide a pre-configured [`conda`](https://docs.conda.io/en/latest/) environment containing all required dependencies 35 | necessary to run our code. 36 | You can re-create and activate the environment with: 37 | 38 | ```shell 39 | conda env create -f environment.yml 40 | conda activate ace 41 | ``` 42 | 43 | **All the following commands in this file need to run in the `ace` environment.** 44 | 45 | The ACE network predicts dense 3D scene coordinates associated to the pixels of the input images. 46 | In order to estimate the 6DoF camera poses, it relies on the RANSAC implementation of the DSAC* paper (Brachmann and 47 | Rother, TPAMI 2021), which is written in C++. 48 | As such, you need to build and install the C++/Python bindings of those functions. 49 | You can do this with: 50 | 51 | ```shell 52 | cd dsacstar 53 | python setup.py install 54 | ``` 55 | 56 | > (Optional) If you want to create videos of the training/evaluation process: 57 | > ```shell 58 | > sudo apt install ffmpeg 59 | > ``` 60 | 61 | Having done the steps above, you are ready to experiment with ACE! 
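As a quick sanity check you can confirm that the environment and the compiled bindings are importable. The snippet below is a hypothetical helper (not part of the repository); `dsacstar` is the module name produced by the build step above:

```python
# check_install.py -- hypothetical helper, not part of the repository.
import torch
import dsacstar  # built by `python setup.py install` in the dsacstar/ folder

print(f"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
print("dsacstar bindings imported successfully.")
```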
62 | 63 | ## Datasets 64 | 65 | The ACE method has been evaluated using multiple published datasets: 66 | 67 | - [Microsoft 7-Scenes](https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/) 68 | - [Stanford 12-Scenes](https://graphics.stanford.edu/projects/reloc/) 69 | - [Cambridge Landmarks](https://www.repository.cam.ac.uk/handle/1810/251342/) 70 | - [Niantic Wayspots](https://nianticlabs.github.io/ace#dataset) 71 | 72 | We provide scripts in the `datasets` folder to automatically download and extract the data in a format that can be 73 | readily used by the ACE scripts. 74 | The format is the same used by the DSAC* codebase, see [here](https://github.com/vislearn/dsacstar#data-structure) for 75 | details. 76 | 77 | > **Important: make sure you have checked the license terms of each dataset before using it.** 78 | 79 | ### {7, 12}-Scenes: 80 | 81 | You can use the `datasets/setup_{7,12}scenes.py` scripts to download the data. 82 | As mentioned in the paper, we experimented with two variants of each of these datasets: one using the original 83 | D-SLAM ground truth camera poses, and one using _Pseudo Ground Truth (PGT)_ camera poses obtained after running SfM on 84 | the scenes 85 | (see 86 | the [ICCV 2021 paper](https://openaccess.thecvf.com/content/ICCV2021/html/Brachmann_On_the_Limits_of_Pseudo_Ground_Truth_in_Visual_Camera_ICCV_2021_paper.html) 87 | , 88 | and [associated code](https://github.com/tsattler/visloc_pseudo_gt_limitations/) for details). 89 | 90 | To download and prepare the datasets using the D-SLAM poses: 91 | 92 | ```shell 93 | cd datasets 94 | # Downloads the data to datasets/7scenes_{chess, fire, ...} 95 | ./setup_7scenes.py 96 | # Downloads the data to datasets/12scenes_{apt1_kitchen, ...} 97 | ./setup_12scenes.py 98 | ``` 99 | 100 | To download and prepare the datasets using the PGT poses: 101 | 102 | ```shell 103 | cd datasets 104 | # Downloads the data to datasets/pgt_7scenes_{chess, fire, ...} 105 | ./setup_7scenes.py --poses pgt 106 | # Downloads the data to datasets/pgt_12scenes_{apt1_kitchen, ...} 107 | ./setup_12scenes.py --poses pgt 108 | ``` 109 | 110 | ### Cambridge Landmarks / Niantic Wayspots: 111 | 112 | We used a single variant of these datasets. Simply run: 113 | 114 | ```shell 115 | cd datasets 116 | # Downloads the data to datasets/Cambridge_{GreatCourt, KingsCollege, ...} 117 | ./setup_cambridge.py 118 | # Downloads the data to datasets/wayspots_{bears, cubes, ...} 119 | ./setup_wayspots.py 120 | ``` 121 | 122 | ## Usage 123 | 124 | We provide scripts to train and evaluate ACE scene coordinate regression networks. 125 | In the following sections we'll detail some of the main command line options that can be used to customize the 126 | behavior of both the training and the pose estimation script. 127 | 128 | ### ACE Training 129 | 130 | The ACE scene-specific coordinate regression head for a scene can be trained using the `train_ace.py` script. 131 | Basic usage: 132 | 133 | ```shell 134 | ./train_ace.py 135 | # Example: 136 | ./train_ace.py datasets/7scenes_chess output/7scenes_chess.pt 137 | ``` 138 | 139 | The output map file contains just the weights of the scene-specific head network -- encoded as half-precision floating 140 | point -- for a size of ~4MB when using default options, as mentioned in the paper. The testing script will use these 141 | weights, together with the scene-agnostic pretrained encoder (`ace_encoder_pretrained.pt`) we provide, to estimate 6DoF 142 | poses for the query images. 
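For reference, the sketch below shows roughly how the two files are combined at evaluation time, using `Regressor.create_from_split_state_dict` from `ace_network.py`; the paths are examples only:

```python
import torch

from ace_network import Regressor

# Scene-agnostic encoder (shipped with the repository) and a scene-specific head
# produced by train_ace.py (stored as half-precision floats, ~4MB).
encoder_state_dict = torch.load("ace_encoder_pretrained.pt", map_location="cpu")
head_state_dict = torch.load("output/7scenes_chess.pt", map_location="cpu")

# Merge the two state dicts and instantiate the full scene coordinate regressor.
network = Regressor.create_from_split_state_dict(encoder_state_dict, head_state_dict)
network = network.eval()
```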
143 | 144 | **Additional parameters** that can be passed to the training script to alter its behavior: 145 | 146 | - `--training_buffer_size`: Changes the size of the training buffer containing decorrelated image features (see paper), 147 | that is created at the beginning of the training process. The default size is 8M. 148 | - `--samples_per_image`: How many features to sample from each image during the buffer generation phase. This affects 149 | the amount of time necessary to fill the training buffer, but also affects the amount of decorrelation in the features 150 | present in the buffer. The default is 1024 samples per image. 151 | - `--epochs`: How many full passes over the training buffer are performed during the training. This directly affects the 152 | training time. Default is 16. 153 | - `--num_head_blocks`: The depth of the head network. Specifically, the number of extra 3-layer residual blocks to add 154 | to the default head depth. Default value is 1, which results in a head network composed of 9 layers, for a total of 155 | 4MB weights. 156 | 157 | **Clustering parameters:** these are used for the ensemble experiments (ACE Poker variant) we ran on the Cambridge 158 | Landmarks 159 | dataset. They are used to split the input scene into multiple independent clusters, and training the head network on one 160 | of them (see Section 4.2 of the main paper and Section 1.3 of the supplementary material for details). 161 | 162 | - `--num_clusters`: How many clusters to split the training scene in. Default `None` (disabled). 163 | - `--cluster_idx`: Selects a specific cluster for training. 164 | 165 | **Visualization parameters:** these are used to generate the videos available in the project page (they actually 166 | generate 167 | individual frames that can be collated into a video later). _Note: enabling them will significantly slow down the 168 | training._ 169 | 170 | - `--render_visualization`: Set to `True` to enable generating frames showing the training process. Default `False`. 171 | - `--render_target_path`: Base folder where the frames will be saved. The script automatically appends the current map 172 | name to the folder. Default is `renderings`. 173 | 174 | There are other options available, they can be discovered by running the script with the `--help` flag. 175 | 176 | ### ACE Evaluation 177 | 178 | The pose estimation for a testing scene can be performed using the `test_ace.py` script. 179 | Basic usage: 180 | 181 | ```shell 182 | ./test_ace.py 183 | # Example: 184 | ./test_ace.py datasets/7scenes_chess output/7scenes_chess.pt 185 | ``` 186 | 187 | The script loads (a) the scene-specific ACE head network and (b) the pre-trained scene-agnostic encoder and, for each 188 | testing frame: 189 | 190 | - Computes its per-pixel 3D scene coordinates, resulting in a set of 2D-3D correspondences. 191 | - The correspondences are then passed to a RANSAC algorithm that is able to estimate a 6DoF camera pose. 192 | - The camera poses are compared with the ground truth, and various cumulative metrics are then computed and printed 193 | at the end of the script. 194 | 195 | The metrics include: %-age of frames within certain translation/angle thresholds of the ground truth, 196 | median translation, median rotation error. 197 | 198 | The script also creates a file containing per-frame results so that they can be parsed by other tools or analyzed 199 | separately. 200 | The output file is located alongside the head network and is named: `poses__.txt`. 
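Such a file can be post-processed with a few lines of Python, for example to recompute the median errors. The sketch below assumes the whitespace-separated column layout documented just after it; the file name is a placeholder:

```python
import numpy as np

# Placeholder file name; use the poses_*.txt file written next to your head network.
with open("output/poses_example_session.txt") as f:
    rows = [line.split() for line in f if line.strip()]

rot_errs_deg = np.array([float(r[8]) for r in rows])  # rotation error in degrees
tr_errs_m = np.array([float(r[9]) for r in rows])     # translation error in metres

print(f"Median rotation error: {np.median(rot_errs_deg):.2f} deg")
print(f"Median translation error: {np.median(tr_errs_m) * 100:.1f} cm")
```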
201 | 202 | Each line in the output file contains the results for an individual query frame, in this format: 203 | 204 | ``` 205 | file_name rot_quaternion_w rot_quaternion_x rot_quaternion_y rot_quaternion_z translation_x translation_y translation_z rot_err_deg tr_err_m inlier_count 206 | ``` 207 | 208 | There are some parameters that can be passed to the script to customize the RANSAC behavior: 209 | 210 | - `--session`: Custom suffix to append to the name of the file containing the estimated camera poses (see paragraph 211 | above). 212 | - `--hypotheses`: How many pose hypotheses to generate and evaluate (i.e. the number of RANSAC iterations). Default is 213 | 64. 214 | - `--threshold`: Inlier threshold (in pixels) to consider a 2D-3D correspondence as valid. 215 | - `--render_visualization`: Set to `True` to enable generating frames showing the evaluation process. Will slow down the 216 | testing significantly if enabled. Default `False`. 217 | - `--render_target_path`: Base folder where the frames will be saved. The script automatically appends the current map 218 | name to the folder. Default is `renderings`. 219 | 220 | There are other options available, they can be discovered by running the script with the `--help` flag. 221 | 222 | #### Ensemble Evaluation 223 | 224 | To deploy the ensemble variants (such as the 4-cluster ACE Poker variant), we simply need to run the training script 225 | multiple times (once per cluster), thus training multiple head networks. 226 | 227 | At localization time we run the testing script once for each trained head and save the per-frame `poses_...` files, 228 | passing the `--session` parameter to the test script to tag each file according to the network used to generate it. 229 | 230 | We provide two more scripts: 231 | 232 | 1. `merge_ensemble_results.py`: Merge multiple `poses_` files -- choosing for each frame the pose that resulted in the 233 | best inlier count. 234 | 2. `eval_poses.py`: Compute the overall accuracy metrics (%-age of poses within threshold and median errors) that we showed 235 | in the paper. 236 | 237 | ACE Poker example for a scene in the Cambridge dataset: 238 | 239 | ```shell 240 | mkdir -p output/Cambridge_GreatCourt 241 | 242 | # Head training: 243 | ./train_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/0_4.pt --num_clusters 4 --cluster_idx 0 244 | ./train_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/1_4.pt --num_clusters 4 --cluster_idx 1 245 | ./train_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/2_4.pt --num_clusters 4 --cluster_idx 2 246 | ./train_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/3_4.pt --num_clusters 4 --cluster_idx 3 247 | 248 | # Per-cluster evaluation: 249 | ./test_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/0_4.pt --session 0_4 250 | ./test_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/1_4.pt --session 1_4 251 | ./test_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/2_4.pt --session 2_4 252 | ./test_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/3_4.pt --session 3_4 253 | 254 | # Merging results and computing metrics. 255 | 256 | # The merging script takes a --poses_suffix argument that's used to select only the 257 | # poses generated for the requested number of clusters. 
258 | ./merge_ensemble_results.py output/Cambridge_GreatCourt output/Cambridge_GreatCourt/merged_poses_4.txt --poses_suffix "_4.txt" 259 | 260 | # The output poses output by the previous script are then evaluated against the scene ground truth data. 261 | ./eval_poses.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/merged_poses_4.txt 262 | ``` 263 | 264 | ### Complete training and evaluation scripts 265 | 266 | We provide several scripts to run training and evaluation on the various datasets we tested our method with. 267 | These allow replicating the results we showcased in the paper. 268 | They are located under the `scripts` folder: `scripts/train_*.sh`. 269 | 270 | In the same folder we also provide scripts to generate videos of the training/testing protocol, as can be seen in the 271 | project page. They are under `scripts/viz_*.sh`. 272 | 273 | ### Pretrained ACE Networks 274 | 275 | We also make available the set of pretrained ACE Heads we used for the experiments in the paper. 276 | Each head was trained for 5 minutes on one of the scenes in the various datasets, and was used to compute the accuracy 277 | metrics we showed in the main text. 278 | 279 | Each network can be passed directly to the `test_ace.py` script, together with the path to its dataset scene, to run 280 | camera relocalization on the images of the testing split and compute the accuracy metrics, like this: 281 | 282 | ```shell 283 | ./test_ace.py datasets/7scenes_chess /7Scenes/7scenes_chess.pt 284 | ``` 285 | 286 | **The data is available 287 | at [this location](https://storage.googleapis.com/niantic-lon-static/research/ace/ace_models.tar.gz).** 288 | 289 | ### Encoder Training 290 | 291 | As mentioned above, in this repository we provide a set of weights for the pretrained feature extraction 292 | backbone (`ace_encoder_pretrained.pt`) that was used in our experiments. 293 | You are welcome to use them to experiment with ACE in novel scenes. 294 | Both indoor and outdoor environments perform reasonably well with this set of weights, as shown in the paper. 295 | 296 | **Unfortunately, we cannot provide the code to train the encoder as part of this release.** 297 | 298 | The feature extractor has been trained on 100 scenes from [ScanNet](http://www.scan-net.org/), in parallel, for ~1 week, 299 | as described in Section 3.3 of the paper and Section 1.1 of the supplementary material. 300 | It is possible to reimplement the encoder training protocol following those instructions. 301 | 302 | ## Publications 303 | 304 | If you use ACE or parts of its code in your own work, please cite: 305 | 306 | ``` 307 | @inproceedings{brachmann2023ace, 308 | title={Accelerated Coordinate Encoding: Learning to Relocalize in Minutes using RGB and Poses}, 309 | author={Brachmann, Eric and Cavallari, Tommaso and Prisacariu, Victor Adrian}, 310 | booktitle={CVPR}, 311 | year={2023}, 312 | } 313 | ``` 314 | 315 | This code builds on previous camera relocalization pipelines, namely DSAC, DSAC++, and DSAC*. 
Please consider citing: 316 | 317 | ``` 318 | @inproceedings{brachmann2017dsac, 319 | title={{DSAC}-{Differentiable RANSAC} for Camera Localization}, 320 | author={Brachmann, Eric and Krull, Alexander and Nowozin, Sebastian and Shotton, Jamie and Michel, Frank and Gumhold, Stefan and Rother, Carsten}, 321 | booktitle={CVPR}, 322 | year={2017} 323 | } 324 | 325 | @inproceedings{brachmann2018lessmore, 326 | title={Learning less is more - {6D} camera localization via {3D} surface regression}, 327 | author={Brachmann, Eric and Rother, Carsten}, 328 | booktitle={CVPR}, 329 | year={2018} 330 | } 331 | 332 | @article{brachmann2021dsacstar, 333 | title={Visual Camera Re-Localization from {RGB} and {RGB-D} Images Using {DSAC}}, 334 | author={Brachmann, Eric and Rother, Carsten}, 335 | journal={TPAMI}, 336 | year={2021} 337 | } 338 | ``` 339 | 340 | ## License 341 | 342 | Copyright © Niantic, Inc. 2023. Patent Pending. 343 | All rights reserved. 344 | Please see the [license file](LICENSE) for terms. -------------------------------------------------------------------------------- /ace_encoder_pretrained.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nianticlabs/ace/e9e90f2d02ee92c348bf411a5a60e230af6c315e/ace_encoder_pretrained.pt -------------------------------------------------------------------------------- /ace_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright © Niantic, Inc. 2022. 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def weighted_tanh(repro_errs, weight): 8 | return weight * torch.tanh(repro_errs / weight).sum() 9 | 10 | 11 | class ReproLoss: 12 | """ 13 | Compute per-pixel reprojection loss using different configurable approaches. 14 | 15 | - tanh: tanh loss with a constant scale factor given by the `soft_clamp` parameter (when a pixel's reprojection 16 | error is equal to `soft_clamp`, its loss is equal to `soft_clamp * tanh(1)`). 17 | - dyntanh: Used in the paper, similar to the tanh loss above, but the scaling factor decreases during the course of 18 | the training from `soft_clamp` to `soft_clamp_min`. The decrease is linear, unless `circle_schedule` 19 | is True (default), in which case it applies a circular scheduling. See paper for details. 20 | - l1: Standard L1 loss, computed only on those pixels having an error lower than `soft_clamp` 21 | - l1+sqrt: L1 loss for pixels with reprojection error smaller than `soft_clamp` and 22 | `sqrt(soft_clamp * reprojection_error)` for pixels with a higher error. 23 | - l1+logl1: Similar to the above, but using log L1 for pixels with high reprojection error. 24 | """ 25 | 26 | def __init__(self, 27 | total_iterations, 28 | soft_clamp, 29 | soft_clamp_min, 30 | type='dyntanh', 31 | circle_schedule=True): 32 | 33 | self.total_iterations = total_iterations 34 | self.soft_clamp = soft_clamp 35 | self.soft_clamp_min = soft_clamp_min 36 | self.type = type 37 | self.circle_schedule = circle_schedule 38 | 39 | def compute(self, repro_errs_b1N, iteration): 40 | if repro_errs_b1N.nelement() == 0: 41 | return 0 42 | 43 | if self.type == "tanh": 44 | return weighted_tanh(repro_errs_b1N, self.soft_clamp) 45 | 46 | elif self.type == "dyntanh": 47 | # Compute the progress over the training process. 48 | schedule_weight = iteration / self.total_iterations 49 | 50 | if self.circle_schedule: 51 | # Optionally scale it using the circular schedule. 
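                # With t = iteration / total_iterations in [0, 1], the mapping 1 - sqrt(1 - t^2)
                # stays close to 0 for most of training and only rises steeply towards 1 near the
                # end, so the tanh scale computed below shrinks slowly at first and quickly in the
                # final epochs (from ~soft_clamp down to soft_clamp_min).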
52 | schedule_weight = 1 - np.sqrt(1 - schedule_weight ** 2) 53 | 54 | # Compute the weight to use in the tanh loss. 55 | loss_weight = (1 - schedule_weight) * self.soft_clamp + self.soft_clamp_min 56 | 57 | # Compute actual loss. 58 | return weighted_tanh(repro_errs_b1N, loss_weight) 59 | 60 | elif self.type == "l1": 61 | # L1 loss on all pixels with small-enough error. 62 | softclamp_mask_b1 = repro_errs_b1N > self.soft_clamp 63 | return repro_errs_b1N[~softclamp_mask_b1].sum() 64 | 65 | elif self.type == "l1+sqrt": 66 | # L1 loss on pixels with small errors and sqrt for the others. 67 | softclamp_mask_b1 = repro_errs_b1N > self.soft_clamp 68 | loss_l1 = repro_errs_b1N[~softclamp_mask_b1].sum() 69 | loss_sqrt = torch.sqrt(self.soft_clamp * repro_errs_b1N[softclamp_mask_b1]).sum() 70 | 71 | return loss_l1 + loss_sqrt 72 | 73 | else: 74 | # l1+logl1: same as above, but use log(L1) for pixels with a larger error. 75 | softclamp_mask_b1 = repro_errs_b1N > self.soft_clamp 76 | loss_l1 = repro_errs_b1N[~softclamp_mask_b1].sum() 77 | loss_logl1 = torch.log(1 + (self.soft_clamp * repro_errs_b1N[softclamp_mask_b1])).sum() 78 | 79 | return loss_l1 + loss_logl1 80 | -------------------------------------------------------------------------------- /ace_network.py: -------------------------------------------------------------------------------- 1 | # Copyright © Niantic, Inc. 2022. 2 | 3 | import logging 4 | import math 5 | import re 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | _logger = logging.getLogger(__name__) 12 | 13 | 14 | class Encoder(nn.Module): 15 | """ 16 | FCN encoder, used to extract features from the input images. 17 | 18 | The number of output channels is configurable, the default used in the paper is 512. 19 | """ 20 | 21 | def __init__(self, out_channels=512): 22 | super(Encoder, self).__init__() 23 | 24 | self.out_channels = out_channels 25 | 26 | self.conv1 = nn.Conv2d(1, 32, 3, 1, 1) 27 | self.conv2 = nn.Conv2d(32, 64, 3, 2, 1) 28 | self.conv3 = nn.Conv2d(64, 128, 3, 2, 1) 29 | self.conv4 = nn.Conv2d(128, 256, 3, 2, 1) 30 | 31 | self.res1_conv1 = nn.Conv2d(256, 256, 3, 1, 1) 32 | self.res1_conv2 = nn.Conv2d(256, 256, 1, 1, 0) 33 | self.res1_conv3 = nn.Conv2d(256, 256, 3, 1, 1) 34 | 35 | self.res2_conv1 = nn.Conv2d(256, 512, 3, 1, 1) 36 | self.res2_conv2 = nn.Conv2d(512, 512, 1, 1, 0) 37 | self.res2_conv3 = nn.Conv2d(512, self.out_channels, 3, 1, 1) 38 | 39 | self.res2_skip = nn.Conv2d(256, self.out_channels, 1, 1, 0) 40 | 41 | def forward(self, x): 42 | x = F.relu(self.conv1(x)) 43 | x = F.relu(self.conv2(x)) 44 | x = F.relu(self.conv3(x)) 45 | res = F.relu(self.conv4(x)) 46 | 47 | x = F.relu(self.res1_conv1(res)) 48 | x = F.relu(self.res1_conv2(x)) 49 | x = F.relu(self.res1_conv3(x)) 50 | 51 | res = res + x 52 | 53 | x = F.relu(self.res2_conv1(res)) 54 | x = F.relu(self.res2_conv2(x)) 55 | x = F.relu(self.res2_conv3(x)) 56 | 57 | x = self.res2_skip(res) + x 58 | 59 | return x 60 | 61 | 62 | class Head(nn.Module): 63 | """ 64 | MLP network predicting per-pixel scene coordinates given a feature vector. All layers are 1x1 convolutions. 65 | """ 66 | 67 | def __init__(self, 68 | mean, 69 | num_head_blocks, 70 | use_homogeneous, 71 | homogeneous_min_scale=0.01, 72 | homogeneous_max_scale=4.0, 73 | in_channels=512): 74 | super(Head, self).__init__() 75 | 76 | self.use_homogeneous = use_homogeneous 77 | self.in_channels = in_channels # Number of encoder features. 78 | self.head_channels = 512 # Hardcoded. 
79 | 80 | # We may need a skip layer if the number of features output by the encoder is different. 81 | self.head_skip = nn.Identity() if self.in_channels == self.head_channels else nn.Conv2d(self.in_channels, 82 | self.head_channels, 1, 83 | 1, 0) 84 | 85 | self.res3_conv1 = nn.Conv2d(self.in_channels, self.head_channels, 1, 1, 0) 86 | self.res3_conv2 = nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0) 87 | self.res3_conv3 = nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0) 88 | 89 | self.res_blocks = [] 90 | 91 | for block in range(num_head_blocks): 92 | self.res_blocks.append(( 93 | nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0), 94 | nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0), 95 | nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0), 96 | )) 97 | 98 | super(Head, self).add_module(str(block) + 'c0', self.res_blocks[block][0]) 99 | super(Head, self).add_module(str(block) + 'c1', self.res_blocks[block][1]) 100 | super(Head, self).add_module(str(block) + 'c2', self.res_blocks[block][2]) 101 | 102 | self.fc1 = nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0) 103 | self.fc2 = nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0) 104 | 105 | if self.use_homogeneous: 106 | self.fc3 = nn.Conv2d(self.head_channels, 4, 1, 1, 0) 107 | 108 | # Use buffers because they need to be saved in the state dict. 109 | self.register_buffer("max_scale", torch.tensor([homogeneous_max_scale])) 110 | self.register_buffer("min_scale", torch.tensor([homogeneous_min_scale])) 111 | self.register_buffer("max_inv_scale", 1. / self.max_scale) 112 | self.register_buffer("h_beta", math.log(2) / (1. - self.max_inv_scale)) 113 | self.register_buffer("min_inv_scale", 1. / self.min_scale) 114 | else: 115 | self.fc3 = nn.Conv2d(self.head_channels, 3, 1, 1, 0) 116 | 117 | # Learn scene coordinates relative to a mean coordinate (e.g. center of the scene). 118 | self.register_buffer("mean", mean.clone().detach().view(1, 3, 1, 1)) 119 | 120 | def forward(self, res): 121 | 122 | x = F.relu(self.res3_conv1(res)) 123 | x = F.relu(self.res3_conv2(x)) 124 | x = F.relu(self.res3_conv3(x)) 125 | 126 | res = self.head_skip(res) + x 127 | 128 | for res_block in self.res_blocks: 129 | x = F.relu(res_block[0](res)) 130 | x = F.relu(res_block[1](x)) 131 | x = F.relu(res_block[2](x)) 132 | 133 | res = res + x 134 | 135 | sc = F.relu(self.fc1(res)) 136 | sc = F.relu(self.fc2(sc)) 137 | sc = self.fc3(sc) 138 | 139 | if self.use_homogeneous: 140 | # Dehomogenize coords: 141 | # Softplus ensures we have a smooth homogeneous parameter with a minimum value = self.max_inv_scale. 142 | h_slice = F.softplus(sc[:, 3, :, :].unsqueeze(1), beta=self.h_beta.item()) + self.max_inv_scale 143 | h_slice.clamp_(max=self.min_inv_scale) 144 | sc = sc[:, :3] / h_slice 145 | 146 | # Add the mean to the predicted coordinates. 147 | sc += self.mean 148 | 149 | return sc 150 | 151 | 152 | class Regressor(nn.Module): 153 | """ 154 | FCN architecture for scene coordinate regression. 155 | 156 | The network predicts a 3d scene coordinates, the output is subsampled by a factor of 8 compared to the input. 157 | """ 158 | 159 | OUTPUT_SUBSAMPLE = 8 160 | 161 | def __init__(self, mean, num_head_blocks, use_homogeneous, num_encoder_features=512): 162 | """ 163 | Constructor. 164 | 165 | mean: Learn scene coordinates relative to a mean coordinate (e.g. the center of the scene). 166 | num_head_blocks: How many extra residual blocks to use in the head (one is always used). 
167 | use_homogeneous: Whether to learn homogeneous or 3D coordinates. 168 | num_encoder_features: Number of channels output of the encoder network. 169 | """ 170 | super(Regressor, self).__init__() 171 | 172 | self.feature_dim = num_encoder_features 173 | 174 | self.encoder = Encoder(out_channels=self.feature_dim) 175 | self.heads = Head(mean, num_head_blocks, use_homogeneous, in_channels=self.feature_dim) 176 | 177 | @classmethod 178 | def create_from_encoder(cls, encoder_state_dict, mean, num_head_blocks, use_homogeneous): 179 | """ 180 | Create a regressor using a pretrained encoder, loading encoder-specific parameters from the state dict. 181 | 182 | encoder_state_dict: pretrained encoder state dictionary. 183 | mean: Learn scene coordinates relative to a mean coordinate (e.g. the center of the scene). 184 | num_head_blocks: How many extra residual blocks to use in the head (one is always used). 185 | use_homogeneous: Whether to learn homogeneous or 3D coordinates. 186 | """ 187 | 188 | # Number of output channels of the last encoder layer. 189 | num_encoder_features = encoder_state_dict['res2_conv3.weight'].shape[0] 190 | 191 | # Create a regressor. 192 | _logger.info(f"Creating Regressor using pretrained encoder with {num_encoder_features} feature size.") 193 | regressor = cls(mean, num_head_blocks, use_homogeneous, num_encoder_features) 194 | 195 | # Load encoder weights. 196 | regressor.encoder.load_state_dict(encoder_state_dict) 197 | 198 | # Done. 199 | return regressor 200 | 201 | @classmethod 202 | def create_from_state_dict(cls, state_dict): 203 | """ 204 | Instantiate a regressor from a pretrained state dictionary. 205 | 206 | state_dict: pretrained state dictionary. 207 | """ 208 | # Mean is zero (will be loaded from the state dict). 209 | mean = torch.zeros((3,)) 210 | 211 | # Count how many head blocks are in the dictionary. 212 | pattern = re.compile(r"^heads\.\d+c0\.weight$") 213 | num_head_blocks = sum(1 for k in state_dict.keys() if pattern.match(k)) 214 | 215 | # Whether the network uses homogeneous coordinates. 216 | use_homogeneous = state_dict["heads.fc3.weight"].shape[0] == 4 217 | 218 | # Number of output channels of the last encoder layer. 219 | num_encoder_features = state_dict['encoder.res2_conv3.weight'].shape[0] 220 | 221 | # Create a regressor. 222 | _logger.info(f"Creating regressor from pretrained state_dict:" 223 | f"\n\tNum head blocks: {num_head_blocks}" 224 | f"\n\tHomogeneous coordinates: {use_homogeneous}" 225 | f"\n\tEncoder feature size: {num_encoder_features}") 226 | regressor = cls(mean, num_head_blocks, use_homogeneous, num_encoder_features) 227 | 228 | # Load all weights. 229 | regressor.load_state_dict(state_dict) 230 | 231 | # Done. 232 | return regressor 233 | 234 | @classmethod 235 | def create_from_split_state_dict(cls, encoder_state_dict, head_state_dict): 236 | """ 237 | Instantiate a regressor from a pretrained encoder (scene-agnostic) and a scene-specific head. 238 | 239 | encoder_state_dict: encoder state dictionary 240 | head_state_dict: scene-specific head state dictionary 241 | """ 242 | # We simply merge the dictionaries and call the other constructor. 
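        # For example, an encoder key "res2_conv3.weight" becomes "encoder.res2_conv3.weight"
        # and a head key "fc3.weight" becomes "heads.fc3.weight", matching the "encoder" and
        # "heads" attribute names of the Regressor so that create_from_state_dict can load them.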
243 | merged_state_dict = {} 244 | 245 | for k, v in encoder_state_dict.items(): 246 | merged_state_dict[f"encoder.{k}"] = v 247 | 248 | for k, v in head_state_dict.items(): 249 | merged_state_dict[f"heads.{k}"] = v 250 | 251 | return cls.create_from_state_dict(merged_state_dict) 252 | 253 | def load_encoder(self, encoder_dict_file): 254 | """ 255 | Load weights into the encoder network. 256 | """ 257 | self.encoder.load_state_dict(torch.load(encoder_dict_file)) 258 | 259 | def get_features(self, inputs): 260 | return self.encoder(inputs) 261 | 262 | def get_scene_coordinates(self, features): 263 | return self.heads(features) 264 | 265 | def forward(self, inputs): 266 | """ 267 | Forward pass. 268 | """ 269 | features = self.get_features(inputs) 270 | return self.get_scene_coordinates(features) 271 | -------------------------------------------------------------------------------- /ace_trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright © Niantic, Inc. 2022. 2 | 3 | import logging 4 | import random 5 | import time 6 | 7 | import numpy as np 8 | import torch 9 | import torch.optim as optim 10 | import torchvision.transforms.functional as TF 11 | from torch.cuda.amp import GradScaler, autocast 12 | from torch.utils.data import DataLoader 13 | from torch.utils.data import sampler 14 | 15 | from ace_util import get_pixel_grid, to_homogeneous 16 | from ace_loss import ReproLoss 17 | from ace_network import Regressor 18 | from dataset import CamLocDataset 19 | 20 | import ace_vis_util as vutil 21 | from ace_visualizer import ACEVisualizer 22 | 23 | _logger = logging.getLogger(__name__) 24 | 25 | 26 | def set_seed(seed): 27 | """ 28 | Seed all sources of randomness. 29 | """ 30 | torch.manual_seed(seed) 31 | np.random.seed(seed) 32 | random.seed(seed) 33 | 34 | class TrainerACE: 35 | def __init__(self, options): 36 | self.options = options 37 | 38 | self.device = torch.device('cuda') 39 | 40 | # The flag below controls whether to allow TF32 on matmul. This flag defaults to True. 41 | # torch.backends.cuda.matmul.allow_tf32 = False 42 | 43 | # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True. 44 | # torch.backends.cudnn.allow_tf32 = False 45 | 46 | # Setup randomness for reproducibility. 47 | self.base_seed = 2089 48 | set_seed(self.base_seed) 49 | 50 | # Used to generate batch indices. 51 | self.batch_generator = torch.Generator() 52 | self.batch_generator.manual_seed(self.base_seed + 1023) 53 | 54 | # Dataloader generator, used to seed individual workers by the dataloader. 55 | self.loader_generator = torch.Generator() 56 | self.loader_generator.manual_seed(self.base_seed + 511) 57 | 58 | # Generator used to sample random features (runs on the GPU). 59 | self.sampling_generator = torch.Generator(device=self.device) 60 | self.sampling_generator.manual_seed(self.base_seed + 4095) 61 | 62 | # Generator used to permute the feature indices during each training epoch. 63 | self.training_generator = torch.Generator() 64 | self.training_generator.manual_seed(self.base_seed + 8191) 65 | 66 | self.iteration = 0 67 | self.training_start = None 68 | self.num_data_loader_workers = 12 69 | 70 | # Create dataset. 71 | self.dataset = CamLocDataset( 72 | root_dir=self.options.scene / "train", 73 | mode=0, # Default for ACE, we don't need scene coordinates/RGB-D. 
74 | use_half=self.options.use_half, 75 | image_height=self.options.image_resolution, 76 | augment=self.options.use_aug, 77 | aug_rotation=self.options.aug_rotation, 78 | aug_scale_max=self.options.aug_scale, 79 | aug_scale_min=1 / self.options.aug_scale, 80 | num_clusters=self.options.num_clusters, # Optional clustering for Cambridge experiments. 81 | cluster_idx=self.options.cluster_idx, # Optional clustering for Cambridge experiments. 82 | ) 83 | 84 | _logger.info("Loaded training scan from: {} -- {} images, mean: {:.2f} {:.2f} {:.2f}".format( 85 | self.options.scene, 86 | len(self.dataset), 87 | self.dataset.mean_cam_center[0], 88 | self.dataset.mean_cam_center[1], 89 | self.dataset.mean_cam_center[2])) 90 | 91 | # Create network using the state dict of the pretrained encoder. 92 | encoder_state_dict = torch.load(self.options.encoder_path, map_location="cpu") 93 | self.regressor = Regressor.create_from_encoder( 94 | encoder_state_dict, 95 | mean=self.dataset.mean_cam_center, 96 | num_head_blocks=self.options.num_head_blocks, 97 | use_homogeneous=self.options.use_homogeneous 98 | ) 99 | _logger.info(f"Loaded pretrained encoder from: {self.options.encoder_path}") 100 | 101 | self.regressor = self.regressor.to(self.device) 102 | self.regressor.train() 103 | 104 | # Setup optimization parameters. 105 | self.optimizer = optim.AdamW(self.regressor.parameters(), lr=self.options.learning_rate_min) 106 | 107 | # Setup learning rate scheduler. 108 | steps_per_epoch = self.options.training_buffer_size // self.options.batch_size 109 | self.scheduler = optim.lr_scheduler.OneCycleLR(self.optimizer, 110 | max_lr=self.options.learning_rate_max, 111 | epochs=self.options.epochs, 112 | steps_per_epoch=steps_per_epoch, 113 | cycle_momentum=False) 114 | 115 | # Gradient scaler in case we train with half precision. 116 | self.scaler = GradScaler(enabled=self.options.use_half) 117 | 118 | # Generate grid of target reprojection pixel positions. 119 | pixel_grid_2HW = get_pixel_grid(self.regressor.OUTPUT_SUBSAMPLE) 120 | self.pixel_grid_2HW = pixel_grid_2HW.to(self.device) 121 | 122 | # Compute total number of iterations. 123 | self.iterations = self.options.epochs * self.options.training_buffer_size // self.options.batch_size 124 | self.iterations_output = 100 # print loss every n iterations, and (optionally) write a visualisation frame 125 | 126 | # Setup reprojection loss function. 127 | self.repro_loss = ReproLoss( 128 | total_iterations=self.iterations, 129 | soft_clamp=self.options.repro_loss_soft_clamp, 130 | soft_clamp_min=self.options.repro_loss_soft_clamp_min, 131 | type=self.options.repro_loss_type, 132 | circle_schedule=(self.options.repro_loss_schedule == 'circle') 133 | ) 134 | 135 | # Will be filled at the beginning of the training process. 136 | self.training_buffer = None 137 | 138 | # Generate video of training process 139 | if self.options.render_visualization: 140 | # infer rendering folder from map file name 141 | target_path = vutil.get_rendering_target_path( 142 | self.options.render_target_path, 143 | self.options.output_map_file) 144 | self.ace_visualizer = ACEVisualizer( 145 | target_path, 146 | self.options.render_flipped_portrait, 147 | self.options.render_map_depth_filter, 148 | mapping_vis_error_threshold=self.options.render_map_error_threshold) 149 | else: 150 | self.ace_visualizer = None 151 | 152 | def train(self): 153 | """ 154 | Main training method. 155 | 156 | Fills a feature buffer using the pretrained encoder and subsequently trains a scene coordinate regression head. 
157 | """ 158 | 159 | if self.ace_visualizer is not None: 160 | 161 | # Setup the ACE render pipeline. 162 | self.ace_visualizer.setup_mapping_visualisation( 163 | self.dataset.pose_files, 164 | self.dataset.rgb_files, 165 | self.iterations // self.iterations_output + 1, 166 | self.options.render_camera_z_offset 167 | ) 168 | 169 | creating_buffer_time = 0. 170 | training_time = 0. 171 | 172 | self.training_start = time.time() 173 | 174 | # Create training buffer. 175 | buffer_start_time = time.time() 176 | self.create_training_buffer() 177 | buffer_end_time = time.time() 178 | creating_buffer_time += buffer_end_time - buffer_start_time 179 | _logger.info(f"Filled training buffer in {buffer_end_time - buffer_start_time:.1f}s.") 180 | 181 | # Train the regression head. 182 | for self.epoch in range(self.options.epochs): 183 | epoch_start_time = time.time() 184 | self.run_epoch() 185 | training_time += time.time() - epoch_start_time 186 | 187 | # Save trained model. 188 | self.save_model() 189 | 190 | end_time = time.time() 191 | _logger.info(f'Done without errors. ' 192 | f'Creating buffer time: {creating_buffer_time:.1f} seconds. ' 193 | f'Training time: {training_time:.1f} seconds. ' 194 | f'Total time: {end_time - self.training_start:.1f} seconds.') 195 | 196 | if self.ace_visualizer is not None: 197 | 198 | # Finalize the rendering by animating the fully trained map. 199 | vis_dataset = CamLocDataset( 200 | root_dir=self.options.scene / "train", 201 | mode=0, 202 | use_half=self.options.use_half, 203 | image_height=self.options.image_resolution, 204 | augment=False) # No data augmentation when visualizing the map 205 | 206 | vis_dataset_loader = torch.utils.data.DataLoader( 207 | vis_dataset, 208 | shuffle=False, # Process data in order for a growing effect later when rendering 209 | num_workers=self.num_data_loader_workers) 210 | 211 | self.ace_visualizer.finalize_mapping(self.regressor, vis_dataset_loader) 212 | 213 | def create_training_buffer(self): 214 | # Disable benchmarking, since we have variable tensor sizes. 215 | torch.backends.cudnn.benchmark = False 216 | 217 | # Sampler. 218 | batch_sampler = sampler.BatchSampler(sampler.RandomSampler(self.dataset, generator=self.batch_generator), 219 | batch_size=1, 220 | drop_last=False) 221 | 222 | # Used to seed workers in a reproducible manner. 223 | def seed_worker(worker_id): 224 | # Different seed per epoch. Initial seed is generated by the main process consuming one random number from 225 | # the dataloader generator. 226 | worker_seed = torch.initial_seed() % 2 ** 32 227 | np.random.seed(worker_seed) 228 | random.seed(worker_seed) 229 | 230 | # Batching is handled at the dataset level (the dataset __getitem__ receives a list of indices, because we 231 | # need to rescale all images in the batch to the same size). 232 | training_dataloader = DataLoader(dataset=self.dataset, 233 | sampler=batch_sampler, 234 | batch_size=None, 235 | worker_init_fn=seed_worker, 236 | generator=self.loader_generator, 237 | pin_memory=True, 238 | num_workers=self.num_data_loader_workers, 239 | persistent_workers=self.num_data_loader_workers > 0, 240 | timeout=60 if self.num_data_loader_workers > 0 else 0, 241 | ) 242 | 243 | _logger.info("Starting creation of the training buffer.") 244 | 245 | # Create a training buffer that lives on the GPU. 
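        # Each buffer entry below holds one row per sampled feature: the encoder feature itself
        # (stored as float16 when use_half is enabled), the 2D pixel position it was sampled at,
        # the inverse ground-truth pose of its source image, and that image's intrinsics together
        # with their inverse.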
246 | self.training_buffer = { 247 | 'features': torch.empty((self.options.training_buffer_size, self.regressor.feature_dim), 248 | dtype=(torch.float32, torch.float16)[self.options.use_half], device=self.device), 249 | 'target_px': torch.empty((self.options.training_buffer_size, 2), dtype=torch.float32, device=self.device), 250 | 'gt_poses_inv': torch.empty((self.options.training_buffer_size, 3, 4), dtype=torch.float32, 251 | device=self.device), 252 | 'intrinsics': torch.empty((self.options.training_buffer_size, 3, 3), dtype=torch.float32, 253 | device=self.device), 254 | 'intrinsics_inv': torch.empty((self.options.training_buffer_size, 3, 3), dtype=torch.float32, 255 | device=self.device) 256 | } 257 | 258 | # Features are computed in evaluation mode. 259 | self.regressor.eval() 260 | 261 | # The encoder is pretrained, so we don't compute any gradient. 262 | with torch.no_grad(): 263 | # Iterate until the training buffer is full. 264 | buffer_idx = 0 265 | dataset_passes = 0 266 | 267 | while buffer_idx < self.options.training_buffer_size: 268 | dataset_passes += 1 269 | for image_B1HW, image_mask_B1HW, gt_pose_B44, gt_pose_inv_B44, intrinsics_B33, intrinsics_inv_B33, _, _ in training_dataloader: 270 | 271 | # Copy to device. 272 | image_B1HW = image_B1HW.to(self.device, non_blocking=True) 273 | image_mask_B1HW = image_mask_B1HW.to(self.device, non_blocking=True) 274 | gt_pose_inv_B44 = gt_pose_inv_B44.to(self.device, non_blocking=True) 275 | intrinsics_B33 = intrinsics_B33.to(self.device, non_blocking=True) 276 | intrinsics_inv_B33 = intrinsics_inv_B33.to(self.device, non_blocking=True) 277 | 278 | # Compute image features. 279 | with autocast(enabled=self.options.use_half): 280 | features_BCHW = self.regressor.get_features(image_B1HW) 281 | 282 | # Dimensions after the network's downsampling. 283 | B, C, H, W = features_BCHW.shape 284 | 285 | # The image_mask needs to be downsampled to the actual output resolution and cast to bool. 286 | image_mask_B1HW = TF.resize(image_mask_B1HW, [H, W], interpolation=TF.InterpolationMode.NEAREST) 287 | image_mask_B1HW = image_mask_B1HW.bool() 288 | 289 | # If the current mask has no valid pixels, continue. 290 | if image_mask_B1HW.sum() == 0: 291 | continue 292 | 293 | # Create a tensor with the pixel coordinates of every feature vector. 294 | pixel_positions_B2HW = self.pixel_grid_2HW[:, :H, :W].clone() # It's 2xHxW (actual H and W) now. 295 | pixel_positions_B2HW = pixel_positions_B2HW[None] # 1x2xHxW 296 | pixel_positions_B2HW = pixel_positions_B2HW.expand(B, 2, H, W) # Bx2xHxW 297 | 298 | # Bx3x4 -> Nx3x4 (for each image, repeat pose per feature) 299 | gt_pose_inv = gt_pose_inv_B44[:, :3] 300 | gt_pose_inv = gt_pose_inv.unsqueeze(1).expand(B, H * W, 3, 4).reshape(-1, 3, 4) 301 | 302 | # Bx3x3 -> Nx3x3 (for each image, repeat intrinsics per feature) 303 | intrinsics = intrinsics_B33.unsqueeze(1).expand(B, H * W, 3, 3).reshape(-1, 3, 3) 304 | intrinsics_inv = intrinsics_inv_B33.unsqueeze(1).expand(B, H * W, 3, 3).reshape(-1, 3, 3) 305 | 306 | def normalize_shape(tensor_in): 307 | """Bring tensor from shape BxCxHxW to NxC""" 308 | return tensor_in.transpose(0, 1).flatten(1).transpose(0, 1) 309 | 310 | batch_data = { 311 | 'features': normalize_shape(features_BCHW), 312 | 'target_px': normalize_shape(pixel_positions_B2HW), 313 | 'gt_poses_inv': gt_pose_inv, 314 | 'intrinsics': intrinsics, 315 | 'intrinsics_inv': intrinsics_inv 316 | } 317 | 318 | # Turn image mask into sampling weights (all equal). 
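                    # torch.multinomial below treats these values as unnormalised sampling weights:
                    # masked-out pixels (weight 0) are never drawn, while all valid pixels (weight 1)
                    # are equally likely to be drawn, with replacement.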
319 | image_mask_B1HW = image_mask_B1HW.float() 320 | image_mask_N1 = normalize_shape(image_mask_B1HW) 321 | 322 | # Over-sample according to image mask. 323 | features_to_select = self.options.samples_per_image * B 324 | features_to_select = min(features_to_select, self.options.training_buffer_size - buffer_idx) 325 | 326 | # Sample indices uniformly, with replacement. 327 | sample_idxs = torch.multinomial(image_mask_N1.view(-1), 328 | features_to_select, 329 | replacement=True, 330 | generator=self.sampling_generator) 331 | 332 | # Select the data to put in the buffer. 333 | for k in batch_data: 334 | batch_data[k] = batch_data[k][sample_idxs] 335 | 336 | # Write to training buffer. Start at buffer_idx and end at buffer_offset - 1. 337 | buffer_offset = buffer_idx + features_to_select 338 | for k in batch_data: 339 | self.training_buffer[k][buffer_idx:buffer_offset] = batch_data[k] 340 | 341 | buffer_idx = buffer_offset 342 | if buffer_idx >= self.options.training_buffer_size: 343 | break 344 | 345 | buffer_memory = sum([v.element_size() * v.nelement() for k, v in self.training_buffer.items()]) 346 | buffer_memory /= 1024 * 1024 * 1024 347 | 348 | _logger.info(f"Created buffer of {buffer_memory:.2f}GB with {dataset_passes} passes over the training data.") 349 | self.regressor.train() 350 | 351 | def run_epoch(self): 352 | """ 353 | Run one epoch of training, shuffling the feature buffer and iterating over it. 354 | """ 355 | # Enable benchmarking since all operations work on the same tensor size. 356 | torch.backends.cudnn.benchmark = True 357 | 358 | # Shuffle indices. 359 | random_indices = torch.randperm(self.options.training_buffer_size, generator=self.training_generator) 360 | 361 | # Iterate with mini batches. 362 | for batch_start in range(0, self.options.training_buffer_size, self.options.batch_size): 363 | batch_end = batch_start + self.options.batch_size 364 | 365 | # Drop last batch if not full. 366 | if batch_end > self.options.training_buffer_size: 367 | continue 368 | 369 | # Sample indices. 370 | random_batch_indices = random_indices[batch_start:batch_end] 371 | 372 | # Call the training step with the sampled features and relevant metadata. 373 | self.training_step( 374 | self.training_buffer['features'][random_batch_indices].contiguous(), 375 | self.training_buffer['target_px'][random_batch_indices].contiguous(), 376 | self.training_buffer['gt_poses_inv'][random_batch_indices].contiguous(), 377 | self.training_buffer['intrinsics'][random_batch_indices].contiguous(), 378 | self.training_buffer['intrinsics_inv'][random_batch_indices].contiguous() 379 | ) 380 | self.iteration += 1 381 | 382 | def training_step(self, features_bC, target_px_b2, gt_inv_poses_b34, Ks_b33, invKs_b33): 383 | """ 384 | Run one iteration of training, computing the reprojection error and minimising it. 385 | """ 386 | batch_size = features_bC.shape[0] 387 | channels = features_bC.shape[1] 388 | 389 | # Reshape to a "fake" BCHW shape, since it's faster to run through the network compared to the original shape. 390 | features_bCHW = features_bC[None, None, ...].view(-1, 16, 32, channels).permute(0, 3, 1, 2) 391 | with autocast(enabled=self.options.use_half): 392 | pred_scene_coords_b3HW = self.regressor.get_scene_coordinates(features_bCHW) 393 | 394 | # Back to the original shape. Convert to float32 as well. 395 | pred_scene_coords_b31 = pred_scene_coords_b3HW.permute(0, 2, 3, 1).flatten(0, 2).unsqueeze(-1).float() 396 | 397 | # Make 3D points homogeneous so that we can easily matrix-multiply them. 
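        # Shape convention: the suffix encodes the tensor dimensions, e.g. pred_scene_coords_b31
        # is (batch, 3, 1). to_homogeneous appends a row of ones, giving (batch, 4, 1), so the
        # batched matrix product with the (batch, 3, 4) inverse poses yields camera-space points.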
398 | pred_scene_coords_b41 = to_homogeneous(pred_scene_coords_b31) 399 | 400 | # Scene coordinates to camera coordinates. 401 | pred_cam_coords_b31 = torch.bmm(gt_inv_poses_b34, pred_scene_coords_b41) 402 | 403 | # Project scene coordinates. 404 | pred_px_b31 = torch.bmm(Ks_b33, pred_cam_coords_b31) 405 | 406 | # Avoid division by zero. 407 | # Note: negative values are also clamped at +self.options.depth_min. The predicted pixel would be wrong, 408 | # but that's fine since we mask them out later. 409 | pred_px_b31[:, 2].clamp_(min=self.options.depth_min) 410 | 411 | # Dehomogenise. 412 | pred_px_b21 = pred_px_b31[:, :2] / pred_px_b31[:, 2, None] 413 | 414 | # Measure reprojection error. 415 | reprojection_error_b2 = pred_px_b21.squeeze() - target_px_b2 416 | reprojection_error_b1 = torch.norm(reprojection_error_b2, dim=1, keepdim=True, p=1) 417 | 418 | # 419 | # Compute masks used to ignore invalid pixels. 420 | # 421 | # Predicted coordinates behind or close to camera plane. 422 | invalid_min_depth_b1 = pred_cam_coords_b31[:, 2] < self.options.depth_min 423 | # Very large reprojection errors. 424 | invalid_repro_b1 = reprojection_error_b1 > self.options.repro_loss_hard_clamp 425 | # Predicted coordinates beyond max distance. 426 | invalid_max_depth_b1 = pred_cam_coords_b31[:, 2] > self.options.depth_max 427 | 428 | # Invalid mask is the union of all these. Valid mask is the opposite. 429 | invalid_mask_b1 = (invalid_min_depth_b1 | invalid_repro_b1 | invalid_max_depth_b1) 430 | valid_mask_b1 = ~invalid_mask_b1 431 | 432 | # Reprojection error for all valid scene coordinates. 433 | valid_reprojection_error_b1 = reprojection_error_b1[valid_mask_b1] 434 | # Compute the loss for valid predictions. 435 | loss_valid = self.repro_loss.compute(valid_reprojection_error_b1, self.iteration) 436 | 437 | # Handle the invalid predictions: generate proxy coordinate targets with constant depth assumption. 438 | pixel_grid_crop_b31 = to_homogeneous(target_px_b2.unsqueeze(2)) 439 | target_camera_coords_b31 = self.options.depth_target * torch.bmm(invKs_b33, pixel_grid_crop_b31) 440 | 441 | # Compute the distance to target camera coordinates. 442 | invalid_mask_b11 = invalid_mask_b1.unsqueeze(2) 443 | loss_invalid = torch.abs(target_camera_coords_b31 - pred_cam_coords_b31).masked_select(invalid_mask_b11).sum() 444 | 445 | # Final loss is the sum of all 2. 446 | loss = loss_valid + loss_invalid 447 | loss /= batch_size 448 | 449 | # We need to check if the step actually happened, since the scaler might skip optimisation steps. 450 | old_optimizer_step = self.optimizer._step_count 451 | 452 | # Optimization steps. 453 | self.optimizer.zero_grad(set_to_none=True) 454 | self.scaler.scale(loss).backward() 455 | self.scaler.step(self.optimizer) 456 | self.scaler.update() 457 | 458 | if self.iteration % self.iterations_output == 0: 459 | # Print status. 
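# [Editor's sketch - not part of ace_trainer.py] The loss above projects each predicted
# scene coordinate with the ground-truth inverse pose and the intrinsics, and only applies
# the reprojection loss where the point lies in front of the camera, not too far away, and
# is not an extreme reprojection outlier. Single-point version of that test; the threshold
# values are illustrative stand-ins for the depth_min/depth_max/repro_loss_hard_clamp options:
import torch

depth_min, depth_max, repro_clamp = 0.1, 1000.0, 1000.0

X_h = torch.tensor([1.0, 2.0, 5.0, 1.0])              # predicted scene point, homogeneous
T_inv = torch.eye(4)[:3]                              # 3x4 world-to-camera pose
K = torch.tensor([[500.0, 0.0, 320.0],
                  [0.0, 500.0, 240.0],
                  [0.0, 0.0, 1.0]])
target_px = torch.tensor([420.0, 440.0])              # pixel the feature came from

cam = T_inv @ X_h                                     # camera coordinates (3,)
px_h = K @ cam
px = px_h[:2] / px_h[2].clamp(min=depth_min)          # dehomogenise, avoid div-by-zero
repro_error = (px - target_px).abs().sum()            # L1 reprojection error in pixels
valid = (cam[2] > depth_min) & (cam[2] < depth_max) & (repro_error <= repro_clamp)
print(f"pixel={px.tolist()}, error={repro_error.item():.1f}px, valid={bool(valid)}")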
460 | time_since_start = time.time() - self.training_start 461 | fraction_valid = float(valid_mask_b1.sum() / batch_size) 462 | # median_depth = float(pred_cam_coords_b31[:, 2].median()) 463 | 464 | _logger.info(f'Iteration: {self.iteration:6d} / Epoch {self.epoch:03d}|{self.options.epochs:03d}, ' 465 | f'Loss: {loss:.1f}, Valid: {fraction_valid * 100:.1f}%, Time: {time_since_start:.2f}s') 466 | 467 | if self.ace_visualizer is not None: 468 | vis_scene_coords = pred_scene_coords_b31.detach().cpu().squeeze().numpy() 469 | vis_errors = reprojection_error_b1.detach().cpu().squeeze().numpy() 470 | self.ace_visualizer.render_mapping_frame(vis_scene_coords, vis_errors) 471 | 472 | # Only step if the optimizer stepped and if we're not over-stepping the total_steps supported by the scheduler. 473 | if old_optimizer_step < self.optimizer._step_count < self.scheduler.total_steps: 474 | self.scheduler.step() 475 | 476 | def save_model(self): 477 | # NOTE: This would save the whole regressor (encoder weights included) in full precision floats (~30MB). 478 | # torch.save(self.regressor.state_dict(), self.options.output_map_file) 479 | 480 | # This saves just the head weights as half-precision floating point numbers for a total of ~4MB, as mentioned 481 | # in the paper. The scene-agnostic encoder weights can then be loaded from the pretrained encoder file. 482 | head_state_dict = self.regressor.heads.state_dict() 483 | for k, v in head_state_dict.items(): 484 | head_state_dict[k] = head_state_dict[k].half() 485 | torch.save(head_state_dict, self.options.output_map_file) 486 | _logger.info(f"Saved trained head weights to: {self.options.output_map_file}") -------------------------------------------------------------------------------- /ace_util.py: -------------------------------------------------------------------------------- 1 | # Copyright © Niantic, Inc. 2022. 2 | 3 | import numpy as np 4 | import torch 5 | 6 | def get_pixel_grid(subsampling_factor): 7 | """ 8 | Generate target pixel positions according to a subsampling factor, assuming prediction at center pixel. 9 | """ 10 | pix_range = torch.arange(np.ceil(5000 / subsampling_factor), dtype=torch.float32) 11 | yy, xx = torch.meshgrid(pix_range, pix_range, indexing='ij') 12 | return subsampling_factor * (torch.stack([xx, yy]) + 0.5) 13 | 14 | def to_homogeneous(input_tensor, dim=1): 15 | """ 16 | Converts tensor to homogeneous coordinates by adding ones to the specified dimension 17 | """ 18 | ones = torch.ones_like(input_tensor.select(dim, 0).unsqueeze(dim)) 19 | output = torch.cat([input_tensor, ones], dim=dim) 20 | return output -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import random 4 | from pathlib import Path 5 | 6 | import cv2 7 | import numpy as np 8 | import torch 9 | import torch.nn.functional as F 10 | import torchvision.transforms.functional as TF 11 | from skimage import color 12 | from skimage import io 13 | from skimage.transform import rotate, resize 14 | from torch.utils.data import Dataset 15 | from torch.utils.data.dataloader import default_collate 16 | from torchvision import transforms 17 | 18 | from ace_network import Regressor 19 | 20 | _logger = logging.getLogger(__name__) 21 | 22 | 23 | class CamLocDataset(Dataset): 24 | """Camera localization dataset. 25 | 26 | Access to image, calibration and ground truth data given a dataset directory. 
27 | """ 28 | 29 | def __init__(self, 30 | root_dir, 31 | mode=0, 32 | sparse=False, 33 | augment=False, 34 | aug_rotation=15, 35 | aug_scale_min=2 / 3, 36 | aug_scale_max=3 / 2, 37 | aug_black_white=0.1, 38 | aug_color=0.3, 39 | image_height=480, 40 | use_half=True, 41 | num_clusters=None, 42 | cluster_idx=None, 43 | ): 44 | """Constructor. 45 | 46 | Parameters: 47 | root_dir: Folder of the data (training or test). 48 | mode: 49 | 0 = RGB only, load no initialization targets. Default for the ACE paper. 50 | 1 = RGB + ground truth scene coordinates, load or generate ground truth scene coordinate targets 51 | 2 = RGB-D, load camera coordinates instead of scene coordinates 52 | sparse: for mode = 1 (RGB+GT SC), load sparse initialization targets when True, load dense depth maps and 53 | generate initialization targets when False 54 | augment: Use random data augmentation, note: not supported for mode = 2 (RGB-D) since pre-generated eye 55 | coordinates cannot be augmented 56 | aug_rotation: Max 2D image rotation angle, sampled uniformly around 0, both directions, degrees. 57 | aug_scale_min: Lower limit of image scale factor for uniform sampling 58 | aug_scale_min: Upper limit of image scale factor for uniform sampling 59 | aug_black_white: Max relative scale factor for image brightness/contrast sampling, e.g. 0.1 -> [0.9,1.1] 60 | aug_color: Max relative scale factor for image saturation/hue sampling, e.g. 0.1 -> [0.9,1.1] 61 | image_height: RGB images are rescaled to this maximum height (if augmentation is disabled, and in the range 62 | [aug_scale_min * image_height, aug_scale_max * image_height] otherwise). 63 | use_half: Enabled if training with half-precision floats. 64 | num_clusters: split the input frames into disjoint clusters using hierarchical clustering in order to train 65 | an ensemble model. Clustering is deterministic, so multiple training calls with the same number of 66 | target clusters will result in the same split. See the paper for details of the approach. Disabled by 67 | default. 68 | cluster_idx: If num_clusters is not None, then use this parameter to choose the cluster used for training. 69 | """ 70 | 71 | self.use_half = use_half 72 | 73 | self.init = (mode == 1) 74 | self.sparse = sparse 75 | self.eye = (mode == 2) 76 | 77 | self.image_height = image_height 78 | 79 | self.augment = augment 80 | self.aug_rotation = aug_rotation 81 | self.aug_scale_min = aug_scale_min 82 | self.aug_scale_max = aug_scale_max 83 | self.aug_black_white = aug_black_white 84 | self.aug_color = aug_color 85 | 86 | self.num_clusters = num_clusters 87 | self.cluster_idx = cluster_idx 88 | 89 | if self.num_clusters is not None: 90 | if self.num_clusters < 1: 91 | raise ValueError("num_clusters must be at least 1") 92 | 93 | if self.cluster_idx is None: 94 | raise ValueError("cluster_idx needs to be specified when num_clusters is set") 95 | 96 | if self.cluster_idx < 0 or self.cluster_idx >= self.num_clusters: 97 | raise ValueError(f"cluster_idx needs to be between 0 and {self.num_clusters - 1}") 98 | 99 | if self.eye and self.augment and (self.aug_rotation > 0 or self.aug_scale_min != 1 or self.aug_scale_max != 1): 100 | # pre-generated eye coordinates cannot be augmented 101 | _logger.warning("WARNING: Check your augmentation settings. Camera coordinates will not be augmented.") 102 | 103 | # Setup data paths. 104 | root_dir = Path(root_dir) 105 | 106 | # Main folders. 
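# [Editor's sketch - not part of dataset.py] One plausible way to instantiate the dataset
# for ACE mapping: RGB-only mode (mode=0) with augmentation, matching the defaults
# documented in the constructor above. It requires a scene folder prepared by one of the
# datasets/setup_*.py scripts; the path below is a placeholder, and the real dataloader
# wiring lives in ace_trainer.py and may differ in detail.
from dataset import CamLocDataset

mapping_data = CamLocDataset(
    root_dir="datasets/7scenes_chess/train",   # placeholder scene folder
    mode=0,                                    # RGB only, no scene-coordinate targets
    augment=True,
    aug_rotation=15,
    aug_scale_min=2 / 3,
    aug_scale_max=3 / 2,
    use_half=True,
)
image, mask, pose, pose_inv, K, K_inv, _, rgb_file = mapping_data[0]
print(image.shape, K)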
107 | rgb_dir = root_dir / 'rgb' 108 | pose_dir = root_dir / 'poses' 109 | calibration_dir = root_dir / 'calibration' 110 | 111 | # Optional folders. Unused in ACE. 112 | if self.eye: 113 | coord_dir = root_dir / 'eye' 114 | elif self.sparse: 115 | coord_dir = root_dir / 'init' 116 | else: 117 | coord_dir = root_dir / 'depth' 118 | 119 | # Find all images. The assumption is that it only contains image files. 120 | self.rgb_files = sorted(rgb_dir.iterdir()) 121 | 122 | # Find all ground truth pose files. One per image. 123 | self.pose_files = sorted(pose_dir.iterdir()) 124 | 125 | # Load camera calibrations. One focal length per image. 126 | self.calibration_files = sorted(calibration_dir.iterdir()) 127 | 128 | if self.init or self.eye: 129 | # Load GT scene coordinates. 130 | self.coord_files = sorted(coord_dir.iterdir()) 131 | else: 132 | self.coord_files = None 133 | 134 | if len(self.rgb_files) != len(self.pose_files): 135 | raise RuntimeError('RGB file count does not match pose file count!') 136 | 137 | if len(self.rgb_files) != len(self.calibration_files): 138 | raise RuntimeError('RGB file count does not match calibration file count!') 139 | 140 | if self.coord_files and len(self.rgb_files) != len(self.coord_files): 141 | raise RuntimeError('RGB file count does not match coordinate file count!') 142 | 143 | # Create grid of 2D pixel positions used when generating scene coordinates from depth. 144 | if self.init and not self.sparse: 145 | self.prediction_grid = self._create_prediction_grid() 146 | else: 147 | self.prediction_grid = None 148 | 149 | # Image transformations. Excluding scale since that can vary batch-by-batch. 150 | if self.augment: 151 | self.image_transform = transforms.Compose([ 152 | # transforms.ToPILImage(), 153 | # transforms.Resize(int(self.image_height * scale_factor)), 154 | transforms.Grayscale(), 155 | transforms.ColorJitter(brightness=self.aug_black_white, contrast=self.aug_black_white), 156 | # saturation=self.aug_color, hue=self.aug_color), # Disable colour augmentation. 157 | transforms.ToTensor(), 158 | transforms.Normalize( 159 | mean=[0.4], # statistics calculated over 7scenes training set, should generalize fairly well 160 | std=[0.25] 161 | ), 162 | ]) 163 | else: 164 | self.image_transform = transforms.Compose([ 165 | # transforms.ToPILImage(), 166 | # transforms.Resize(self.image_height), 167 | transforms.Grayscale(), 168 | transforms.ToTensor(), 169 | transforms.Normalize( 170 | mean=[0.4], # statistics calculated over 7scenes training set, should generalize fairly well 171 | std=[0.25] 172 | ), 173 | ]) 174 | 175 | # We use this to iterate over all frames. If clustering is enabled this is used to filter them. 176 | self.valid_file_indices = np.arange(len(self.rgb_files)) 177 | 178 | # If clustering is enabled. 179 | if self.num_clusters is not None: 180 | _logger.info(f"Clustering the {len(self.rgb_files)} into {num_clusters} clusters.") 181 | _, _, cluster_labels = self._cluster(num_clusters) 182 | 183 | self.valid_file_indices = np.flatnonzero(cluster_labels == cluster_idx) 184 | _logger.info(f"After clustering, chosen cluster: {cluster_idx}, Using {len(self.valid_file_indices)} images.") 185 | 186 | # Calculate mean camera center (using the valid frames only). 187 | self.mean_cam_center = self._compute_mean_camera_center() 188 | 189 | @staticmethod 190 | def _create_prediction_grid(): 191 | # Assumes all input images have a resolution smaller than 5000x5000. 
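# [Editor's sketch - not part of dataset.py] The nested loop in _create_prediction_grid
# below simply tabulates, for every output cell (y, x), the corresponding input-pixel
# position: value = index * OUTPUT_SUBSAMPLE. A vectorised equivalent, with the network's
# subsampling factor replaced by a stand-in constant:
import math
import numpy as np

OUTPUT_SUBSAMPLE = 8                      # stand-in for Regressor.OUTPUT_SUBSAMPLE
size = math.ceil(5000 / OUTPUT_SUBSAMPLE)
xx, yy = np.meshgrid(np.arange(size), np.arange(size))   # xx varies along columns
grid = np.stack([xx, yy]).astype(np.float64) * OUTPUT_SUBSAMPLE
assert grid[0, 3, 7] == 7 * OUTPUT_SUBSAMPLE and grid[1, 3, 7] == 3 * OUTPUT_SUBSAMPLE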
192 | prediction_grid = np.zeros((2, 193 | math.ceil(5000 / Regressor.OUTPUT_SUBSAMPLE), 194 | math.ceil(5000 / Regressor.OUTPUT_SUBSAMPLE))) 195 | 196 | for x in range(0, prediction_grid.shape[2]): 197 | for y in range(0, prediction_grid.shape[1]): 198 | prediction_grid[0, y, x] = x * Regressor.OUTPUT_SUBSAMPLE 199 | prediction_grid[1, y, x] = y * Regressor.OUTPUT_SUBSAMPLE 200 | 201 | return prediction_grid 202 | 203 | @staticmethod 204 | def _resize_image(image, image_height): 205 | # Resize a numpy image as PIL. Works slightly better than resizing the tensor using torch's internal function. 206 | image = TF.to_pil_image(image) 207 | image = TF.resize(image, image_height) 208 | return image 209 | 210 | @staticmethod 211 | def _rotate_image(image, angle, order, mode='constant'): 212 | # Image is a torch tensor (CxHxW), convert it to numpy as HxWxC. 213 | image = image.permute(1, 2, 0).numpy() 214 | # Apply rotation. 215 | image = rotate(image, angle, order=order, mode=mode) 216 | # Back to torch tensor. 217 | image = torch.from_numpy(image).permute(2, 0, 1).float() 218 | return image 219 | 220 | def _cluster(self, num_clusters): 221 | """ 222 | Clusters the dataset using hierarchical kMeans. 223 | Initialization: 224 | Put all images in one cluster. 225 | Interate: 226 | Pick largest cluster. 227 | Split with kMeans and k=2. 228 | Input for kMeans is the 3D median scene coordiante per image. 229 | Terminate: 230 | When number of target clusters has been reached. 231 | Returns: 232 | cam_centers: For each cluster the mean (not median) scene coordinate 233 | labels: For each image the cluster ID 234 | """ 235 | num_images = len(self.pose_files) 236 | _logger.info(f'Clustering a dataset with {num_images} frames into {num_clusters} clusters.') 237 | 238 | # A tensor holding all camera centers used for clustering. 239 | cam_centers = np.zeros((num_images, 3), dtype=np.float32) 240 | for i in range(num_images): 241 | pose = self._load_pose(i) 242 | cam_centers[i] = pose[:3, 3] 243 | 244 | # Setup kMEans 245 | criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.1) 246 | flags = cv2.KMEANS_PP_CENTERS 247 | 248 | # Label of next cluster. 249 | label_counter = 0 250 | 251 | # Initialise list of clusters with all images. 252 | clusters = [] 253 | clusters.append((cam_centers, label_counter, np.zeros(3))) 254 | 255 | # All images belong to cluster 0. 256 | labels = np.zeros(num_images) 257 | 258 | # iterate kMeans with k=2 259 | while len(clusters) < num_clusters: 260 | # Select largest cluster (list is sorted). 261 | cur_cluster = clusters.pop(0) 262 | label_counter += 1 263 | 264 | # Split cluster. 265 | cur_error, cur_labels, cur_centroids = cv2.kmeans(cur_cluster[0], 2, None, criteria, 10, flags) 266 | 267 | # Update cluster list. 268 | cur_mask = (cur_labels == 0)[:, 0] 269 | cur_cam_centers0 = cur_cluster[0][cur_mask, :] 270 | clusters.append((cur_cam_centers0, cur_cluster[1], cur_centroids[0])) 271 | 272 | cur_mask = (cur_labels == 1)[:, 0] 273 | cur_cam_centers1 = cur_cluster[0][cur_mask, :] 274 | clusters.append((cur_cam_centers1, label_counter, cur_centroids[1])) 275 | 276 | cluster_labels = labels[labels == cur_cluster[1]] 277 | cluster_labels[cur_mask] = label_counter 278 | labels[labels == cur_cluster[1]] = cluster_labels 279 | 280 | # Sort updated list. 
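# [Editor's sketch - not part of dataset.py] Stripped-down version of the hierarchical
# clustering in _cluster() above, without the label bookkeeping: keep splitting the
# currently largest cluster with k=2 k-means until the requested number of clusters is
# reached. Toy camera centres, same OpenCV call as in the method:
import cv2
import numpy as np

rng = np.random.default_rng(0)
cam_centers = rng.normal(size=(200, 3)).astype(np.float32)   # toy camera centres
num_clusters = 4

criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.1)
clusters = [cam_centers]
while len(clusters) < num_clusters:
    clusters.sort(key=len, reverse=True)          # always split the largest cluster
    largest = clusters.pop(0)
    _, labels, _ = cv2.kmeans(largest, 2, None, criteria, 10, cv2.KMEANS_PP_CENTERS)
    labels = labels.ravel()
    clusters += [largest[labels == 0], largest[labels == 1]]

print([len(c) for c in clusters])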
281 | clusters = sorted(clusters, key=lambda cluster: cluster[0].shape[0], reverse=True) 282 | 283 | # clusters are sorted but cluster indices are random, remap cluster indices to sorted indices 284 | remapped_labels = np.zeros(num_images) 285 | remapped_clusters = [] 286 | 287 | for cluster_idx_new, cluster in enumerate(clusters): 288 | cluster_idx_old = cluster[1] 289 | remapped_labels[labels == cluster_idx_old] = cluster_idx_new 290 | remapped_clusters.append((cluster[0], cluster_idx_new, cluster[2])) 291 | 292 | labels = remapped_labels 293 | clusters = remapped_clusters 294 | 295 | cluster_centers = np.zeros((num_clusters, 3)) 296 | cluster_sizes = np.zeros((num_clusters, 1)) 297 | 298 | for cluster in clusters: 299 | # Compute distance of each cam to the center of the cluster. 300 | cam_num = cluster[0].shape[0] 301 | cam_data = np.zeros((cam_num, 3)) 302 | cam_count = 0 303 | 304 | # First compute the center of the cluster (mean). 305 | for i, cam_center in enumerate(cam_centers): 306 | if labels[i] == cluster[1]: 307 | cam_data[cam_count] = cam_center 308 | cam_count += 1 309 | 310 | cluster_centers[cluster[1]] = cam_data.mean(0) 311 | 312 | # Compute the distance of each cam from the cluster center. Then average and square. 313 | cam_dists = np.broadcast_to(cluster_centers[cluster[1]][np.newaxis, :], (cam_num, 3)) 314 | cam_dists = cam_data - cam_dists 315 | cam_dists = np.linalg.norm(cam_dists, axis=1) 316 | cam_dists = cam_dists ** 2 317 | 318 | cluster_sizes[cluster[1]] = cam_dists.mean() 319 | 320 | _logger.info("Cluster %i: %.1fm, %.1fm, %.1fm, images: %i, mean squared dist: %f" % ( 321 | cluster[1], cluster_centers[cluster[1]][0], cluster_centers[cluster[1]][1], cluster_centers[cluster[1]][2], 322 | cluster[0].shape[0], cluster_sizes[cluster[1]])) 323 | 324 | _logger.info('Clustering done.') 325 | 326 | return cluster_centers, cluster_sizes, labels 327 | 328 | def _compute_mean_camera_center(self): 329 | mean_cam_center = torch.zeros((3,)) 330 | 331 | for idx in self.valid_file_indices: 332 | pose = self._load_pose(idx) 333 | 334 | # Get the translation component. 335 | mean_cam_center += pose[0:3, 3] 336 | 337 | # Avg. 338 | mean_cam_center /= len(self) 339 | return mean_cam_center 340 | 341 | def _load_image(self, idx): 342 | image = io.imread(self.rgb_files[idx]) 343 | 344 | if len(image.shape) < 3: 345 | # Convert to RGB if needed. 346 | image = color.gray2rgb(image) 347 | 348 | return image 349 | 350 | def _load_pose(self, idx): 351 | # Stored as a 4x4 matrix. 352 | pose = np.loadtxt(self.pose_files[idx]) 353 | pose = torch.from_numpy(pose).float() 354 | 355 | return pose 356 | 357 | def _get_single_item(self, idx, image_height): 358 | # Apply index indirection. 359 | idx = self.valid_file_indices[idx] 360 | 361 | # Load image. 362 | image = self._load_image(idx) 363 | 364 | # Load intrinsics. 365 | k = np.loadtxt(self.calibration_files[idx]) 366 | if k.size == 1: 367 | focal_length = float(k) 368 | centre_point = None 369 | elif k.shape == (3, 3): 370 | k = k.tolist() 371 | focal_length = [k[0][0], k[1][1]] 372 | centre_point = [k[0][2], k[1][2]] 373 | else: 374 | raise Exception("Calibration file must contain either a 3x3 camera \ 375 | intrinsics matrix or a single float giving the focal length \ 376 | of the camera.") 377 | 378 | # The image will be scaled to image_height, adjust focal length as well. 
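# [Editor's sketch - not part of dataset.py] When an image is rescaled to image_height,
# the intrinsics must be rescaled by the same factor: focal length(s) and principal point
# all get multiplied by new_height / old_height. Standalone version of what the scaling
# below plus the intrinsics assembly at the end of _get_single_item computes, for a full
# 3x3 calibration matrix with illustrative numbers:
import torch

K_orig = torch.tensor([[1000.0, 0.0, 960.0],
                       [0.0, 1000.0, 540.0],
                       [0.0, 0.0, 1.0]])   # e.g. a 1920x1080 camera
old_h, new_h = 1080, 480
s = new_h / old_h

K_scaled = K_orig.clone()
K_scaled[:2] *= s        # scales fx, fy, cx, cy; last row stays [0, 0, 1]
print(K_scaled)          # fx = fy ~ 444.4, cx ~ 426.7, cy = 240.0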
379 | f_scale_factor = image_height / image.shape[0] 380 | if centre_point: 381 | centre_point = [c * f_scale_factor for c in centre_point] 382 | focal_length = [f * f_scale_factor for f in focal_length] 383 | else: 384 | focal_length *= f_scale_factor 385 | 386 | # Rescale image. 387 | image = self._resize_image(image, image_height) 388 | 389 | # Create mask of the same size as the resized image (it's a PIL image at this point). 390 | image_mask = torch.ones((1, image.size[1], image.size[0])) 391 | 392 | # Apply remaining transforms. 393 | image = self.image_transform(image) 394 | 395 | # Load pose. 396 | pose = self._load_pose(idx) 397 | 398 | # Load ground truth scene coordinates, if needed. 399 | if self.init: 400 | if self.sparse: 401 | coords = torch.load(self.coord_files[idx]) 402 | else: 403 | depth = io.imread(self.coord_files[idx]) 404 | depth = depth.astype(np.float64) 405 | depth /= 1000 # from millimeters to meters 406 | elif self.eye: 407 | coords = torch.load(self.coord_files[idx]) 408 | else: 409 | coords = 0 # Default for ACE, we don't need them. 410 | 411 | # Apply data augmentation if necessary. 412 | if self.augment: 413 | # Generate a random rotation angle. 414 | angle = random.uniform(-self.aug_rotation, self.aug_rotation) 415 | 416 | # Rotate input image and mask. 417 | image = self._rotate_image(image, angle, 1, 'reflect') 418 | image_mask = self._rotate_image(image_mask, angle, order=1, mode='constant') 419 | 420 | # If we loaded the GT scene coordinates. 421 | if self.init: 422 | if self.sparse: 423 | # rotate and scale initalization targets 424 | coords_w = math.ceil(image.size(2) / Regressor.OUTPUT_SUBSAMPLE) 425 | coords_h = math.ceil(image.size(1) / Regressor.OUTPUT_SUBSAMPLE) 426 | coords = F.interpolate(coords.unsqueeze(0), size=(coords_h, coords_w))[0] 427 | 428 | coords = self._rotate_image(coords, angle, 0) 429 | else: 430 | # rotate and scale depth maps 431 | depth = resize(depth, image.shape[1:], order=0) 432 | depth = rotate(depth, angle, order=0, mode='constant') 433 | 434 | # Rotate ground truth camera pose as well. 435 | angle = angle * math.pi / 180. 436 | # Create a rotation matrix. 437 | pose_rot = torch.eye(4) 438 | pose_rot[0, 0] = math.cos(angle) 439 | pose_rot[0, 1] = -math.sin(angle) 440 | pose_rot[1, 0] = math.sin(angle) 441 | pose_rot[1, 1] = math.cos(angle) 442 | 443 | # Apply rotation matrix to the ground truth camera pose. 444 | pose = torch.matmul(pose, pose_rot) 445 | 446 | # Not used for ACE. 
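# [Editor's sketch - not part of dataset.py] The augmentation above rotates the image
# in-plane by `angle` and compensates by right-multiplying the cam-to-world pose with a
# roll rotation about the optical axis. Tiny standalone check that the matrix built in the
# code is a proper rotation and that rotating by -angle undoes the update:
import math
import torch

def roll_rotation(angle_deg):
    a = math.radians(angle_deg)
    pose_rot = torch.eye(4)
    pose_rot[0, 0], pose_rot[0, 1] = math.cos(a), -math.sin(a)
    pose_rot[1, 0], pose_rot[1, 1] = math.sin(a), math.cos(a)
    return pose_rot

pose = torch.eye(4)
pose[:3, 3] = torch.tensor([1.0, 2.0, 3.0])          # toy cam-to-world pose
angle = 12.5
augmented = pose @ roll_rotation(angle)
restored = augmented @ roll_rotation(-angle)
assert torch.allclose(restored, pose, atol=1e-6)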
447 | if self.init and not self.sparse: 448 | # generate initialization targets from depth map 449 | offsetX = int(Regressor.OUTPUT_SUBSAMPLE / 2) 450 | offsetY = int(Regressor.OUTPUT_SUBSAMPLE / 2) 451 | 452 | coords = torch.zeros(( 453 | 3, 454 | math.ceil(image.shape[1] / Regressor.OUTPUT_SUBSAMPLE), 455 | math.ceil(image.shape[2] / Regressor.OUTPUT_SUBSAMPLE))) 456 | 457 | # subsample to network output size 458 | depth = depth[offsetY::Regressor.OUTPUT_SUBSAMPLE, offsetX::Regressor.OUTPUT_SUBSAMPLE] 459 | 460 | # construct x and y coordinates of camera coordinate 461 | xy = self.prediction_grid[:, :depth.shape[0], :depth.shape[1]].copy() 462 | # add random pixel shift 463 | xy[0] += offsetX 464 | xy[1] += offsetY 465 | # substract principal point (assume image center) 466 | xy[0] -= image.shape[2] / 2 467 | xy[1] -= image.shape[1] / 2 468 | # reproject 469 | xy /= focal_length 470 | xy[0] *= depth 471 | xy[1] *= depth 472 | 473 | # assemble camera coordinates tensor 474 | eye = np.ndarray((4, depth.shape[0], depth.shape[1])) 475 | eye[0:2] = xy 476 | eye[2] = depth 477 | eye[3] = 1 478 | 479 | # eye to scene coordinates 480 | sc = np.matmul(pose.numpy(), eye.reshape(4, -1)) 481 | sc = sc.reshape(4, depth.shape[0], depth.shape[1]) 482 | 483 | # mind pixels with invalid depth 484 | sc[:, depth == 0] = 0 485 | sc[:, depth > 1000] = 0 486 | sc = torch.from_numpy(sc[0:3]) 487 | 488 | coords[:, :sc.shape[1], :sc.shape[2]] = sc 489 | 490 | # Convert to half if needed. 491 | if self.use_half and torch.cuda.is_available(): 492 | image = image.half() 493 | 494 | # Binarize the mask. 495 | image_mask = image_mask > 0 496 | 497 | # Invert the pose. 498 | pose_inv = pose.inverse() 499 | 500 | # Create the intrinsics matrix. 501 | intrinsics = torch.eye(3) 502 | 503 | # Hardcode the principal point to the centre of the image unless otherwise specified. 504 | if centre_point: 505 | intrinsics[0, 0] = focal_length[0] 506 | intrinsics[1, 1] = focal_length[1] 507 | intrinsics[0, 2] = centre_point[0] 508 | intrinsics[1, 2] = centre_point[1] 509 | else: 510 | intrinsics[0, 0] = focal_length 511 | intrinsics[1, 1] = focal_length 512 | intrinsics[0, 2] = image.shape[2] / 2 513 | intrinsics[1, 2] = image.shape[1] / 2 514 | 515 | # Also need the inverse. 516 | intrinsics_inv = intrinsics.inverse() 517 | 518 | return image, image_mask, pose, pose_inv, intrinsics, intrinsics_inv, coords, str(self.rgb_files[idx]) 519 | 520 | def __len__(self): 521 | return len(self.valid_file_indices) 522 | 523 | def __getitem__(self, idx): 524 | if self.augment: 525 | scale_factor = random.uniform(self.aug_scale_min, self.aug_scale_max) 526 | # scale_factor = 1 / scale_factor #inverse scale sampling, not used for ACE mapping 527 | else: 528 | scale_factor = 1 529 | 530 | # Target image height. We compute it here in case we are asked for a full batch of tensors because we need 531 | # to apply the same scale factor to all of them. 532 | image_height = int(self.image_height * scale_factor) 533 | 534 | if type(idx) == list: 535 | # Whole batch. 536 | tensors = [self._get_single_item(i, image_height) for i in idx] 537 | return default_collate(tensors) 538 | else: 539 | # Single element. 
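# [Editor's sketch - not part of dataset.py] __getitem__ below accepts a *list* of indices
# and collates the batch itself, so that one random scale factor is shared by the whole
# batch (all tensors in a batch must end up the same size). One plausible way to drive
# that is to hand the DataLoader batched indices via a BatchSampler and disable its own
# batching; the actual wiring in ace_trainer.py may differ in detail.
from torch.utils.data import DataLoader
from torch.utils.data.sampler import BatchSampler, RandomSampler

# `mapping_data` is assumed to be a CamLocDataset instance (see the earlier sketch).
batch_sampler = BatchSampler(RandomSampler(mapping_data), batch_size=8, drop_last=False)
loader = DataLoader(mapping_data, sampler=batch_sampler, batch_size=None)
for image_B1HW, mask_B1HW, pose_B44, pose_inv_B44, K_B33, K_inv_B33, _, _ in loader:
    print(image_B1HW.shape)   # e.g. torch.Size([8, 1, H, W]) with one shared scale
    break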
540 | return self._get_single_item(idx, image_height) 541 | -------------------------------------------------------------------------------- /datasets/dataset_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.spatial.transform import Rotation as Rotation 4 | 5 | def mkdir(directory): 6 | """Checks whether the directory exists and creates it if necessacy.""" 7 | if not os.path.exists(directory): 8 | os.makedirs(directory) 9 | 10 | def dlheidata(doi, filename): 11 | """Download and unpack data from heiData host.""" 12 | os.system(f"wget https://heidata.uni-heidelberg.de/api/access/datafile/:persistentId?persistentId=doi:{doi} -O {filename}") 13 | os.system(f"tar -xvzf {filename}") 14 | os.system(f"rm {filename}") 15 | 16 | 17 | def get_base_file_name(file_name): 18 | ''' 19 | Extracts the base file name by removing file extension and modality string. 20 | eg. frame-000000.color.jpg -> frame-000000 21 | ''' 22 | 23 | base_file = os.path.splitext(file_name)[0] 24 | base_file = os.path.splitext(base_file)[0] 25 | 26 | return base_file 27 | 28 | def read_pose_data(file_name): 29 | ''' 30 | Expects path to file with one pose per line. 31 | Input pose is expected to map world to camera coordinates. 32 | Output pose maps camera to world coordinates. 33 | Pose format: file qw qx qy qz tx ty tz (f) 34 | Return dictionary that maps a file name to a tuple of (4x4 pose, focal_length) 35 | Sets focal_length to None if not contained in file. 36 | ''' 37 | 38 | with open(file_name, "r") as f: 39 | pose_data = f.readlines() 40 | 41 | # create a dict from the poses with file name as key 42 | pose_dict = {} 43 | for pose_string in pose_data: 44 | 45 | pose_string = pose_string.split() 46 | file_name = pose_string[0] 47 | 48 | pose_q = np.array(pose_string[1:5]) 49 | pose_q = np.array([pose_q[1], pose_q[2], pose_q[3], pose_q[0]]) 50 | pose_t = np.array(pose_string[5:8]) 51 | pose_R = Rotation.from_quat(pose_q).as_matrix() 52 | 53 | pose_4x4 = np.identity(4) 54 | pose_4x4[0:3, 0:3] = pose_R 55 | pose_4x4[0:3, 3] = pose_t 56 | 57 | # convert world->cam to cam->world for evaluation 58 | pose_4x4 = np.linalg.inv(pose_4x4) 59 | 60 | if len(pose_string) > 8: 61 | focal_length = float(pose_string[8]) 62 | else: 63 | focal_length = None 64 | 65 | pose_dict[get_base_file_name(file_name)] = (pose_4x4, focal_length) 66 | 67 | return pose_dict 68 | 69 | def write_cam_pose(file_path, cam_pose): 70 | ''' 71 | Writes 4x4 camera pose to a human readable text file. 72 | ''' 73 | with open(file_path, 'w') as f: 74 | f.write(str(float(cam_pose[0, 0])) + ' ' + str(float(cam_pose[0, 1])) + ' ' + str( 75 | float(cam_pose[0, 2])) + ' ' + str(float(cam_pose[0, 3])) + '\n') 76 | f.write(str(float(cam_pose[1, 0])) + ' ' + str(float(cam_pose[1, 1])) + ' ' + str( 77 | float(cam_pose[1, 2])) + ' ' + str(float(cam_pose[1, 3])) + '\n') 78 | f.write(str(float(cam_pose[2, 0])) + ' ' + str(float(cam_pose[2, 1])) + ' ' + str( 79 | float(cam_pose[2, 2])) + ' ' + str(float(cam_pose[2, 3])) + '\n') 80 | f.write(str(float(cam_pose[3, 0])) + ' ' + str(float(cam_pose[3, 1])) + ' ' + str( 81 | float(cam_pose[3, 2])) + ' ' + str(float(cam_pose[3, 3])) + '\n') 82 | 83 | def write_focal_length(file_path, focal_length): 84 | ''' 85 | Write the focal length to a human readable text file. 
86 | ''' 87 | with open(file_path, 'w') as f: 88 | f.write(str(focal_length)) 89 | 90 | def clone_external_pose_files(): 91 | ''' 92 | Clone repository containing SfM pose files for 7Scenes and 12Scenes. 93 | 94 | From paper: 95 | On the Limits of Pseudo Ground Truth in Visual Camera Re-localisation 96 | Eric Brachmann, Martin Humenberger, Carsten Rother, Torsten Sattler 97 | ICCV 2021 98 | 99 | @return folder name of the repo 100 | ''' 101 | repo_url = "https://github.com/tsattler/visloc_pseudo_gt_limitations.git" 102 | repo_folder = os.path.splitext(os.path.basename(repo_url))[0] 103 | 104 | if not os.path.exists(repo_folder): 105 | os.system('git clone https://github.com/tsattler/visloc_pseudo_gt_limitations.git') 106 | 107 | return os.path.join(repo_folder, 'pgt', 'sfm') 108 | -------------------------------------------------------------------------------- /datasets/setup_12scenes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import zipfile 6 | 7 | import dataset_util as dutil 8 | import numpy as np 9 | import torch 10 | from joblib import Parallel, delayed 11 | from skimage import io 12 | 13 | # name of the folder where we download the original 12scenes dataset to 14 | # we restructure the dataset by creating symbolic links to that folder 15 | src_folder = '12scenes_source' 16 | 17 | # sub sampling factor of eye coordinate tensor 18 | nn_subsampling = 8 19 | 20 | if __name__ == '__main__': 21 | parser = argparse.ArgumentParser( 22 | description='Download and setup the 12Scenes dataset.', 23 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 24 | 25 | parser.add_argument('--depth', type=str, choices=['none', 'rendered', 'original'], default='none', 26 | help='none: ignore depth maps; rendered: download depth rendered using 3D scene model (18GB), original: original depth maps') 27 | 28 | parser.add_argument('--eye', type=str, choices=['none', 'original'], default='none', 29 | help='none: ignore eye coordinates; original: precompute eye coordinates from original depth maps') 30 | 31 | parser.add_argument('--poses', type=str, choices=['original', 'pgt'], default='original', 32 | help='original: original pose files; ' 33 | 'pgt: get SfM poses from external repository (Brachmann et al., ICCV21)') 34 | 35 | opt = parser.parse_args() 36 | 37 | if opt.depth == 'rendered' and opt.poses == 'pgt': 38 | print("Sorry. Rendered depth files are not compatible with SfM pose files, " 39 | "since both have missing frames and we would have to figure out the intersection. " 40 | "It can be done, but is not supported atm.") 41 | exit() 42 | 43 | print("\n#####################################################################") 44 | print("# Please make sure to check this dataset's license before using it! #") 45 | print("# http://graphics.stanford.edu/projects/reloc/ #") 46 | print("#####################################################################\n\n") 47 | 48 | license_response = input('Please confirm with "yes" or abort. ') 49 | if not (license_response == "yes" or license_response == "y"): 50 | print(f"Your response: {license_response}. Aborting.") 51 | exit() 52 | 53 | if opt.poses == 'pgt': 54 | 55 | print("\n###################################################################") 56 | print("# You requested external pose files. 
Please check the license at: #") 57 | print("# https://github.com/tsattler/visloc_pseudo_gt_limitations #") 58 | print("###################################################################\n\n") 59 | 60 | license_response = input('Please confirm with "yes" or abort. ') 61 | if not (license_response == "yes" or license_response == "y"): 62 | print(f"Your response: {license_response}. Aborting.") 63 | exit() 64 | 65 | print("Getting external pose files...") 66 | external_pgt_folder = dutil.clone_external_pose_files() 67 | 68 | # download the original 12 scenes dataset for calibration, poses and images 69 | dutil.mkdir(src_folder) 70 | os.chdir(src_folder) 71 | 72 | for ds in ['apt1', 'apt2', 'office1', 'office2']: 73 | 74 | if not os.path.exists(ds): 75 | 76 | print("=== Downloading 12scenes Data:", ds, "===============================") 77 | 78 | os.system('wget http://graphics.stanford.edu/projects/reloc/data/' + ds + '.zip') 79 | 80 | # unpack and delete zip file 81 | f = zipfile.PyZipFile(ds + '.zip') 82 | f.extractall() 83 | 84 | os.system('rm ' + ds + '.zip') 85 | 86 | else: 87 | print(f"Found data of scene {ds} already. Assuming its complete and skipping download.") 88 | 89 | 90 | def process_dataset(ds): 91 | 92 | scenes = os.listdir(ds) 93 | 94 | for scene in scenes: 95 | 96 | data_folder = ds + '/' + scene + '/data/' 97 | 98 | if not os.path.isdir(data_folder): 99 | # skip README files 100 | continue 101 | 102 | print("Linking files for 12scenes_" + ds + "_" + scene + "...") 103 | 104 | if opt.poses == 'pgt': 105 | target_folder = '../pgt_12scenes_' + ds + '_' + scene + '/' 106 | else: 107 | target_folder = '../12scenes_' + ds + '_' + scene + '/' 108 | 109 | # create subfolders for training and test 110 | dutil.mkdir(target_folder + 'test/rgb/') 111 | dutil.mkdir(target_folder + 'test/poses/') 112 | dutil.mkdir(target_folder + 'test/calibration/') 113 | if opt.depth == 'original': 114 | dutil.mkdir(target_folder + 'test/depth/') 115 | if opt.eye == 'original': 116 | dutil.mkdir(target_folder + 'test/eye/') 117 | 118 | dutil.mkdir(target_folder + 'train/rgb/') 119 | dutil.mkdir(target_folder + 'train/poses/') 120 | dutil.mkdir(target_folder + 'train/calibration/') 121 | if opt.depth == 'original': 122 | dutil.mkdir(target_folder + 'train/depth/') 123 | if opt.eye == 'original': 124 | dutil.mkdir(target_folder + 'train/eye/') 125 | 126 | # read the train / test split - the first sequence is used for testing, everything else for training 127 | with open(ds + '/' + scene + '/split.txt', 'r') as f: 128 | split = f.readlines() 129 | split = int(split[0].split()[1][8:-1]) 130 | 131 | # read the calibration parameters 132 | with open(ds + '/' + scene + '/info.txt', 'r') as f: 133 | calibration_info = f.readlines() 134 | 135 | im_h = int(calibration_info[3].split()[2]) 136 | 137 | focallength = calibration_info[7].split() 138 | focallength = (float(focallength[2]) + float(focallength[7])) / 2 139 | 140 | files = os.listdir(data_folder) 141 | 142 | images = [f for f in files if f.endswith('color.jpg')] 143 | images.sort() 144 | 145 | poses = [f for f in files if f.endswith('pose.txt')] 146 | poses.sort() 147 | 148 | if opt.depth == 'original' or opt.eye == 'original': 149 | depth_maps = [f for f in files if f.endswith('depth.png')] 150 | depth_maps.sort() 151 | 152 | # read external poses and calibration if requested 153 | if opt.poses == 'pgt': 154 | pgt_file = os.path.join('..', external_pgt_folder, '12scenes', f'{ds}_{scene}_test.txt') 155 | pgt_test_poses = dutil.read_pose_data(pgt_file) 
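# [Editor's sketch - not part of setup_12scenes.py] The external pose files read here via
# dataset_util.read_pose_data store one "file qw qx qy qz tx ty tz (f)" line per frame as
# a world-to-camera pose; the helper converts each line to a 4x4 cam-to-world matrix.
# Standalone version of that conversion for a single made-up line:
import numpy as np
from scipy.spatial.transform import Rotation

line = "seq0/frame-000000.color.jpg 1 0 0 0 0.5 0.0 1.0 525.0"   # made-up example
tok = line.split()
qw, qx, qy, qz = map(float, tok[1:5])
t = np.array(list(map(float, tok[5:8])))

world_to_cam = np.identity(4)
world_to_cam[:3, :3] = Rotation.from_quat([qx, qy, qz, qw]).as_matrix()  # scipy wants xyzw
world_to_cam[:3, 3] = t
cam_to_world = np.linalg.inv(world_to_cam)
focal_length = float(tok[8]) if len(tok) > 8 else None
print(cam_to_world[:3, 3], focal_length)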
156 | pgt_file = os.path.join('..', external_pgt_folder, '12scenes', f'{ds}_{scene}_train.txt') 157 | pgt_train_poses = dutil.read_pose_data(pgt_file) 158 | else: 159 | pgt_test_poses = None 160 | pgt_train_poses = None 161 | 162 | def link_frame(i, variant, pgt_poses): 163 | """ Links calibration, pose and image of frame i in either test or training. """ 164 | 165 | # some image have invalid pose files, skip those 166 | if opt.poses == 'pgt': 167 | valid = os.path.join('data', dutil.get_base_file_name(poses[i])) in pgt_poses 168 | else: 169 | 170 | valid = True 171 | with open(ds + '/' + scene + '/data/' + poses[i], 'r') as f: 172 | pose = f.readlines() 173 | for line in pose: 174 | if 'INF' in line: 175 | valid = False 176 | 177 | if not valid: 178 | print("Skipping frame", i, "(" + variant + ") - Corrupt pose.") 179 | else: 180 | # link image 181 | os.system('ln -s ../../../' + src_folder + '/' + data_folder + '/' + images[ 182 | i] + ' ' + target_folder + variant + '/rgb/') 183 | 184 | if opt.poses == 'pgt': 185 | cam_pose, _ = pgt_poses[os.path.join('data', dutil.get_base_file_name(poses[i]))] 186 | dutil.write_cam_pose(target_folder + variant + '/poses/' + poses[i], cam_pose) 187 | else: 188 | # link pose 189 | os.system('ln -s ../../../' + src_folder + '/' + data_folder + '/' + poses[ 190 | i] + ' ' + target_folder + variant + '/poses/') 191 | 192 | # create a calibration file 193 | with open(target_folder + variant + '/calibration/frame-%s.calibration.txt' % str(i).zfill(6), 194 | 'w') as f: 195 | f.write(str(focallength)) 196 | 197 | if opt.depth == 'original': 198 | # link original depth files 199 | os.system('ln -s ../../../' + src_folder + '/' + data_folder + '/' + depth_maps[ 200 | i] + ' ' + target_folder + variant + '/depth/') 201 | 202 | if opt.eye == 'original': 203 | 204 | depth = io.imread(data_folder + '/' + depth_maps[i]) 205 | depth = depth.astype(np.float64) 206 | depth /= 1000 # from millimeters to meters 207 | 208 | d_h = depth.shape[0] 209 | d_w = depth.shape[1] 210 | 211 | # get RGB focal length and adjust to depth resolution 212 | if opt.poses == 'pgt': 213 | _, rgb_f = pgt_poses[os.path.join('data', dutil.get_base_file_name(poses[i]))] 214 | else: 215 | rgb_f = focallength 216 | 217 | d_f = rgb_f * d_h / im_h 218 | 219 | # generate sub-sampled eye coordinate tensor from calibrated depth 220 | out_h = int(d_h / nn_subsampling) 221 | out_w = int(d_w / nn_subsampling) 222 | nn_offset = int(nn_subsampling / 2) 223 | 224 | eye_tensor = np.zeros((3, out_h, out_w)) 225 | 226 | # generate pixel coordinates 227 | eye_tensor[0] = np.dstack([np.arange(0, out_w)] * out_h)[0].T * nn_subsampling + nn_offset 228 | eye_tensor[1] = np.dstack([np.arange(0, out_h)] * out_w)[0] * nn_subsampling + nn_offset 229 | 230 | # substract principal point 231 | eye_tensor[0] -= d_w / 2 232 | eye_tensor[1] -= d_h / 2 233 | 234 | # subsample depth 235 | depth = depth[nn_offset::nn_subsampling, nn_offset::nn_subsampling] 236 | 237 | # project 238 | eye_tensor[0:2] /= d_f 239 | eye_tensor[2, 0:depth.shape[0], 0:depth.shape[1]] = depth 240 | eye_tensor[0] *= eye_tensor[2] 241 | eye_tensor[1] *= eye_tensor[2] 242 | 243 | eye_tensor = torch.from_numpy(eye_tensor).float() 244 | 245 | torch.save(eye_tensor, target_folder + variant + '/eye/' + depth_maps[i][:-10] + '.eye.dat') 246 | 247 | # frames up to split are test images 248 | for i in range(split): 249 | link_frame(i, 'test', pgt_test_poses) 250 | 251 | # all remaining frames are training images 252 | for i in range(split, len(images)): 253 | 
link_frame(i, 'train', pgt_train_poses) 254 | 255 | 256 | Parallel(n_jobs=4, verbose=0)( 257 | map(delayed(process_dataset), ['apt1', 'apt2', 'office1', 'office2'])) 258 | 259 | if opt.depth == 'rendered': 260 | os.chdir('..') 261 | dutil.dlheidata("10.11588/data/N07HKC/OMLKR1", "12scenes_depth.tar.gz") 262 | -------------------------------------------------------------------------------- /datasets/setup_7scenes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import warnings 6 | 7 | import dataset_util as dutil 8 | import numpy as np 9 | import torch 10 | from joblib import Parallel, delayed 11 | from skimage import io 12 | 13 | # name of the folder where we download the original 7scenes dataset to 14 | # we restructure the dataset by creating symbolic links to that folder 15 | src_folder = '7scenes_source' 16 | focal_length = 525.0 17 | 18 | # focal length of the depth sensor (source: https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/) 19 | d_focal_length = 585.0 20 | 21 | # RGB image dimensions 22 | img_w = 640 23 | img_h = 480 24 | 25 | # sub sampling factor of eye coordinate tensor 26 | nn_subsampling = 8 27 | 28 | # transformation from depth sensor to RGB sensor 29 | # calibration according to https://projet.liris.cnrs.fr/voir/activities-dataset/kinect-calibration.html 30 | d_to_rgb = np.array([ 31 | [9.9996518012567637e-01, 2.6765126468950343e-03, -7.9041012313000904e-03, -2.5558943178152542e-02], 32 | [-2.7409311281316700e-03, 9.9996302803027592e-01, -8.1504520778013286e-03, 1.0109636268061706e-04], 33 | [7.8819942130445332e-03, 8.1718328771890631e-03, 9.9993554558014031e-01, 2.0318321729487039e-03], 34 | [0, 0, 0, 1] 35 | ]) 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser( 39 | description='Download and setup the 7Scenes dataset.', 40 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 41 | 42 | parser.add_argument('--depth', type=str, choices=['none', 'rendered', 'calibrated'], default='none', 43 | help='none: ignore depth maps; rendered: download depth rendered using 3D scene model (28GB); calibrated: register original depth maps to RGB') 44 | 45 | parser.add_argument('--eye', type=str, choices=['none', 'calibrated'], default='none', 46 | help='none: ignore eye coordinates; original: calibrate original depth maps and precompute eye coordinates') 47 | 48 | parser.add_argument('--poses', type=str, choices=['original', 'calibrated', 'pgt'], default='calibrated', 49 | help='original: use raw poses of depth sensor; ' 50 | 'calibrated: register poses to RGB sensor; ' 51 | 'pgt: get SfM poses from external repository (Brachmann et al., ICCV21)') 52 | 53 | opt = parser.parse_args() 54 | 55 | print("\n############################################################################") 56 | print("# Please make sure to check this dataset's license before using it! #") 57 | print("# https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/ #") 58 | print("############################################################################\n\n") 59 | 60 | license_response = input('Please confirm with "yes" or abort. ') 61 | if not (license_response == "yes" or license_response == "y"): 62 | print(f"Your response: {license_response}. Aborting.") 63 | exit() 64 | 65 | if opt.poses == 'pgt': 66 | 67 | print("\n###################################################################") 68 | print("# You requested external pose files. 
Please check the license at: #") 69 | print("# https://github.com/tsattler/visloc_pseudo_gt_limitations #") 70 | print("###################################################################\n\n") 71 | 72 | license_response = input('Please confirm with "yes" or abort. ') 73 | if not (license_response == "yes" or license_response == "y"): 74 | print(f"Your response: {license_response}. Aborting.") 75 | exit() 76 | 77 | print("Getting external pose files...") 78 | external_pgt_folder = dutil.clone_external_pose_files() 79 | 80 | # download the original 7 scenes dataset for poses and images 81 | dutil.mkdir(src_folder) 82 | os.chdir(src_folder) 83 | 84 | for ds in ['chess', 'fire', 'heads', 'office', 'pumpkin', 'redkitchen', 'stairs']: 85 | 86 | if not os.path.exists(ds): 87 | 88 | print("=== Downloading 7scenes Data:", ds, "===============================") 89 | 90 | os.system( 91 | 'wget http://download.microsoft.com/download/2/8/5/28564B23-0828-408F-8631-23B1EFF1DAC8/' + ds + '.zip') 92 | os.system('unzip ' + ds + '.zip') 93 | os.system('rm ' + ds + '.zip') 94 | 95 | sequences = os.listdir(ds) 96 | 97 | for file in sequences: 98 | if file.endswith('.zip'): 99 | print("Unpacking", file) 100 | os.system('unzip ' + ds + '/' + file + ' -d ' + ds) 101 | os.system('rm ' + ds + '/' + file) 102 | else: 103 | print(f"Found data of scene {ds} already. Assuming its complete and skipping download.") 104 | 105 | print("Processing frames...") 106 | 107 | 108 | def process_scene(ds): 109 | 110 | if opt.poses == 'pgt': 111 | target_folder = '../pgt_7scenes_' + ds + '/' 112 | else: 113 | target_folder = '../7scenes_' + ds + '/' 114 | 115 | def link_frames(split_file, variant): 116 | 117 | # create subfolders 118 | dutil.mkdir(target_folder + variant + '/rgb/') 119 | dutil.mkdir(target_folder + variant + '/poses/') 120 | dutil.mkdir(target_folder + variant + '/calibration/') 121 | if opt.depth == 'calibrated': 122 | dutil.mkdir(target_folder + variant + '/depth/') 123 | if opt.eye == 'calibrated': 124 | dutil.mkdir(target_folder + variant + '/eye/') 125 | 126 | # read the split file 127 | with open(ds + '/' + split_file, 'r') as f: 128 | split = f.readlines() 129 | # map sequences to folder names 130 | split = ['seq-' + s.strip()[8:].zfill(2) for s in split] 131 | 132 | # read external poses and calibration if requested 133 | if opt.poses == 'pgt': 134 | pgt_file = os.path.join('..', external_pgt_folder, '7scenes', f'{ds}_{variant}.txt') 135 | pgt_poses = dutil.read_pose_data(pgt_file) 136 | 137 | for seq in split: 138 | files = os.listdir(ds + '/' + seq) 139 | 140 | # link images 141 | images = [f for f in files if f.endswith('color.png')] 142 | for img in images: 143 | os.system( 144 | 'ln -s ../../../' + src_folder + '/' + ds + '/' + seq + '/' + img + ' ' + target_folder + variant + '/rgb/' + seq + '-' + img) 145 | 146 | pose_files = [f for f in files if f.endswith('pose.txt')] 147 | 148 | # create pose files 149 | if opt.poses == 'original': 150 | 151 | # link original poses 152 | for p_file in pose_files: 153 | os.system( 154 | 'ln -s ../../../' + src_folder + '/' + ds + '/' + seq + '/' + p_file + ' ' + target_folder + variant + '/poses/' + seq + '-' + p_file) 155 | 156 | elif opt.poses == 'pgt': 157 | 158 | # use poses as externally provided 159 | for p_file in pose_files: 160 | cam_pose, _ = pgt_poses[os.path.join(seq, dutil.get_base_file_name(p_file))] 161 | dutil.write_cam_pose(target_folder + variant + '/poses/' + seq + '-' + p_file, cam_pose) 162 | 163 | else: 164 | 165 | # adjust original camera 
pose files by mapping to RGB sensor 166 | for p_file in pose_files: 167 | # load original pose (aligned to the depth sensor) 168 | cam_pose = np.loadtxt(ds + '/' + seq + '/' + p_file) 169 | 170 | # apply relative transform from depth to RGB sensor 171 | cam_pose = np.matmul(cam_pose, np.linalg.inv(d_to_rgb)) 172 | 173 | # write adjusted pose (aligned to the RGB sensor) 174 | dutil.write_cam_pose(target_folder + variant + '/poses/' + seq + '-' + p_file, cam_pose) 175 | 176 | # create calibration files 177 | if opt.poses == 'pgt': 178 | 179 | for p_file in pose_files: 180 | base_file = dutil.get_base_file_name(p_file) 181 | calib_file = f'{base_file}.calibration.txt' 182 | 183 | _, rgb_f = pgt_poses[os.path.join(seq, base_file)] 184 | dutil.write_focal_length(target_folder + variant + '/calibration/' + seq + '-' + calib_file, 185 | rgb_f) 186 | else: 187 | for i in range(len(images)): 188 | dutil.write_focal_length( 189 | target_folder + variant + '/calibration/%s-frame-%s.calibration.txt' % ( 190 | seq, str(i).zfill(6)), 191 | focal_length) 192 | 193 | if opt.depth != 'calibrated' and opt.eye != 'calibrated': 194 | # no calibration requested, done 195 | continue 196 | 197 | # adjust depth files by mapping to RGB sensor 198 | depth_files = [f for f in files if f.endswith('depth.png')] 199 | 200 | for d_file in depth_files: 201 | 202 | if opt.poses == 'pgt': 203 | # use correct per-frame focal length if provided 204 | _, rgb_f = pgt_poses[os.path.join(seq, dutil.get_base_file_name(d_file))] 205 | else: 206 | # use default focal length as fall back 207 | rgb_f = focal_length 208 | 209 | depth = io.imread(ds + '/' + seq + '/' + d_file) 210 | depth = depth.astype(np.float64) 211 | depth /= 1000 # from millimeters to meters 212 | 213 | d_h = depth.shape[0] 214 | d_w = depth.shape[1] 215 | 216 | # reproject depth map to 3D eye coordinates 217 | eye_coords = np.zeros((4, d_h, d_w)) 218 | # set x and y coordinates 219 | eye_coords[0] = np.dstack([np.arange(0, d_w)] * d_h)[0].T 220 | eye_coords[1] = np.dstack([np.arange(0, d_h)] * d_w)[0] 221 | 222 | eye_coords = eye_coords.reshape(4, -1) 223 | depth = depth.reshape(-1) 224 | 225 | # filter pixels with invalid depth 226 | depth_mask = (depth > 0) & (depth < 100) 227 | eye_coords = eye_coords[:, depth_mask] 228 | depth = depth[depth_mask] 229 | 230 | # substract depth principal point (assume image center) 231 | eye_coords[0] -= d_w / 2 232 | eye_coords[1] -= d_h / 2 233 | # reproject 234 | eye_coords[0:2] /= d_focal_length 235 | eye_coords[0] *= depth 236 | eye_coords[1] *= depth 237 | eye_coords[2] = depth 238 | eye_coords[3] = 1 239 | 240 | # transform from depth sensor to RGB sensor 241 | eye_coords = np.matmul(d_to_rgb, eye_coords) 242 | 243 | # project 244 | depth = eye_coords[2] 245 | 246 | eye_coords[0] /= depth 247 | eye_coords[1] /= depth 248 | eye_coords[0:2] *= rgb_f 249 | 250 | # add RGB principal point (assume image center) 251 | eye_coords[0] += img_w / 2 252 | eye_coords[1] += img_h / 2 253 | 254 | registered_depth = np.zeros((img_h, img_w), dtype='uint16') 255 | 256 | for pt in range(eye_coords.shape[1]): 257 | x = round(eye_coords[0, pt]) 258 | y = round(eye_coords[1, pt]) 259 | d = eye_coords[2, pt] 260 | 261 | registered_depth[y, x] = d * 1000 262 | 263 | if opt.depth == 'calibrated': 264 | # store calibrated depth 265 | with warnings.catch_warnings(): 266 | warnings.simplefilter("ignore") 267 | io.imsave(target_folder + variant + '/depth/' + seq + '-' + d_file, registered_depth) 268 | 269 | if opt.eye != 'calibrated': 270 | # done 
271 | continue 272 | 273 | # generate sub-sampled eye coordinate tensor from calibrated depth 274 | out_h = int(img_h / nn_subsampling) 275 | out_w = int(img_w / nn_subsampling) 276 | nn_offset = int(nn_subsampling / 2) 277 | 278 | eye_tensor = np.zeros((3, out_h, out_w)) 279 | 280 | # generate pixel coordinates 281 | eye_tensor[0] = np.dstack([np.arange(0, out_w)] * out_h)[0].T * nn_subsampling + nn_offset 282 | eye_tensor[1] = np.dstack([np.arange(0, out_h)] * out_w)[0] * nn_subsampling + nn_offset 283 | 284 | # substract principal point 285 | eye_tensor[0] -= img_w / 2 286 | eye_tensor[1] -= img_h / 2 287 | 288 | # prepare depth (convert to meters and subsample) 289 | registered_depth = registered_depth.astype(float) 290 | registered_depth /= 1000 291 | registered_depth = registered_depth[nn_offset::nn_subsampling, nn_offset::nn_subsampling] 292 | 293 | # project 294 | eye_tensor[0:2] /= rgb_f 295 | eye_tensor[2, 0:registered_depth.shape[0], 0:registered_depth.shape[1]] = registered_depth 296 | eye_tensor[0] *= eye_tensor[2] 297 | eye_tensor[1] *= eye_tensor[2] 298 | 299 | eye_tensor = torch.from_numpy(eye_tensor).float() 300 | 301 | torch.save(eye_tensor, 302 | target_folder + variant + '/eye/' + seq + '-' + d_file[:-10] + '.eye.dat') 303 | 304 | link_frames('TrainSplit.txt', 'train') 305 | link_frames('TestSplit.txt', 'test') 306 | 307 | 308 | Parallel(n_jobs=7, verbose=0)( 309 | map(delayed(process_scene), ['chess', 'fire', 'heads', 'office', 'pumpkin', 'redkitchen', 'stairs'])) 310 | 311 | if opt.depth == 'rendered': 312 | os.chdir('..') 313 | dutil.dlheidata("10.11588/data/N07HKC/4PLEEJ", "7scenes_depth.tar.gz") 314 | -------------------------------------------------------------------------------- /datasets/setup_cambridge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import math 5 | import os 6 | 7 | import cv2 as cv 8 | import dataset_util as dutil 9 | import numpy as np 10 | import torch 11 | from skimage import io 12 | 13 | # setup individual scene IDs and their download location 14 | scenes = [ 15 | 'https://www.repository.cam.ac.uk/bitstream/handle/1810/251342/KingsCollege.zip', 16 | 'https://www.repository.cam.ac.uk/bitstream/handle/1810/251340/OldHospital.zip', 17 | 'https://www.repository.cam.ac.uk/bitstream/handle/1810/251336/ShopFacade.zip', 18 | 'https://www.repository.cam.ac.uk/bitstream/handle/1810/251294/StMarysChurch.zip', 19 | 'https://www.repository.cam.ac.uk/bitstream/handle/1810/251291/GreatCourt.zip', 20 | ] 21 | 22 | target_height = 480 # rescale images 23 | nn_subsampling = 8 # sub sampling of our CNN architecture, for size of the initalization targets 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser( 27 | description='Download and setup the Cambridge dataset.', 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 29 | 30 | parser.add_argument('--init', type=str, choices=['none', 'sfm'], default='none', 31 | help='none: no initialisation targets for scene coordinates; sfm: scene coordinate targets by rendering the SfM point cloud') 32 | 33 | opt = parser.parse_args() 34 | 35 | print("\n###############################################################################") 36 | print("# Please make sure to check this dataset's license before using it! 
#") 37 | print("# https://www.repository.cam.ac.uk/items/53788265-cb98-42ee-b85b-7a0cbc8eddb3 #") 38 | print("###############################################################################\n\n") 39 | 40 | license_response = input('Please confirm with "yes" or abort. ') 41 | if not (license_response == "yes" or license_response == "y"): 42 | print(f"Your response: {license_response}. Aborting.") 43 | exit() 44 | 45 | for scene in scenes: 46 | 47 | scene_file = scene.split('/')[-1] 48 | scene_name = scene_file[:-4] 49 | 50 | print("===== Processing " + scene_name + " ===================") 51 | 52 | print("Downloading and unzipping data...") 53 | os.system('wget ' + scene) 54 | os.system('unzip ' + scene_file) 55 | os.system('rm ' + scene_file) 56 | os.system('mv ' + scene_name + ' Cambridge_' + scene_name) 57 | os.chdir('Cambridge_' + scene_name) 58 | 59 | modes = ['train', 'test'] 60 | input_file = 'reconstruction.nvm' 61 | 62 | print("Loading SfM reconstruction...") 63 | 64 | f = open(input_file) 65 | reconstruction = f.readlines() 66 | f.close() 67 | 68 | num_cams = int(reconstruction[2]) 69 | num_pts = int(reconstruction[num_cams + 4]) 70 | 71 | if opt.init == 'sfm': 72 | 73 | # read points 74 | pts_dict = {} 75 | for cam_idx in range(0, num_cams): 76 | pts_dict[cam_idx] = [] 77 | 78 | pt = pts_start = num_cams + 5 79 | pts_end = pts_start + num_pts 80 | 81 | while pt < pts_end: 82 | 83 | pt_list = reconstruction[pt].split() 84 | pt_3D = [float(x) for x in pt_list[0:3]] 85 | pt_3D.append(1.0) 86 | 87 | for pt_view in range(0, int(pt_list[6])): 88 | cam_view = int(pt_list[7 + pt_view * 4]) 89 | pts_dict[cam_view].append(pt_3D) 90 | 91 | pt += 1 92 | 93 | print("Reconstruction contains %d cameras and %d 3D points." % (num_cams, num_pts)) 94 | 95 | for mode in modes: 96 | 97 | print("Converting " + mode + " data...") 98 | 99 | img_output_folder = mode + '/rgb/' 100 | cal_output_folder = mode + '/calibration/' 101 | pose_output_folder = mode + '/poses/' 102 | 103 | dutil.mkdir(img_output_folder) 104 | dutil.mkdir(cal_output_folder) 105 | dutil.mkdir(pose_output_folder) 106 | 107 | if opt.init != 'none': 108 | target_output_folder = mode + '/init/' 109 | dutil.mkdir(target_output_folder) 110 | 111 | # get list of images for current mode (train vs. test) 112 | image_list = 'dataset_' + mode + '.txt' 113 | 114 | f = open(image_list) 115 | camera_list = f.readlines() 116 | f.close() 117 | camera_list = camera_list[3:] 118 | 119 | image_list = [camera.split()[0] for camera in camera_list] 120 | 121 | for cam_idx in range(num_cams): 122 | 123 | print("Processing camera %d of %d." % (cam_idx, num_cams)) 124 | image_file = reconstruction[3 + cam_idx].split()[0] 125 | image_file = image_file[:-3] + 'png' 126 | 127 | if image_file not in image_list: 128 | print("Skipping image " + image_file + ". 
Not part of set: " + mode + ".") 129 | continue 130 | 131 | image_idx = image_list.index(image_file) 132 | 133 | # read camera 134 | camera = camera_list[image_idx].split() 135 | cam_rot = [float(r) for r in camera[4:]] 136 | 137 | # quaternion to axis-angle 138 | angle = 2 * math.acos(cam_rot[0]) 139 | x = cam_rot[1] / math.sqrt(1 - cam_rot[0] ** 2) 140 | y = cam_rot[2] / math.sqrt(1 - cam_rot[0] ** 2) 141 | z = cam_rot[3] / math.sqrt(1 - cam_rot[0] ** 2) 142 | 143 | cam_rot = [x * angle, y * angle, z * angle] 144 | 145 | cam_rot = np.asarray(cam_rot) 146 | cam_rot, _ = cv.Rodrigues(cam_rot) 147 | 148 | cam_trans = [float(r) for r in camera[1:4]] 149 | cam_trans = np.asarray([cam_trans]) 150 | cam_trans = np.transpose(cam_trans) 151 | cam_trans = - np.matmul(cam_rot, cam_trans) 152 | 153 | if np.absolute(cam_trans).max() > 10000: 154 | print("Skipping image " + image_file + ". Extremely large translation. Outlier?") 155 | print(cam_trans) 156 | continue 157 | 158 | cam_pose = np.concatenate((cam_rot, cam_trans), axis=1) 159 | cam_pose = np.concatenate((cam_pose, [[0, 0, 0, 1]]), axis=0) 160 | cam_pose = torch.tensor(cam_pose).float() 161 | 162 | focal_length = float(reconstruction[3 + cam_idx].split()[1]) 163 | 164 | # load image 165 | image = io.imread(image_file) 166 | image_file = image_file.replace('/', '_') 167 | 168 | img_aspect = image.shape[0] / image.shape[1] 169 | 170 | if img_aspect > 1: 171 | # portrait 172 | img_w = target_height 173 | img_h = int(math.ceil(target_height * img_aspect)) 174 | else: 175 | # landscape 176 | img_w = int(math.ceil(target_height / img_aspect)) 177 | img_h = target_height 178 | 179 | out_w = int(math.ceil(img_w / nn_subsampling)) 180 | out_h = int(math.ceil(img_h / nn_subsampling)) 181 | 182 | out_scale = out_w / image.shape[1] 183 | img_scale = img_w / image.shape[1] 184 | 185 | image = cv.resize(image, (img_w, img_h)) 186 | io.imsave(img_output_folder + image_file, image) 187 | 188 | with open(cal_output_folder + image_file[:-3] + 'txt', 'w') as f: 189 | f.write(str(focal_length * img_scale)) 190 | 191 | inv_cam_pose = cam_pose.inverse() 192 | 193 | with open(pose_output_folder + image_file[:-3] + 'txt', 'w') as f: 194 | f.write(str(float(inv_cam_pose[0, 0])) + ' ' + str(float(inv_cam_pose[0, 1])) + ' ' + str( 195 | float(inv_cam_pose[0, 2])) + ' ' + str(float(inv_cam_pose[0, 3])) + '\n') 196 | f.write(str(float(inv_cam_pose[1, 0])) + ' ' + str(float(inv_cam_pose[1, 1])) + ' ' + str( 197 | float(inv_cam_pose[1, 2])) + ' ' + str(float(inv_cam_pose[1, 3])) + '\n') 198 | f.write(str(float(inv_cam_pose[2, 0])) + ' ' + str(float(inv_cam_pose[2, 1])) + ' ' + str( 199 | float(inv_cam_pose[2, 2])) + ' ' + str(float(inv_cam_pose[2, 3])) + '\n') 200 | f.write(str(float(inv_cam_pose[3, 0])) + ' ' + str(float(inv_cam_pose[3, 1])) + ' ' + str( 201 | float(inv_cam_pose[3, 2])) + ' ' + str(float(inv_cam_pose[3, 3])) + '\n') 202 | 203 | if opt.init == 'sfm': 204 | 205 | # load 3D points from reconstruction 206 | pts_3D = torch.tensor(pts_dict[cam_idx]) 207 | 208 | out_tensor = torch.zeros((3, out_h, out_w)) 209 | out_zbuffer = torch.zeros((out_h, out_w)) 210 | 211 | fine = 0 212 | conflict = 0 213 | 214 | for pt_idx in range(0, pts_3D.size(0)): 215 | 216 | scene_pt = pts_3D[pt_idx] 217 | scene_pt = scene_pt.unsqueeze(0) 218 | scene_pt = scene_pt.transpose(0, 1) 219 | 220 | # scene to camera coordinates 221 | cam_pt = torch.mm(cam_pose, scene_pt) 222 | # projection to image 223 | img_pt = cam_pt[0:2, 0] * focal_length / cam_pt[2, 0] * out_scale 224 | 225 | y = 
img_pt[1] + out_h / 2 226 | x = img_pt[0] + out_w / 2 227 | 228 | x = int(torch.clamp(x, min=0, max=out_tensor.size(2) - 1)) 229 | y = int(torch.clamp(y, min=0, max=out_tensor.size(1) - 1)) 230 | 231 | if cam_pt[2, 0] > 1000: # filter some outlier points (large depth) 232 | continue 233 | 234 | if out_zbuffer[y, x] == 0 or out_zbuffer[y, x] > cam_pt[2, 0]: 235 | out_zbuffer[y, x] = cam_pt[2, 0] 236 | out_tensor[:, y, x] = pts_3D[pt_idx, 0:3] 237 | 238 | torch.save(out_tensor, target_output_folder + image_file[:-4] + '.dat') 239 | 240 | os.chdir('..') 241 | -------------------------------------------------------------------------------- /datasets/setup_wayspots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | 5 | if __name__ == '__main__': 6 | dataset_url = 'https://storage.googleapis.com/niantic-lon-static/research/ace/wayspots.tar.gz' 7 | dataset_file = dataset_url.split('/')[-1] 8 | 9 | print("\n#####################################################################") 10 | print("# Please make sure to check this dataset's license before using it! #") 11 | print("# https://nianticlabs.github.io/ace/wayspots.html #") 12 | print("#####################################################################\n\n") 13 | 14 | license_response = input('Please confirm with "yes" or abort. ') 15 | if not (license_response == "yes" or license_response == "y"): 16 | print(f"Your response: {license_response}. Aborting.") 17 | exit() 18 | 19 | print("Downloading and unzipping data...") 20 | os.system('wget ' + dataset_url) 21 | os.system('tar -xzf ' + dataset_file) 22 | os.system('rm ' + dataset_file) 23 | print("Done.") 24 | -------------------------------------------------------------------------------- /dsacstar/dsacstar_loss.h: -------------------------------------------------------------------------------- 1 | /* 2 | Based on the DSAC++ and ESAC code. 3 | https://github.com/vislearn/LessMore 4 | https://github.com/vislearn/esac 5 | 6 | Copyright (c) 2016, TU Dresden 7 | Copyright (c) 2020, Heidelberg University 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | * Redistributions of source code must retain the above copyright 13 | notice, this list of conditions and the following disclaimer. 14 | * Redistributions in binary form must reproduce the above copyright 15 | notice, this list of conditions and the following disclaimer in the 16 | documentation and/or other materials provided with the distribution. 17 | * Neither the name of the TU Dresden, Heidelberg University nor the 18 | names of its contributors may be used to endorse or promote products 19 | derived from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. 
IN NO EVENT SHALL TU DRESDEN OR HEIDELBERG UNIVERSITY BE LIABLE FOR ANY 25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | #pragma once 34 | 35 | #define MAXLOSS 10000000.0 // clamp for stability 36 | 37 | namespace dsacstar 38 | { 39 | /** 40 | * @brief Calculates the rotational distance in degree between two transformations. 41 | * Translation will be ignored. 42 | * 43 | * @param trans1 Transformation 1. 44 | * @param trans2 Transformation 2. 45 | * @return Angle in degree. 46 | */ 47 | double calcAngularDistance(const dsacstar::trans_t& trans1, const dsacstar::trans_t& trans2) 48 | { 49 | cv::Mat rot1 = trans1.colRange(0, 3).rowRange(0, 3); 50 | cv::Mat rot2 = trans2.colRange(0, 3).rowRange(0, 3); 51 | 52 | cv::Mat rotDiff= rot2 * rot1.t(); 53 | double trace = cv::trace(rotDiff)[0]; 54 | 55 | trace = std::min(3.0, std::max(-1.0, trace)); 56 | return 180*acos((trace-1.0)/2.0)/PI; 57 | } 58 | 59 | /** 60 | * @brief Weighted average of translational error and rotational error between two pose hypothesis. 61 | * @param h1 Pose 1. 62 | * @param h2 Pose 2. 63 | * @param wRot Weight of rotation error. 64 | * @param wTrans Weight of translation error. 65 | * @param cut Apply soft clamping after this value. 66 | * @return Loss. 67 | */ 68 | double loss( 69 | const dsacstar::trans_t& trans1, 70 | const dsacstar::trans_t& trans2, 71 | double wRot = 1.0, 72 | double wTrans = 1.0, 73 | double cut = 100) 74 | { 75 | double rotErr = dsacstar::calcAngularDistance(trans1, trans2); 76 | double tErr = cv::norm( 77 | trans1.col(3).rowRange(0, 3) - trans2.col(3).rowRange(0, 3)); 78 | 79 | double loss = wRot * rotErr + wTrans * tErr; 80 | 81 | if(loss > cut) 82 | loss = std::sqrt(cut * loss); 83 | 84 | return std::min(loss, MAXLOSS); 85 | } 86 | 87 | /** 88 | * @brief Calculate the derivative of the loss w.r.t. the estimated pose. 89 | * @param est Estimated pose (6 DoF). 90 | * @param gt Ground truth pose (6 DoF). 91 | * @param wRot Weight of rotation error. 92 | * @param wTrans Weight of translation error. 93 | * @param cut Apply soft clamping after this value. 94 | * @return 1x6 Jacobean. 
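 *
 * Note: as used in the implementation below, columns 0-2 of the returned Jacobian hold the
 * derivatives w.r.t. the axis-angle rotation parameters and columns 3-5 the derivatives
 * w.r.t. the translation parameters (cf. colRange(0, 3) and colRange(3, 6)).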
95 | */ 96 | cv::Mat_ dLoss( 97 | const dsacstar::pose_t& est, 98 | const dsacstar::pose_t& gt, 99 | double wRot = 1.0, 100 | double wTrans = 1.0, 101 | double cut = 100) 102 | { 103 | cv::Mat rot1, rot2, dRod; 104 | cv::Rodrigues(est.first, rot1, dRod); 105 | cv::Rodrigues(gt.first, rot2); 106 | 107 | // measure loss of inverted poses (camera pose instead of scene pose) 108 | cv::Mat_ invRot1 = rot1.t(); 109 | cv::Mat_ invRot2 = rot2.t(); 110 | 111 | // get the difference rotation 112 | cv::Mat diffRot = rot1 * invRot2; 113 | 114 | // calculate rotational and translational error 115 | double trace = cv::trace(diffRot)[0]; 116 | trace = std::min(3.0, std::max(-1.0, trace)); 117 | double rotErr = 180*acos((trace-1.0)/2.0)/CV_PI; 118 | 119 | cv::Mat_ invT1 = est.second.clone(); 120 | invT1 = invRot1 * invT1; 121 | 122 | cv::Mat_ invT2 = gt.second.clone(); 123 | invT2 = invRot2 * invT2; 124 | 125 | // zero error, abort 126 | double tErr = cv::norm(invT1 - invT2); 127 | 128 | cv::Mat_ jacobean = cv::Mat_::zeros(1, 6); 129 | 130 | // clamped loss, return zero gradient if loss is bigger than threshold 131 | double loss = wRot * rotErr + wTrans * tErr; 132 | bool cutLoss = false; 133 | 134 | 135 | if(loss > cut) 136 | { 137 | loss = std::sqrt(loss); 138 | cutLoss = true; 139 | } 140 | 141 | if(loss > MAXLOSS) 142 | return jacobean; 143 | 144 | if((tErr + rotErr) < EPS) 145 | return jacobean; 146 | 147 | 148 | // return gradient of translational error 149 | cv::Mat_ dDist_dInvT1(1, 3); 150 | for(unsigned i = 0; i < 3; i++) 151 | dDist_dInvT1(0, i) = (invT1(i, 0) - invT2(i, 0)) / tErr; 152 | 153 | cv::Mat_ dInvT1_dEstT(3, 3); 154 | dInvT1_dEstT = invRot1; 155 | 156 | cv::Mat_ dDist_dEstT = dDist_dInvT1 * dInvT1_dEstT; 157 | jacobean.colRange(3, 6) += dDist_dEstT * wTrans; 158 | 159 | cv::Mat_ dInvT1_dInvRot1 = cv::Mat_::zeros(3, 9); 160 | 161 | dInvT1_dInvRot1(0, 0) = est.second.at(0, 0); 162 | dInvT1_dInvRot1(0, 3) = est.second.at(1, 0); 163 | dInvT1_dInvRot1(0, 6) = est.second.at(2, 0); 164 | 165 | dInvT1_dInvRot1(1, 1) = est.second.at(0, 0); 166 | dInvT1_dInvRot1(1, 4) = est.second.at(1, 0); 167 | dInvT1_dInvRot1(1, 7) = est.second.at(2, 0); 168 | 169 | dInvT1_dInvRot1(2, 2) = est.second.at(0, 0); 170 | dInvT1_dInvRot1(2, 5) = est.second.at(1, 0); 171 | dInvT1_dInvRot1(2, 8) = est.second.at(2, 0); 172 | 173 | dRod = dRod.t(); 174 | 175 | cv::Mat_ dDist_dRod = dDist_dInvT1 * dInvT1_dInvRot1 * dRod; 176 | jacobean.colRange(0, 3) += dDist_dRod * wTrans; 177 | 178 | 179 | // return gradient of rotational error 180 | cv::Mat_ dRotDiff = cv::Mat_::zeros(9, 9); 181 | invRot2.row(0).copyTo(dRotDiff.row(0).colRange(0, 3)); 182 | invRot2.row(1).copyTo(dRotDiff.row(1).colRange(0, 3)); 183 | invRot2.row(2).copyTo(dRotDiff.row(2).colRange(0, 3)); 184 | 185 | invRot2.row(0).copyTo(dRotDiff.row(3).colRange(3, 6)); 186 | invRot2.row(1).copyTo(dRotDiff.row(4).colRange(3, 6)); 187 | invRot2.row(2).copyTo(dRotDiff.row(5).colRange(3, 6)); 188 | 189 | invRot2.row(0).copyTo(dRotDiff.row(6).colRange(6, 9)); 190 | invRot2.row(1).copyTo(dRotDiff.row(7).colRange(6, 9)); 191 | invRot2.row(2).copyTo(dRotDiff.row(8).colRange(6, 9)); 192 | 193 | dRotDiff = dRotDiff.t(); 194 | 195 | cv::Mat_ dTrace = cv::Mat_::zeros(1, 9); 196 | dTrace(0, 0) = 1; 197 | dTrace(0, 4) = 1; 198 | dTrace(0, 8) = 1; 199 | 200 | cv::Mat_ dAngle = (180 / CV_PI * -1 / sqrt(3 - trace * trace + 2 * trace)) * dTrace * dRotDiff * dRod; 201 | 202 | jacobean.colRange(0, 3) += dAngle * wRot; 203 | 204 | if(cutLoss) 205 | jacobean *= 0.5 / loss; 206 | 207 | 
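// The scaling by 0.5 / loss above is the chain rule through the soft clamp: once the loss
// exceeds the cut threshold it is replaced by its square root, and d sqrt(x)/dx = 1 / (2 sqrt(x)).
// The check below returns a zero Jacobian if any entry turned into NaN.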
208 | if(cv::sum(cv::Mat(jacobean != jacobean))[0] > 0) //check for NaNs 209 | return cv::Mat_<double>::zeros(1, 6); 210 | 211 | return jacobean; 212 | } 213 | 214 | 215 | } 216 | -------------------------------------------------------------------------------- /dsacstar/dsacstar_types.h: -------------------------------------------------------------------------------- 1 | /* 2 | Based on the DSAC++ and ESAC code. 3 | https://github.com/vislearn/LessMore 4 | https://github.com/vislearn/esac 5 | 6 | Copyright (c) 2016, TU Dresden 7 | Copyright (c) 2020, Heidelberg University 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | * Redistributions of source code must retain the above copyright 13 | notice, this list of conditions and the following disclaimer. 14 | * Redistributions in binary form must reproduce the above copyright 15 | notice, this list of conditions and the following disclaimer in the 16 | documentation and/or other materials provided with the distribution. 17 | * Neither the name of the TU Dresden, Heidelberg University nor the 18 | names of its contributors may be used to endorse or promote products 19 | derived from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL TU DRESDEN OR HEIDELBERG UNIVERSITY BE LIABLE FOR ANY 25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include "opencv2/opencv.hpp" 36 | 37 | /** Several important types used throughout all this code. If types have to be changed, it can be done here, conveniently. */ 38 | 39 | namespace dsacstar 40 | { 41 | // scene pose type (OpenCV convention: axis-angle + translation) 42 | typedef std::pair<cv::Mat, cv::Mat> pose_t; 43 | // camera transformation type (inverted scene pose as 4x4 matrix) 44 | typedef cv::Mat_<double> trans_t; 45 | // ATen accessor type 46 | typedef at::TensorAccessor<float, 3> coord_t; 47 | } 48 | -------------------------------------------------------------------------------- /dsacstar/dsacstar_util_rgbd.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2020, Heidelberg University 3 | 4 | This code fragment contains adapted sources from PyTorch. See license agreement of PyTorch below. 
5 | 6 | From PyTorch: 7 | 8 | Copyright (c) 2016- Facebook, Inc (Adam Paszke) 9 | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 10 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 11 | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 12 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 13 | Copyright (c) 2011-2013 NYU (Clement Farabet) 14 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 15 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 16 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 17 | 18 | From Caffe2: 19 | 20 | Copyright (c) 2016-present, Facebook Inc. All rights reserved. 21 | 22 | All contributions by Facebook: 23 | Copyright (c) 2016 Facebook Inc. 24 | 25 | All contributions by Google: 26 | Copyright (c) 2015 Google Inc. 27 | All rights reserved. 28 | 29 | All contributions by Yangqing Jia: 30 | Copyright (c) 2015 Yangqing Jia 31 | All rights reserved. 32 | 33 | All contributions from Caffe: 34 | Copyright(c) 2013, 2014, 2015, the respective contributors 35 | All rights reserved. 36 | 37 | All other contributions: 38 | Copyright(c) 2015, 2016 the respective contributors 39 | All rights reserved. 40 | 41 | Caffe2 uses a copyright model similar to Caffe: each contributor holds 42 | copyright over their contributions to Caffe2. The project versioning records 43 | all such contribution and copyright details. If a contributor wants to further 44 | mark their specific copyright on a particular contribution, they should 45 | indicate their copyright solely in the commit message of the change when it is 46 | committed. 47 | 48 | All rights reserved. 49 | 50 | Redistribution and use in source and binary forms, with or without 51 | modification, are permitted provided that the following conditions are met: 52 | 53 | 1. Redistributions of source code must retain the above copyright 54 | notice, this list of conditions and the following disclaimer. 55 | 56 | 2. Redistributions in binary form must reproduce the above copyright 57 | notice, this list of conditions and the following disclaimer in the 58 | documentation and/or other materials provided with the distribution. 59 | 60 | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America 61 | Heidelberg University and IDIAP Research Institute nor the names of its contributors may be 62 | used to endorse or promote products derived from this software without 63 | specific prior written permission. 64 | 65 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 69 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 75 | POSSIBILITY OF SUCH DAMAGE. 
76 | 77 | */ 78 | 79 | #pragma once 80 | 81 | 82 | //#include "opencv2/core/types_c.h" 83 | //#include "opencv2/calib3d/calib3d_c.h" 84 | 85 | /* 86 | * @brief reimplementation of PyTorch svd_backward in C++ 87 | * 88 | * ref: https://github.com/pytorch/pytorch/blob/1d427fd6f66b0822db62f30e7654cae95abfd207/tools/autograd/templates/Functions.cpp 89 | * ref: https://j-towns.github.io/papers/svd-derivative.pdf 90 | * 91 | * This makes no assumption on the signs of sigma. 92 | * 93 | * @param grad: gradients w.r.t U, W, V 94 | * @param self: matrix decomposed by SVD 95 | * @param raw_u: U from SVD output 96 | * @param sigma: W from SVD output 97 | * @param raw_v: V from SVD output 98 | * 99 | */ 100 | 101 | cv::Mat svd_backward( 102 | const std::vector &grads, 103 | const cv::Mat& self, 104 | const cv::Mat& raw_u, 105 | const cv::Mat& sigma, 106 | const cv::Mat& raw_v) 107 | { 108 | auto m = self.rows; 109 | auto n = self.cols; 110 | auto k = sigma.cols; 111 | auto gsigma = grads[1]; 112 | 113 | auto u = raw_u; 114 | auto v = raw_v; 115 | auto gu = grads[0]; 116 | auto gv = grads[2]; 117 | 118 | auto vt = v.t(); 119 | 120 | cv::Mat sigma_term; 121 | 122 | if (!gsigma.empty()) 123 | { 124 | sigma_term = (u * cv::Mat::diag(gsigma)) * vt; 125 | } 126 | else 127 | { 128 | sigma_term = cv::Mat::zeros(self.size(), self.type()); 129 | } 130 | // in case that there are no gu and gv, we can avoid the series of kernel 131 | // calls below 132 | if (gv.empty() && gu.empty()) 133 | { 134 | return sigma_term; 135 | } 136 | 137 | auto ut = u.t(); 138 | auto im = cv::Mat::eye((int)m, (int)m, self.type()); 139 | auto in = cv::Mat::eye((int)n, (int)n, self.type()); 140 | auto sigma_mat = cv::Mat::diag(sigma); 141 | 142 | cv::Mat sigma_mat_inv; 143 | cv::pow(sigma, -1, sigma_mat_inv); 144 | sigma_mat_inv = cv::Mat::diag(sigma_mat_inv); 145 | 146 | cv::Mat sigma_sq, sigma_expanded_sq; 147 | cv::pow(sigma, 2, sigma_sq); 148 | sigma_expanded_sq = cv::repeat(sigma_sq, sigma_mat.rows, 1); 149 | 150 | cv::Mat F = sigma_expanded_sq - sigma_expanded_sq.t(); 151 | // The following two lines invert values of F, and fills the diagonal with 0s. 152 | // Notice that F currently has 0s on diagonal. So we fill diagonal with +inf 153 | // first to prevent nan from appearing in backward of this function. 154 | F.diag().setTo(std::numeric_limits::max()); 155 | cv::pow(F, -1, F); 156 | 157 | cv::Mat u_term, v_term; 158 | 159 | if (!gu.empty()) 160 | { 161 | cv::multiply(F, ut*gu-gu.t()*u, u_term); 162 | u_term = (u * u_term) * sigma_mat; 163 | if (m > k) 164 | { 165 | u_term = u_term + ((im - u*ut)*gu)*sigma_mat_inv; 166 | } 167 | u_term = u_term * vt; 168 | } 169 | else 170 | { 171 | u_term = cv::Mat::zeros(self.size(), self.type()); 172 | } 173 | 174 | if (!gv.empty()) 175 | { 176 | auto gvt = gv.t(); 177 | cv::multiply(F, vt*gv - gvt*v, v_term); 178 | v_term = (sigma_mat*v_term) * vt; 179 | if (n > k) 180 | { 181 | v_term = v_term + sigma_mat_inv*(gvt*(in - v*vt)); 182 | } 183 | v_term = u * v_term; 184 | } 185 | else 186 | { 187 | v_term = cv::Mat::zeros(self.size(), self.type()); 188 | } 189 | 190 | return u_term + sigma_term + v_term; 191 | } 192 | 193 | /* 194 | * @brief Compute partial derivatives of the matrix product for each multiplied matrix. 195 | * This wrapper function avoids unnecessary computation 196 | * 197 | * @param _Amat: First multiplied matrix. 198 | * @param _Bmat: Second multiplied matrix. 199 | * @param _dABdA Output parameter: First output derivative matrix. Pass cv::noArray() if not needed. 
200 | * @param _dABdB Output parameter: Second output derivative matrix. Pass cv::noArray() if not needed. 201 | * 202 | */ 203 | void matMulDerivWrapper( 204 | cv::InputArray _Amat, 205 | cv::InputArray _Bmat, 206 | cv::OutputArray _dABdA, 207 | cv::OutputArray _dABdB) 208 | { 209 | cv::Mat A = _Amat.getMat(), B = _Bmat.getMat(); 210 | 211 | if (_dABdA.needed()) 212 | { 213 | _dABdA.create(A.rows*B.cols, A.rows*A.cols, A.type()); 214 | } 215 | 216 | if (_dABdB.needed()) 217 | { 218 | _dABdB.create(A.rows*B.cols, B.rows*B.cols, A.type()); 219 | } 220 | 221 | CvMat matA = A, matB = B, c_dABdA=_dABdA.getMat(), c_dABdB=_dABdB.getMat(); 222 | cvCalcMatMulDeriv(&matA, &matB, _dABdA.needed() ? &c_dABdA : 0, _dABdB.needed() ? &c_dABdB : 0); 223 | } 224 | 225 | /* 226 | * @brief Computes extrinsic camera parameters using Kabsch algorithm. 227 | * If jacobean matrix is passed as argument, it further computes the analytical gradients. 228 | * 229 | * @param imgdPts: measurements 230 | * @param objPts: scene points 231 | * @param extCam Output parameter: extrinsic camera matrix (i.e. rotation vector and translation vector) 232 | * @param _jacobean Output parameter: 6x3N jacobean matrix of rotation and translation vector 233 | * w.r.t scene point coordinates. 234 | * If gradient computation is not successful, jacobean matrix is set empty. 235 | * 236 | */ 237 | void kabsch( 238 | const std::vector& imgdPts, 239 | const std::vector& objPts, 240 | dsacstar::pose_t& extCam, 241 | cv::OutputArray _jacobean=cv::noArray()) 242 | { 243 | unsigned int N = objPts.size(); //number of scene points 244 | bool calc = _jacobean.needed(); //check if computation of gradient is required 245 | bool degenerate = false; //indicate if SVD gives degenerate case, i.e. non-distinct or zero singular values 246 | 247 | cv::Mat P, X, Pc, Xc; //Nx3 248 | cv::Mat A, U, W, Vt, V, D, R; //3x3 249 | cv::Mat cx, cp, r, t; //1x3 250 | cv::Mat invN; //1xN 251 | cv::Mat gRodr; //9x3 252 | 253 | // construct the datasets P and X from input vectors, set false to avoid data copying 254 | P = cv::Mat(imgdPts, false).reshape(1, N); 255 | X = cv::Mat(objPts, false).reshape(1, N); 256 | 257 | // compute centroid as average of each coordinate axis 258 | invN = cv::Mat(1, N, CV_32F, 1.f/N); //average filter 259 | cx = invN * X; 260 | cp = invN * P; 261 | 262 | // move centroid of datasets to origin 263 | Xc = X - cv::repeat(cx, N, 1); 264 | Pc = P - cv::repeat(cp, N, 1); 265 | 266 | // compute covariance matrix 267 | A = Pc.t() * Xc; 268 | 269 | // compute SVD of covariance matrix 270 | cv::SVD::compute(A, W, U, Vt); 271 | 272 | // degenerate if any singular value is zero 273 | if ((unsigned int)cv::countNonZero(W) != (unsigned int)W.total()) 274 | degenerate = true; 275 | 276 | // degenerate if singular values are not distinct 277 | if (std::abs(W.at(0,0)-W.at(1,0)) < 1e-6 278 | || std::abs(W.at(0,0)-W.at(2,0)) < 1e-6 279 | || std::abs(W.at(1,0)-W.at(2,0)) < 1e-6) 280 | degenerate = true; 281 | 282 | // for correcting rotation matrix to ensure a right-handed coordinate system 283 | float d = cv::determinant(U * Vt); 284 | 285 | D = (cv::Mat_(3,3) << 286 | 1., 0., 0., 287 | 0., 1., 0., 288 | 0., 0., d ); 289 | 290 | // calculates rotation matrix R 291 | R = U * (D * Vt); 292 | 293 | // convert rotation matrix to rotation vector, 294 | // if needed, also compute jacobean matrix of rotation matrix w.r.t rotation vector 295 | calc ? 
cv::Rodrigues(R, r, gRodr) : cv::Rodrigues(R, r); 296 | 297 | // calculates translation vector 298 | t = cp - cx * R.t(); //equiv: cp - (R*cx.t()).t(); 299 | 300 | // store results 301 | extCam.first = cv::Mat_(r.reshape(1, 3)); 302 | extCam.second = cv::Mat_(t.reshape(1, 3)); 303 | 304 | // end here no gradient is required 305 | if (!calc) 306 | return; 307 | 308 | // if SVD is degenerate, return empty jacobean matrix 309 | if (degenerate) 310 | { 311 | _jacobean.release(); 312 | return; 313 | } 314 | 315 | // allocate matrix data 316 | _jacobean.create(6, N*3, CV_64F); 317 | cv::Mat jacobean = _jacobean.getMat(); 318 | 319 | // cv::Mat dRidU, dRidVt, dRidV, dRidA, dRidXc, dRidX; 320 | cv::Mat dRdU, dRdVt; //9x9 321 | cv::Mat dAdXc; //9x3N 322 | cv::Mat dtdR; //3x9 323 | cv::Mat dtdcx; //3x3 324 | cv::Mat dcxdX, drdX, dtdX; //3x3N 325 | cv::Mat dRdX = cv::Mat_::zeros(9, N*3); //9x3N 326 | 327 | // jacobean matrices of each dot product operation in kabsch algorithm 328 | matMulDerivWrapper(U, Vt, dRdU, dRdVt); 329 | matMulDerivWrapper(Pc.t(), Xc, cv::noArray(), dAdXc); 330 | matMulDerivWrapper(R, cx.t(), dtdR, dtdcx); 331 | matMulDerivWrapper(invN, X, cv::noArray(), dcxdX); 332 | 333 | V = Vt.t(); 334 | W = W.reshape(1, 1); 335 | 336 | // #pragma omp parallel for 337 | for (int i = 0; i < 9; ++i) 338 | { 339 | cv::Mat dRidU, dRidVt, dRidV, dRidA; //3x3 340 | cv::Mat dRidXc, dRidX; //Nx3 341 | 342 | dRidU = dRdU.row(i).reshape(1, 3); 343 | dRidVt = dRdVt.row(i).reshape(1, 3); 344 | 345 | dRidV = dRidVt.t(); 346 | 347 | //W is not used in computation of R, no gradient of W is needed 348 | std::vector grads{dRidU, cv::Mat(), dRidV}; 349 | 350 | dRidA = svd_backward(grads, A, U, W, V); 351 | dRidA = dRidA.reshape(1, 1); 352 | dRidXc = dRidA * dAdXc; 353 | dRidXc = dRidXc.reshape(1, N); 354 | dRidX = cv::Mat::zeros(dRidXc.size(), dRidXc.type()); 355 | 356 | int bstep = dRidXc.step/CV_ELEM_SIZE(dRidXc.type()); 357 | 358 | // #pragma omp parallel for 359 | for (int j = 0; j < 3; ++j) 360 | { 361 | // compute dRidXj = dRidXcj * dXcjdXj 362 | float* pdRidXj = (float*)dRidX.data + j; 363 | const float* pdRidXcj = (const float*)dRidXc.data + j; 364 | 365 | float tmp = 0.f; 366 | for (unsigned int k = 0; k < N; ++k) 367 | { 368 | tmp += pdRidXcj[k*bstep]; 369 | } 370 | tmp /= N; 371 | 372 | for (unsigned int k = 0; k < N; ++k) 373 | { 374 | pdRidXj[k*bstep] = pdRidXcj[k*bstep] - tmp; 375 | } 376 | } 377 | 378 | dRidX = dRidX.reshape(1, 1); 379 | dRidX.copyTo(dRdX.rowRange(i, i+1)); 380 | } 381 | 382 | drdX = gRodr.t() * dRdX; 383 | drdX.copyTo(jacobean.rowRange(0, 3)); 384 | dtdX = - (dtdR * dRdX + dtdcx * dcxdX); 385 | dtdX.copyTo(jacobean.rowRange(3, 6)); 386 | 387 | } 388 | 389 | /** 390 | * @brief Checks whether the given matrix contains NaN entries. 391 | * @param m Input matrix. 392 | * @return True if m contrains NaN entries. 
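 *
 * Implementation note: this relies on the IEEE floating point property that NaN != NaN, so the
 * element-wise comparison m != m is non-zero exactly at the NaN entries.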
393 | */ 394 | inline bool containsNaNs(const cv::Mat& m) 395 | { 396 | return cv::sum(cv::Mat(m != m))[0] > 0; 397 | } 398 | 399 | /* 400 | * @brief Computes gradient of Kabsch algorithm and differentiation 401 | * using central finite differences 402 | * 403 | * @param imgdPts: measurement points 404 | * @param objPts: scene points 405 | * @param jacobean Output parameter: 6x3N jacobean matrix of rotation and translation vector 406 | * w.r.t scene point coordinates 407 | * @param eps: step size in finite difference approximation 408 | * 409 | */ 410 | void dKabschFD( 411 | std::vector& imgdPts, 412 | std::vector objPts, 413 | cv::OutputArray _jacobean, 414 | float eps = 0.001f) 415 | { 416 | _jacobean.create(6, objPts.size()*3, CV_64F); 417 | cv::Mat jacobean = _jacobean.getMat(); 418 | 419 | for (unsigned int i = 0; i < objPts.size(); ++i) 420 | { 421 | for (unsigned int j = 0; j < 3; ++j) 422 | { 423 | 424 | if(j == 0) objPts[i].x += eps; 425 | else if(j == 1) objPts[i].y += eps; 426 | else if(j == 2) objPts[i].z += eps; 427 | 428 | // forward step 429 | 430 | dsacstar::pose_t fStep; 431 | kabsch(imgdPts, objPts, fStep); 432 | 433 | if(j == 0) objPts[i].x -= 2 * eps; 434 | else if(j == 1) objPts[i].y -= 2 * eps; 435 | else if(j == 2) objPts[i].z -= 2 * eps; 436 | 437 | // backward step 438 | dsacstar::pose_t bStep; 439 | kabsch(imgdPts, objPts, bStep); 440 | 441 | if(j == 0) objPts[i].x += eps; 442 | else if(j == 1) objPts[i].y += eps; 443 | else if(j == 2) objPts[i].z += eps; 444 | 445 | // gradient calculation 446 | fStep.first = (fStep.first - bStep.first) / (2 * eps); 447 | fStep.second = (fStep.second - bStep.second) / (2 * eps); 448 | 449 | fStep.first.copyTo(jacobean.col(i * 3 + j).rowRange(0, 3)); 450 | fStep.second.copyTo(jacobean.col(i * 3 + j).rowRange(3, 6)); 451 | 452 | if(containsNaNs(jacobean.col(i * 3 + j))) 453 | jacobean.setTo(0); 454 | 455 | } 456 | } 457 | 458 | } 459 | 460 | void transform( 461 | const std::vector& objPts, 462 | const dsacstar::pose_t& transform, 463 | std::vector& transformedPts) 464 | { 465 | 466 | cv::Mat rot; 467 | cv::Rodrigues(transform.first, rot); 468 | 469 | // P = RX + T 470 | for (unsigned int i = 0; i < objPts.size(); ++i) 471 | { 472 | cv::Mat_ res= rot * cv::Mat_(objPts[i], false); 473 | 474 | cv::add(res, transform.second, res); 475 | 476 | transformedPts.push_back(cv::Point3f( 477 | res.at(0,0), 478 | res.at(1,0), 479 | res.at(2,0))); 480 | } 481 | } -------------------------------------------------------------------------------- /dsacstar/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import CppExtension, BuildExtension 3 | import os 4 | 5 | opencv_inc_dir = '' # directory containing OpenCV header files 6 | opencv_lib_dir = '' # directory containing OpenCV library files 7 | 8 | #if not explicitly provided, we try to locate OpenCV in the current Conda environment 9 | conda_env = os.environ['CONDA_PREFIX'] 10 | 11 | if len(conda_env) > 0 and len(opencv_inc_dir) == 0 and len(opencv_lib_dir) == 0: 12 | print("Detected active conda environment:", conda_env) 13 | 14 | opencv_inc_dir = conda_env + '/include/opencv4' 15 | opencv_lib_dir = conda_env + '/lib/opencv4' 16 | 17 | print("Assuming OpenCV dependencies in:") 18 | print(opencv_inc_dir) 19 | print(opencv_lib_dir) 20 | 21 | if len(opencv_inc_dir) == 0: 22 | print("Error: You have to provide an OpenCV include directory. 
Edit this file.") 23 | exit() 24 | if len(opencv_lib_dir) == 0: 25 | print("Error: You have to provide an OpenCV library directory. Edit this file.") 26 | exit() 27 | 28 | setup( 29 | name='dsacstar', 30 | ext_modules=[CppExtension( 31 | name='dsacstar', 32 | sources=['dsacstar.cpp','thread_rand.cpp'], 33 | include_dirs=[opencv_inc_dir], 34 | library_dirs=[opencv_lib_dir], 35 | libraries=['opencv_core','opencv_calib3d'], 36 | extra_compile_args=['-fopenmp'] 37 | )], 38 | cmdclass={'build_ext': BuildExtension}) 39 | -------------------------------------------------------------------------------- /dsacstar/stop_watch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, TU Dresden 3 | Copyright (c) 2017, Heidelberg University 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the TU Dresden, Heidelberg University nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL TU DRESDEN OR HEIDELBERG UNIVERSITY BE LIABLE FOR ANY 21 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | 30 | #pragma once 31 | 32 | #include 33 | 34 | /** 35 | * @brief Class for time measurements. 36 | */ 37 | class StopWatch 38 | { 39 | public: 40 | /** 41 | * @brief Construction. Initializes the stop watch. 42 | */ 43 | StopWatch(){ init(); } 44 | 45 | /** 46 | * @brief Initialization. Starts the time measurement. 47 | * 48 | * @return void 49 | */ 50 | void init() 51 | { 52 | start = std::chrono::high_resolution_clock::now(); 53 | } 54 | 55 | /** 56 | * @brief Stops and restarts the time measurement. 57 | * 58 | * @return float The time in ms since the last init or stop call. 59 | */ 60 | float stop() 61 | { 62 | std::chrono::high_resolution_clock::time_point now; 63 | now = std::chrono::high_resolution_clock::now(); 64 | 65 | std::chrono::high_resolution_clock::duration duration = now - start; 66 | 67 | start = now; 68 | 69 | return static_cast( 70 | 1000.0 * std::chrono::duration_cast>( 71 | duration).count()); 72 | } 73 | 74 | private: 75 | std::chrono::high_resolution_clock::time_point start; // start time of the current measurement. 
76 | }; 77 | -------------------------------------------------------------------------------- /dsacstar/thread_rand.cpp: -------------------------------------------------------------------------------- 1 | #include "thread_rand.h" 2 | #include <omp.h> 3 | 4 | std::vector<std::mt19937> ThreadRand::generators; 5 | bool ThreadRand::initialised = false; 6 | 7 | void ThreadRand::forceInit(unsigned seed) 8 | { 9 | initialised = false; 10 | init(seed); 11 | } 12 | 13 | void ThreadRand::init(unsigned seed) 14 | { 15 | #pragma omp critical 16 | { 17 | if(!initialised) 18 | { 19 | unsigned nThreads = omp_get_max_threads(); 20 | 21 | for(unsigned i = 0; i < nThreads; i++) 22 | { 23 | generators.push_back(std::mt19937()); 24 | generators[i].seed(i+seed); 25 | } 26 | 27 | initialised = true; 28 | } 29 | } 30 | } 31 | 32 | int ThreadRand::irand(int min, int max, int tid) 33 | { 34 | std::uniform_int_distribution<int> dist(min, max); 35 | 36 | unsigned threadID = omp_get_thread_num(); 37 | if(tid >= 0) threadID = tid; 38 | 39 | if(!initialised) init(); 40 | 41 | return dist(ThreadRand::generators[threadID]); 42 | } 43 | 44 | double ThreadRand::drand(double min, double max, int tid) 45 | { 46 | std::uniform_real_distribution<double> dist(min, max); 47 | 48 | unsigned threadID = omp_get_thread_num(); 49 | if(tid >= 0) threadID = tid; 50 | 51 | if(!initialised) init(); 52 | 53 | return dist(ThreadRand::generators[threadID]); 54 | } 55 | 56 | double ThreadRand::dgauss(double mean, double stdDev, int tid) 57 | { 58 | std::normal_distribution<double> dist(mean, stdDev); 59 | 60 | unsigned threadID = omp_get_thread_num(); 61 | if(tid >= 0) threadID = tid; 62 | 63 | if(!initialised) init(); 64 | 65 | return dist(ThreadRand::generators[threadID]); 66 | } 67 | 68 | int irand(int incMin, int excMax, int tid) 69 | { 70 | return ThreadRand::irand(incMin, excMax - 1, tid); 71 | } 72 | 73 | double drand(double incMin, double incMax, int tid) 74 | { 75 | return ThreadRand::drand(incMin, incMax, tid); 76 | } 77 | 78 | int igauss(int mean, int stdDev, int tid) 79 | { 80 | return (int) ThreadRand::dgauss(mean, stdDev, tid); 81 | } 82 | 83 | double dgauss(double mean, double stdDev, int tid) 84 | { 85 | return ThreadRand::dgauss(mean, stdDev, tid); 86 | } -------------------------------------------------------------------------------- /dsacstar/thread_rand.h: -------------------------------------------------------------------------------- 1 | /* 2 | Based on the DSAC++ and ESAC code. 3 | https://github.com/vislearn/LessMore 4 | https://github.com/vislearn/esac 5 | 6 | Copyright (c) 2016, TU Dresden 7 | Copyright (c) 2019, Heidelberg University 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | * Redistributions of source code must retain the above copyright 13 | notice, this list of conditions and the following disclaimer. 14 | * Redistributions in binary form must reproduce the above copyright 15 | notice, this list of conditions and the following disclaimer in the 16 | documentation and/or other materials provided with the distribution. 17 | * Neither the name of the TU Dresden, Heidelberg University nor the 18 | names of its contributors may be used to endorse or promote products 19 | derived from this software without specific prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL TU DRESDEN OR HEIDELBERG UNIVERSITY BE LIABLE FOR ANY 25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include 36 | 37 | /** Classes and methods for generating random numbers in multi-threaded programs. */ 38 | 39 | /** 40 | * @brief Provides random numbers for multiple threads. 41 | * 42 | * Singelton class. Holds a random number generator for each thread and gives random numbers for the current thread. 43 | */ 44 | class ThreadRand 45 | { 46 | public: 47 | /** 48 | * @brief Returns a random integer (uniform distribution). 49 | * 50 | * @param min Minimum value of the random integer (inclusive). 51 | * @param max Maximum value of the random integer (exclusive). 52 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 53 | * @return int Random integer value. 54 | */ 55 | static int irand(int min, int max, int tid = -1); 56 | 57 | /** 58 | * @brief Returns a random double value (uniform distribution). 59 | * 60 | * @param min Minimum value of the random double (inclusive). 61 | * @param max Maximum value of the random double (inclusive). 62 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 63 | * @return double Random double value. 64 | */ 65 | static double drand(double min, double max, int tid = -1); 66 | 67 | /** 68 | * @brief Returns a random double value (Gauss distribution). 69 | * 70 | * @param mean Mean of the Gauss distribution to sample from. 71 | * @param stdDev Standard deviation of the Gauss distribution to sample from. 72 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 73 | * @return double Random double value. 74 | */ 75 | static double dgauss(double mean, double stdDev, int tid = -1); 76 | 77 | /** 78 | * @brief Re-Initialize the object with the given seed. 79 | * 80 | * @param seed Seed to initialize the random number generators (seed is incremented by one for each generator). 81 | * @return void 82 | */ 83 | static void forceInit(unsigned seed); 84 | 85 | /** 86 | * @brief List of random number generators. One for each thread. 87 | * 88 | */ 89 | static std::vector generators; 90 | 91 | /** 92 | * @brief Initialize class with the given seed. 93 | * 94 | * Method will create a random number generator for each thread. The given seed 95 | * will be incremented by one for each generator. This methods is automatically 96 | * called when this calss is used the first time. 97 | * 98 | * @param seed Optional parameter. Seed to be used when initializing the generators. Will be incremented by one for each generator. 
99 | * @return void 100 | */ 101 | static void init(unsigned seed = 1305); 102 | 103 | private: 104 | /** 105 | * @brief True if the class has been initialized already 106 | */ 107 | static bool initialised; 108 | }; 109 | 110 | /** 111 | * @brief Returns a random integer (uniform distribution). 112 | * 113 | * This method used the ThreadRand class. 114 | * 115 | * @param min Minimum value of the random integer (inclusive). 116 | * @param max Maximum value of the random integer (exclusive). 117 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 118 | * @return int Random integer value. 119 | */ 120 | int irand(int incMin, int excMax, int tid = -1); 121 | /** 122 | * @brief Returns a random double value (uniform distribution). 123 | * 124 | * This method used the ThreadRand class. 125 | * 126 | * @param min Minimum value of the random double (inclusive). 127 | * @param max Maximum value of the random double (inclusive). 128 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 129 | * @return double Random double value. 130 | */ 131 | double drand(double incMin, double incMax, int tid = -1); 132 | 133 | /** 134 | * @brief Returns a random integer value (Gauss distribution). 135 | * 136 | * This method used the ThreadRand class. 137 | * 138 | * @param mean Mean of the Gauss distribution to sample from. 139 | * @param stdDev Standard deviation of the Gauss distribution to sample from. 140 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 141 | * @return double Random integer value. 142 | */ 143 | int igauss(int mean, int stdDev, int tid = -1); 144 | 145 | /** 146 | * @brief Returns a random double value (Gauss distribution). 147 | * 148 | * This method used the ThreadRand class. 149 | * 150 | * @param mean Mean of the Gauss distribution to sample from. 151 | * @param stdDev Standard deviation of the Gauss distribution to sample from. 152 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 153 | * @return double Random double value. 
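 *
 * Example (illustrative): dgauss(0.0, 1.0) draws one sample from a standard normal distribution.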
154 | */ 155 | double dgauss(double mean, double stdDev, int tid = -1); 156 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ace 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - anaconda 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - _libgcc_mutex=0.1=main 10 | - _openmp_mutex=5.1=1_gnu 11 | - alsa-lib=1.2.3.2=h166bdaf_0 12 | - appdirs=1.4.4=pyhd3eb1b0_0 13 | - blas=1.0=mkl 14 | - blosc=1.21.3=h6a678d5_0 15 | - brotli=1.0.9=h5eee18b_7 16 | - brotli-bin=1.0.9=h5eee18b_7 17 | - brotlipy=0.7.0=py38h27cfd23_1003 18 | - brunsli=0.1=h2531618_0 19 | - bzip2=1.0.8=h7b6447c_0 20 | - c-ares=1.19.0=h5eee18b_0 21 | - ca-certificates=2023.01.10=h06a4308_0 22 | - cairo=1.16.0=h6cf1ce9_1008 23 | - certifi=2022.12.7=py38h06a4308_0 24 | - cffi=1.15.1=py38h5eee18b_3 25 | - cfitsio=3.470=h5893167_7 26 | - charls=2.2.0=h2531618_0 27 | - charset-normalizer=2.0.4=pyhd3eb1b0_0 28 | - cloudpickle=2.0.0=pyhd3eb1b0_0 29 | - contourpy=1.0.5=py38hdb19cb5_0 30 | - cryptography=39.0.1=py38h9ce1e76_0 31 | - cuda-cudart=11.8.89=0 32 | - cuda-cupti=11.8.87=0 33 | - cuda-libraries=11.8.0=0 34 | - cuda-nvrtc=11.8.89=0 35 | - cuda-nvtx=11.8.86=0 36 | - cuda-runtime=11.8.0=0 37 | - cycler=0.11.0=pyhd8ed1ab_0 38 | - cytoolz=0.12.0=py38h5eee18b_0 39 | - dask-core=2022.7.0=py38h06a4308_0 40 | - dataclasses=0.8=pyhc8e2a94_3 41 | - dbus=1.13.6=h48d8840_2 42 | - expat=2.4.8=h27087fc_0 43 | - ffmpeg=4.3=hf484d3e_0 44 | - filelock=3.9.0=py38h06a4308_0 45 | - flit-core=3.8.0=py38h06a4308_0 46 | - fontconfig=2.14.0=h8e229c2_0 47 | - fonttools=4.25.0=pyhd3eb1b0_0 48 | - freetype=2.12.1=h4a9f257_0 49 | - freetype-py=2.3.0=pyhd8ed1ab_0 50 | - fsspec=2023.3.0=py38h06a4308_0 51 | - future=0.18.3=pyhd8ed1ab_0 52 | - gettext=0.19.8.1=h73d1719_1008 53 | - giflib=5.2.1=h5eee18b_3 54 | - glib=2.68.4=h9c3ff4c_1 55 | - glib-tools=2.68.4=h9c3ff4c_1 56 | - gmp=6.2.1=h295c915_3 57 | - gmpy2=2.1.2=py38heeb90bb_0 58 | - gnutls=3.6.15=he1e5248_0 59 | - graphite2=1.3.13=h58526e2_1001 60 | - gst-plugins-base=1.18.5=hf529b03_0 61 | - gstreamer=1.18.5=h76c114f_0 62 | - harfbuzz=2.9.1=h83ec7ef_1 63 | - hdf5=1.10.6=nompi_h6a2412b_1114 64 | - icu=68.2=h9c3ff4c_0 65 | - idna=3.4=py38h06a4308_0 66 | - imagecodecs-lite=2019.12.3=py38h71d37f0_5 67 | - imageio=2.26.0=py38h06a4308_0 68 | - importlib_resources=5.12.0=pyhd8ed1ab_0 69 | - intel-openmp=2021.4.0=h06a4308_3561 70 | - jasper=1.900.1=h07fcdf6_1006 71 | - jinja2=3.1.2=py38h06a4308_0 72 | - joblib=1.1.1=py38h06a4308_0 73 | - jpeg=9e=h5eee18b_1 74 | - jxrlib=1.1=h7b6447c_2 75 | - kiwisolver=1.4.4=py38h6a678d5_0 76 | - krb5=1.19.4=h568e23c_0 77 | - lame=3.100=h7b6447c_0 78 | - lcms2=2.12=h3be6417_0 79 | - ld_impl_linux-64=2.38=h1181459_1 80 | - lerc=3.0=h295c915_0 81 | - libaec=1.0.4=he6710b0_1 82 | - libblas=3.9.0=12_linux64_mkl 83 | - libbrotlicommon=1.0.9=h5eee18b_7 84 | - libbrotlidec=1.0.9=h5eee18b_7 85 | - libbrotlienc=1.0.9=h5eee18b_7 86 | - libcblas=3.9.0=12_linux64_mkl 87 | - libclang=11.1.0=default_ha53f305_1 88 | - libcublas=11.11.3.6=0 89 | - libcufft=10.9.0.58=0 90 | - libcufile=1.6.0.25=0 91 | - libcurand=10.3.2.56=0 92 | - libcurl=7.88.1=h91b91d3_0 93 | - libcusolver=11.4.1.48=0 94 | - libcusparse=11.7.5.86=0 95 | - libdeflate=1.17=h5eee18b_0 96 | - libedit=3.1.20221030=h5eee18b_0 97 | - libev=4.33=h7f8727e_1 98 | - libevent=2.1.10=h9b69904_4 99 | - libffi=3.4.2=h6a678d5_6 100 | - libgcc-ng=11.2.0=h1234567_1 101 | - libgfortran-ng=11.2.0=h00389a5_1 
102 | - libgfortran5=11.2.0=h1234567_1 103 | - libglib=2.68.4=h174f98d_1 104 | - libglu=9.0.0=hf484d3e_1 105 | - libgomp=11.2.0=h1234567_1 106 | - libiconv=1.16=h7f8727e_2 107 | - libidn2=2.3.2=h7f8727e_0 108 | - liblapack=3.9.0=12_linux64_mkl 109 | - liblapacke=3.9.0=12_linux64_mkl 110 | - libllvm11=11.1.0=hf817b99_2 111 | - libnghttp2=1.46.0=hce63b2e_0 112 | - libnpp=11.8.0.86=0 113 | - libnvjpeg=11.9.0.86=0 114 | - libogg=1.3.4=h7f98852_1 115 | - libopencv=4.4.0=py38_2 116 | - libopus=1.3.1=h7f98852_1 117 | - libpng=1.6.39=h5eee18b_0 118 | - libpq=13.3=hd57d9b9_0 119 | - libssh2=1.10.0=h8f2d780_0 120 | - libstdcxx-ng=11.2.0=h1234567_1 121 | - libtasn1=4.16.0=h27cfd23_0 122 | - libtiff=4.2.0=hf544144_3 123 | - libunistring=0.9.10=h27cfd23_0 124 | - libuuid=2.32.1=h7f98852_1000 125 | - libvorbis=1.3.7=h9c3ff4c_0 126 | - libwebp=1.2.4=h11a3e52_1 127 | - libwebp-base=1.2.4=h5eee18b_1 128 | - libxcb=1.13=h7f98852_1004 129 | - libxkbcommon=1.0.3=he3ba5ed_0 130 | - libxml2=2.9.12=h72842e0_0 131 | - libzopfli=1.0.3=he6710b0_0 132 | - locket=1.0.0=py38h06a4308_0 133 | - lz4-c=1.9.4=h6a678d5_0 134 | - markupsafe=2.1.1=py38h7f8727e_0 135 | - matplotlib=3.7.1=py38h578d9bd_0 136 | - matplotlib-base=3.7.1=py38h417a72b_1 137 | - mkl=2021.4.0=h06a4308_640 138 | - mkl-service=2.4.0=py38h7f8727e_0 139 | - mkl_fft=1.3.1=py38hd3c417c_0 140 | - mkl_random=1.2.2=py38h51133e4_0 141 | - mpc=1.1.0=h10f8cd9_1 142 | - mpfr=4.0.2=hb69a4c5_1 143 | - mpmath=1.2.1=py38h06a4308_0 144 | - munkres=1.1.4=pyh9f0ad1d_0 145 | - mysql-common=8.0.25=ha770c72_2 146 | - mysql-libs=8.0.25=hfa10184_2 147 | - ncurses=6.4=h6a678d5_0 148 | - nettle=3.7.3=hbbd107a_1 149 | - networkx=2.8.4=py38h06a4308_1 150 | - nspr=4.32=h9c3ff4c_1 151 | - nss=3.69=hb5efdd6_1 152 | - numpy=1.23.5=py38h14f4228_0 153 | - numpy-base=1.23.5=py38h31eccc5_0 154 | - opencv=4.4.0=py38_2 155 | - openh264=2.1.1=h4ff587b_0 156 | - openjpeg=2.4.0=h3ad879b_0 157 | - openssl=1.1.1t=h7f8727e_0 158 | - packaging=23.0=py38h06a4308_0 159 | - partd=1.2.0=pyhd3eb1b0_1 160 | - pcre=8.45=h9c3ff4c_0 161 | - pillow=9.4.0=py38h6a678d5_0 162 | - pip=23.0.1=py38h06a4308_0 163 | - pixman=0.40.0=h36c2ea0_0 164 | - pooch=1.4.0=pyhd3eb1b0_0 165 | - pthread-stubs=0.4=h36c2ea0_1001 166 | - py-opencv=4.4.0=py38h23f93f0_2 167 | - pycparser=2.21=pyhd3eb1b0_0 168 | - pyglet=1.5.27=py38h578d9bd_3 169 | - pyopengl=3.1.6=pyhd8ed1ab_1 170 | - pyopenssl=23.0.0=py38h06a4308_0 171 | - pyparsing=3.0.9=pyhd8ed1ab_0 172 | - pyqt=5.12.3=py38ha8c2ead_4 173 | - pyrender=0.1.45=pyh8a188c0_3 174 | - pysocks=1.7.1=py38h06a4308_0 175 | - python=3.8.16=h7a1cb2a_3 176 | - python-dateutil=2.8.2=pyhd8ed1ab_0 177 | - python_abi=3.8=2_cp38 178 | - pytorch=2.0.0=py3.8_cuda11.8_cudnn8.7.0_0 179 | - pytorch-cuda=11.8=h7e8668a_3 180 | - pytorch-mutex=1.0=cuda 181 | - pywavelets=1.4.1=py38h5eee18b_0 182 | - pyyaml=6.0=py38h5eee18b_1 183 | - qt=5.12.9=hda022c4_4 184 | - readline=8.2=h5eee18b_0 185 | - requests=2.28.1=py38h06a4308_1 186 | - scikit-image=0.19.3=py38h6a678d5_1 187 | - scipy=1.10.0=py38h14f4228_1 188 | - setuptools=65.6.3=py38h06a4308_0 189 | - six=1.16.0=pyhd3eb1b0_1 190 | - snappy=1.1.9=h295c915_0 191 | - sqlite=3.41.1=h5eee18b_0 192 | - sympy=1.11.1=py38h06a4308_0 193 | - tifffile=2020.6.3=py_0 194 | - tk=8.6.12=h1ccaba5_0 195 | - toolz=0.12.0=py38h06a4308_0 196 | - torchaudio=2.0.0=py38_cu118 197 | - torchtriton=2.0.0=py38 198 | - torchvision=0.15.0=py38_cu118 199 | - tornado=6.1=py38h0a891b7_3 200 | - trimesh=3.21.2=pyhd8ed1ab_0 201 | - typing_extensions=4.4.0=py38h06a4308_0 202 | - 
urllib3=1.26.14=py38h06a4308_0 203 | - wheel=0.38.4=py38h06a4308_0 204 | - x264=1!157.20191217=h7b6447c_0 205 | - xorg-kbproto=1.0.7=h7f98852_1002 206 | - xorg-libice=1.0.10=h7f98852_0 207 | - xorg-libsm=1.2.3=hd9c2040_1000 208 | - xorg-libx11=1.7.2=h7f98852_0 209 | - xorg-libxau=1.0.9=h7f98852_0 210 | - xorg-libxdmcp=1.1.3=h7f98852_0 211 | - xorg-libxext=1.3.4=h7f98852_1 212 | - xorg-libxrender=0.9.10=h7f98852_1003 213 | - xorg-renderproto=0.11.1=h7f98852_1002 214 | - xorg-xextproto=7.3.0=h7f98852_1002 215 | - xorg-xproto=7.0.31=h7f98852_1007 216 | - xz=5.2.10=h5eee18b_1 217 | - yaml=0.2.5=h7b6447c_0 218 | - zfp=0.5.5=h295c915_6 219 | - zipp=3.15.0=pyhd8ed1ab_0 220 | - zlib=1.2.13=h5eee18b_0 221 | - zstd=1.5.2=ha4553b6_0 222 | - pip: 223 | - pyqt5-sip==4.19.18 224 | - pyqtchart==5.12 225 | - pyqtwebengine==5.12.1 226 | -------------------------------------------------------------------------------- /eval_poses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright © Niantic, Inc. 2022. 3 | 4 | import argparse 5 | import logging 6 | import math 7 | from pathlib import Path 8 | 9 | import cv2 10 | import numpy as np 11 | from scipy.spatial.transform import Rotation 12 | 13 | from dataset import CamLocDataset 14 | 15 | _logger = logging.getLogger(__name__) 16 | 17 | if __name__ == '__main__': 18 | logging.basicConfig(level=logging.INFO) 19 | parser = argparse.ArgumentParser( 20 | description='Compute metrics for a pre-existing poses file.', 21 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 22 | 23 | parser.add_argument('scene', type=Path, help='name of a scene in the dataset folder, e.g. Cambridge_GreatCourt') 24 | parser.add_argument('poses_file', type=Path, help='file containing poses estimated for the input scene') 25 | 26 | opt = parser.parse_args() 27 | 28 | # Load dataset. 29 | testset = CamLocDataset( 30 | opt.scene / "test", 31 | mode=0, # Default for ACE, we don't need scene coordinates/RGB-D. 32 | ) 33 | _logger.info(f"Loaded scene with {len(testset)} frames.") 34 | 35 | # load pre-existing poses 36 | with opt.poses_file.open('r') as f: 37 | frame_poses = f.readlines() 38 | _logger.info(f"Loaded {len(frame_poses)} poses.") 39 | 40 | # Check. 41 | assert len(testset) == len(frame_poses) 42 | 43 | # Keep track of rotation and translation errors for calculation of the median error. 44 | rErrs = [] 45 | tErrs = [] 46 | 47 | # Percentage of frames predicted within certain thresholds from their GT pose. 48 | pct10_5 = 0 49 | pct5 = 0 50 | pct2 = 0 51 | pct1 = 0 52 | 53 | # Iterate over the dataset. 54 | for image_idx, (_, _, gt_pose, _, _, _, _, image_file) in enumerate(testset): 55 | # Parse estimated pose from the input file. 56 | pose_file, qw, qx, qy, qz, tx, ty, tz, _, _, _ = frame_poses[image_idx].split() 57 | 58 | # Check that the files match. 59 | assert Path(image_file).name == pose_file 60 | 61 | # We do everything in np. 62 | gt_pose = gt_pose.numpy() 63 | 64 | # Convert quaternion to rotation matrix. 65 | r_mat = Rotation.from_quat([float(x) for x in [qx, qy, qz, qw]]).as_matrix() 66 | t_vec = np.array([float(x) for x in [tx, ty, tz]]) 67 | 68 | # We saved the inverse pose 69 | estimated_pose_inv = np.eye(4) 70 | estimated_pose_inv[:3, :3] = r_mat 71 | estimated_pose_inv[:3, 3] = t_vec 72 | 73 | # Pose we use for evaluation. 
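        # The poses file stores the inverse transform (estimated_pose_inv, built above from the
        # quaternion and translation), so we invert it here to obtain the camera pose that is
        # compared against gt_pose.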
74 | estimated_pose = np.linalg.inv(estimated_pose_inv) 75 | 76 | # calculate pose errors 77 | t_err = float(np.linalg.norm(gt_pose[0:3, 3] - estimated_pose[0:3, 3])) 78 | 79 | gt_R = gt_pose[0:3, 0:3] 80 | out_R = estimated_pose[0:3, 0:3] 81 | 82 | r_err = np.matmul(out_R, np.transpose(gt_R)) 83 | r_err = cv2.Rodrigues(r_err)[0] 84 | r_err = np.linalg.norm(r_err) * 180 / math.pi 85 | 86 | _logger.info("Rotation Error: %.2fdeg, Translation Error: %.1fcm" % (r_err, t_err * 100)) 87 | 88 | # Save the errors. 89 | rErrs.append(r_err) 90 | tErrs.append(t_err * 100) 91 | 92 | # Check various thresholds. 93 | if r_err < 5 and t_err < 0.1: # 10cm/5deg 94 | pct10_5 += 1 95 | if r_err < 5 and t_err < 0.05: # 5cm/5deg 96 | pct5 += 1 97 | if r_err < 2 and t_err < 0.02: # 2cm/2deg 98 | pct2 += 1 99 | if r_err < 1 and t_err < 0.01: # 1cm/1deg 100 | pct1 += 1 101 | 102 | total_frames = len(rErrs) 103 | assert total_frames == len(testset) 104 | 105 | # Compute median errors. 106 | tErrs.sort() 107 | rErrs.sort() 108 | median_idx = total_frames // 2 109 | median_rErr = rErrs[median_idx] 110 | median_tErr = tErrs[median_idx] 111 | 112 | # Compute final metrics. 113 | pct10_5 = pct10_5 / total_frames * 100 114 | pct5 = pct5 / total_frames * 100 115 | pct2 = pct2 / total_frames * 100 116 | pct1 = pct1 / total_frames * 100 117 | 118 | _logger.info("===================================================") 119 | _logger.info("Test complete.") 120 | 121 | _logger.info('Accuracy:') 122 | _logger.info(f'\t10cm/5deg: {pct10_5:.1f}%') 123 | _logger.info(f'\t5cm/5deg: {pct5:.1f}%') 124 | _logger.info(f'\t2cm/2deg: {pct2:.1f}%') 125 | _logger.info(f'\t1cm/1deg: {pct1:.1f}%') 126 | 127 | _logger.info(f"Median Error: {median_rErr:.1f}deg, {median_tErr:.1f}cm") 128 | -------------------------------------------------------------------------------- /merge_ensemble_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright © Niantic, Inc. 2022. 
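# Each line of an input poses file is expected to have the following format
# (cf. the parsing loop below):
#   image_name qw qx qy qz tx ty tz r_err t_err inlier_count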
3 | 4 | import logging 5 | from argparse import ArgumentParser 6 | from collections import defaultdict 7 | from dataclasses import dataclass, field 8 | from pathlib import Path 9 | from typing import List 10 | 11 | _logger = logging.getLogger(__name__) 12 | 13 | @dataclass 14 | class FrameResult: 15 | inlier_count: int = 0 16 | quaternion: List[float] = field(default_factory=lambda: [1, 0, 0, 0]) 17 | translation: List[float] = field(default_factory=lambda: [0, 0, 0]) 18 | r_err: float = 0 19 | t_err: float = 0 20 | 21 | 22 | if __name__ == '__main__': 23 | logging.basicConfig(level=logging.INFO) 24 | parser = ArgumentParser( 25 | description="Merge results created by multiple nets trained on clustered datasets, " 26 | "keeping the best pose for each image (in terms of inlier count).") 27 | parser.add_argument('poses_path', type=Path, 28 | help="Path to a folder containing the estimated poses for each network.") 29 | parser.add_argument('out_file', type=Path, 30 | help="Path to the output file containing the best pose for each image.") 31 | parser.add_argument('--poses_suffix', type=str, default='.txt', help='Suffix to select a subset of pose files.') 32 | 33 | args = parser.parse_args() 34 | 35 | poses_path: Path = args.poses_path 36 | out_file: Path = args.out_file 37 | 38 | pose_files = sorted(poses_path.glob(f"poses_*{args.poses_suffix}")) 39 | _logger.info(f"Found {len(pose_files)} pose files.") 40 | 41 | frame_poses = defaultdict(FrameResult) 42 | 43 | for in_file in pose_files: 44 | _logger.info(f"Parsing results from: {in_file}") 45 | with in_file.open('r') as f: 46 | for line in f.readlines(): 47 | current_result = FrameResult() 48 | img, current_result.quaternion[0], current_result.quaternion[1], current_result.quaternion[2], \ 49 | current_result.quaternion[3],\ 50 | current_result.translation[0], current_result.translation[1], current_result.translation[2],\ 51 | current_result.r_err, current_result.t_err, current_result.inlier_count = line.split() 52 | 53 | # Convert to the appropriate datatypes. 54 | current_result.inlier_count = int(current_result.inlier_count) 55 | current_result.quaternion = [float(x) for x in current_result.quaternion] 56 | current_result.translation = [float(x) for x in current_result.translation] 57 | current_result.r_err = float(current_result.r_err) 58 | current_result.t_err = float(current_result.t_err) 59 | 60 | # Update global dict if needed. 61 | if frame_poses[img].inlier_count < current_result.inlier_count: 62 | frame_poses[img] = current_result 63 | 64 | _logger.info(f"Found results for {len(frame_poses)} query frames.") 65 | 66 | # Save the output. 67 | with out_file.open('w') as f: 68 | for img_name in sorted(frame_poses.keys()): 69 | frame_result = frame_poses[img_name] 70 | f.write( 71 | f"{img_name} " 72 | f"{' '.join(str(x) for x in frame_result.quaternion)} " 73 | f"{' '.join(str(x) for x in frame_result.translation)} " 74 | f"{frame_result.r_err} {frame_result.t_err} {frame_result.inlier_count}\n") 75 | _logger.info(f"Saved merged poses to: {out_file}") 76 | -------------------------------------------------------------------------------- /scripts/train_12scenes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("12scenes_apt1_kitchen" "12scenes_apt1_living" "12scenes_apt2_bed" "12scenes_apt2_kitchen" "12scenes_apt2_living" "12scenes_apt2_luke" "12scenes_office1_gates362" "12scenes_office1_gates381" "12scenes_office1_lounge" "12scenes_office1_manolis" "12scenes_office2_5a" "12scenes_office2_5b") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/12Scenes" 14 | mkdir -p "$out_dir" 15 | 16 | for scene in ${scenes[*]}; do 17 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 18 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 19 | done 20 | 21 | for scene in ${scenes[*]}; do 22 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -5 | head -1)" 23 | done 24 | -------------------------------------------------------------------------------- /scripts/train_12scenes_pgt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("pgt_12scenes_apt1_kitchen" "pgt_12scenes_apt1_living" "pgt_12scenes_apt2_bed" "pgt_12scenes_apt2_kitchen" "pgt_12scenes_apt2_living" "pgt_12scenes_apt2_luke" "pgt_12scenes_office1_gates362" "pgt_12scenes_office1_gates381" "pgt_12scenes_office1_lounge" "pgt_12scenes_office1_manolis" "pgt_12scenes_office2_5a" "pgt_12scenes_office2_5b") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/pgt_12Scenes" 14 | mkdir -p "$out_dir" 15 | 16 | for scene in ${scenes[*]}; do 17 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 18 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 19 | done 20 | 21 | for scene in ${scenes[*]}; do 22 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -5 | head -1)" 23 | done 24 | -------------------------------------------------------------------------------- /scripts/train_7scenes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("7scenes_chess" "7scenes_fire" "7scenes_heads" "7scenes_office" "7scenes_pumpkin" "7scenes_redkitchen" "7scenes_stairs") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/7Scenes" 14 | mkdir -p "$out_dir" 15 | 16 | for scene in ${scenes[*]}; do 17 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 18 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 19 | done 20 | 21 | for scene in ${scenes[*]}; do 22 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -5 | head -1)" 23 | done 24 | -------------------------------------------------------------------------------- /scripts/train_7scenes_pgt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("pgt_7scenes_chess" "pgt_7scenes_fire" "pgt_7scenes_heads" "pgt_7scenes_office" "pgt_7scenes_pumpkin" "pgt_7scenes_redkitchen" "pgt_7scenes_stairs") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/pgt_7Scenes" 14 | mkdir -p "$out_dir" 15 | 16 | for scene in ${scenes[*]}; do 17 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 18 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 19 | done 20 | 21 | for scene in ${scenes[*]}; do 22 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -5 | head -1)" 23 | done 24 | -------------------------------------------------------------------------------- /scripts/train_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ./train_7scenes.sh 4 | ./train_7scenes_pgt.sh 5 | ./train_12scenes.sh 6 | ./train_12scenes_pgt.sh 7 | ./train_cambridge.sh 8 | ./train_wayspots.sh 9 | ./train_cambridge_ensemble.sh 2 10 | ./train_cambridge_ensemble.sh 3 11 | ./train_cambridge_ensemble.sh 4 12 | ./train_cambridge_ensemble.sh 5 13 | ./train_cambridge_ensemble.sh 6 14 | ./train_cambridge_ensemble.sh 7 15 | -------------------------------------------------------------------------------- /scripts/train_cambridge.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("Cambridge_GreatCourt" "Cambridge_KingsCollege" "Cambridge_OldHospital" "Cambridge_ShopFacade" "Cambridge_StMarysChurch") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/Cambridge" 14 | 15 | mkdir -p "$out_dir" 16 | 17 | for scene in ${scenes[*]}; do 18 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 19 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 20 | done 21 | 22 | for scene in ${scenes[*]}; do 23 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -2 | head -1)" 24 | done 25 | -------------------------------------------------------------------------------- /scripts/train_cambridge_ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("Cambridge_GreatCourt" "Cambridge_KingsCollege" "Cambridge_OldHospital" "Cambridge_ShopFacade" "Cambridge_StMarysChurch") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | merging_exe="${REPO_PATH}/merge_ensemble_results.py" 12 | eval_exe="${REPO_PATH}/eval_poses.py" 13 | 14 | datasets_folder="${REPO_PATH}/datasets" 15 | out_dir="${REPO_PATH}/output/Cambridge_Ensemble" 16 | mkdir -p "$out_dir" 17 | 18 | num_clusters=${1:-1} 19 | 20 | for scene in ${scenes[*]}; do 21 | current_out_dir="$out_dir/$scene" 22 | mkdir -p "$current_out_dir" 23 | 24 | for cluster_idx in $(seq 0 $((num_clusters-1))); do 25 | echo "Training network for scene: $scene and cluster: $cluster_idx/$num_clusters" 26 | 27 | net_file="${current_out_dir}/${cluster_idx}_${num_clusters}.pt" 28 | 29 | [[ -f "$net_file" ]] || python $training_exe "$datasets_folder/$scene" "$net_file" --num_clusters $num_clusters --cluster_idx $cluster_idx | tee "${current_out_dir}/train_${cluster_idx}_${num_clusters}.txt" 30 | python $testing_exe "$datasets_folder/$scene" "$net_file" --session "${cluster_idx}_${num_clusters}" 2>&1 | tee "${current_out_dir}/test_${cluster_idx}_${num_clusters}.txt" 31 | done 32 | 33 | python $merging_exe "$current_out_dir" "$current_out_dir/merged_poses_${num_clusters}.txt" --poses_suffix "_${num_clusters}.txt" 34 | python $eval_exe "$datasets_folder/$scene" "$current_out_dir/merged_poses_${num_clusters}.txt" 2>&1 | tee "$current_out_dir/eval_merged_${num_clusters}.txt" 35 | done 36 | 37 | for scene in ${scenes[*]}; do 38 | current_out_dir="$out_dir/$scene" 39 | echo "${scene}: $(cat "$current_out_dir/eval_merged_${num_clusters}.txt" | tail -1)" 40 | done 41 | -------------------------------------------------------------------------------- /scripts/train_wayspots.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("wayspots_squarebench" "wayspots_bears" "wayspots_cubes" "wayspots_inscription" "wayspots_lawn" "wayspots_map" "wayspots_statue" "wayspots_tendrils" "wayspots_therock" "wayspots_wintersign") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/wayspots" 14 | mkdir -p "$out_dir" 15 | 16 | for scene in ${scenes[*]}; do 17 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 18 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 19 | done 20 | 21 | for scene in ${scenes[*]}; do 22 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -6 | head -1)" 23 | done 24 | -------------------------------------------------------------------------------- /scripts/viz_12scenes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("12scenes_apt1_kitchen" "12scenes_apt1_living" "12scenes_apt2_bed" "12scenes_apt2_kitchen" "12scenes_apt2_living" "12scenes_apt2_luke" "12scenes_office1_gates362" "12scenes_office1_gates381" "12scenes_office1_lounge" "12scenes_office1_manolis" "12scenes_office2_5a" "12scenes_office2_5b") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/viz_maps/12Scenes" 14 | renderings_dir="${REPO_PATH}/output/renderings/12Scenes" 15 | 16 | mkdir -p "$out_dir" 17 | 18 | for scene in ${scenes[*]}; do 19 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_camera_z_offset 2 20 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_pose_error_threshold 10 21 | /usr/bin/ffmpeg -framerate 30 -pattern_type glob -i "$renderings_dir/$scene/*.png" -c:v libx264 -pix_fmt yuv420p "$renderings_dir/$scene.mp4" 22 | done -------------------------------------------------------------------------------- /scripts/viz_7scenes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("7scenes_chess" "7scenes_fire" "7scenes_heads" "7scenes_office" "7scenes_pumpkin" "7scenes_redkitchen" "7scenes_stairs") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/viz_maps/7Scenes" 14 | renderings_dir="${REPO_PATH}/output/renderings/7Scenes" 15 | 16 | mkdir -p "$out_dir" 17 | mkdir -p "$renderings_dir" 18 | 19 | for scene in ${scenes[*]}; do 20 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_camera_z_offset 2 21 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_pose_error_threshold 10 --render_frame_skip 3 22 | /usr/bin/ffmpeg -framerate 30 -pattern_type glob -i "$renderings_dir/$scene/*.png" -c:v libx264 -pix_fmt yuv420p "$renderings_dir/$scene.mp4" 23 | done -------------------------------------------------------------------------------- /scripts/viz_cambridge.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=( "Cambridge_OldHospital" "Cambridge_ShopFacade" "Cambridge_KingsCollege" "Cambridge_GreatCourt" "Cambridge_StMarysChurch") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/viz_maps/Cambridge" 14 | renderings_dir="${REPO_PATH}/output/renderings/Cambridge" 15 | 16 | mkdir -p "$out_dir" 17 | 18 | for scene in ${scenes[*]}; do 19 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_map_error_threshold 50 --render_map_depth_filter 50 20 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_sparse_queries True --render_pose_error_threshold 100 --render_map_depth_filter 50 --render_camera_z_offset 8 21 | /usr/bin/ffmpeg -framerate 30 -pattern_type glob -i "$renderings_dir/$scene/*.png" -c:v libx264 -pix_fmt yuv420p "$renderings_dir/$scene.mp4" 22 | done -------------------------------------------------------------------------------- /scripts/viz_wayspots.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo.
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("wayspots_squarebench" "wayspots_bears" "wayspots_cubes" "wayspots_inscription" "wayspots_lawn" "wayspots_map" "wayspots_statue" "wayspots_tendrils" "wayspots_therock" "wayspots_wintersign") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/viz_maps/wayspots" 14 | renderings_dir="${REPO_PATH}/output/renderings/wayspots" 15 | 16 | mkdir -p "$out_dir" 17 | 18 | for scene in ${scenes[*]}; do 19 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_flipped_portrait True 20 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_flipped_portrait True 21 | /usr/bin/ffmpeg -framerate 30 -pattern_type glob -i "$renderings_dir/$scene/*.png" -c:v libx264 -pix_fmt yuv420p "$renderings_dir/$scene.mp4" 22 | done 23 | 24 | -------------------------------------------------------------------------------- /test_ace.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright © Niantic, Inc. 2022. 3 | 4 | import argparse 5 | import logging 6 | import math 7 | import time 8 | from distutils.util import strtobool 9 | from pathlib import Path 10 | 11 | import cv2 12 | import numpy as np 13 | import torch 14 | from torch.cuda.amp import autocast 15 | from torch.utils.data import DataLoader 16 | 17 | import dsacstar 18 | from ace_network import Regressor 19 | from dataset import CamLocDataset 20 | 21 | import ace_vis_util as vutil 22 | from ace_visualizer import ACEVisualizer 23 | 24 | _logger = logging.getLogger(__name__) 25 | 26 | 27 | def _strtobool(x): 28 | return bool(strtobool(x)) 29 | 30 | 31 | if __name__ == '__main__': 32 | # Setup logging. 33 | logging.basicConfig(level=logging.INFO) 34 | 35 | parser = argparse.ArgumentParser( 36 | description='Test a trained network on a specific scene.', 37 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 38 | 39 | parser.add_argument('scene', type=Path, 40 | help='path to a scene in the dataset folder, e.g. "datasets/Cambridge_GreatCourt"') 41 | 42 | parser.add_argument('network', type=Path, help='path to a network trained for the scene (just the head weights)') 43 | 44 | parser.add_argument('--encoder_path', type=Path, default=Path(__file__).parent / "ace_encoder_pretrained.pt", 45 | help='file containing pre-trained encoder weights') 46 | 47 | parser.add_argument('--session', '-sid', default='', 48 | help='custom session name appended to output files, ' 49 | 'useful to separate different runs of a script') 50 | 51 | parser.add_argument('--image_resolution', type=int, default=480, help='base image resolution') 52 | 53 | # ACE is RGB-only, no need for this param. 54 | # parser.add_argument('--mode', '-m', type=int, default=1, choices=[1, 2], help='test mode: 1 = RGB, 2 = RGB-D') 55 | 56 | # DSACStar RANSAC parameters. ACE Keeps them at default. 57 | parser.add_argument('--hypotheses', '-hyps', type=int, default=64, 58 | help='number of hypotheses, i.e. 
number of RANSAC iterations') 59 | 60 | parser.add_argument('--threshold', '-t', type=float, default=10, 61 | help='inlier threshold in pixels (RGB) or centimeters (RGB-D)') 62 | 63 | parser.add_argument('--inlieralpha', '-ia', type=float, default=100, 64 | help='alpha parameter of the soft inlier count; controls the softness of the ' 65 | 'hypotheses score distribution; lower means softer') 66 | 67 | parser.add_argument('--maxpixelerror', '-maxerrr', type=float, default=100, 68 | help='maximum reprojection (RGB, in px) or 3D distance (RGB-D, in cm) error when checking ' 69 | 'pose consistency towards all measurements; error is clamped to this value for stability') 70 | 71 | # Params for the visualization. If enabled, it will slow down relocalisation considerably. But you get a nice video :) 72 | parser.add_argument('--render_visualization', type=_strtobool, default=False, 73 | help='create a video of the mapping process') 74 | 75 | parser.add_argument('--render_target_path', type=Path, default='renderings', 76 | help='target folder for renderings, visualizer will create a subfolder with the map name') 77 | 78 | parser.add_argument('--render_flipped_portrait', type=_strtobool, default=False, 79 | help='flag for wayspots dataset where images are sideways portrait') 80 | 81 | parser.add_argument('--render_sparse_queries', type=_strtobool, default=False, 82 | help='set to true if your queries are not a smooth video') 83 | 84 | parser.add_argument('--render_pose_error_threshold', type=int, default=20, 85 | help='pose error threshold for the visualisation in cm/deg') 86 | 87 | parser.add_argument('--render_map_depth_filter', type=int, default=10, 88 | help='to clean up the ACE point cloud remove points too far away') 89 | 90 | parser.add_argument('--render_camera_z_offset', type=int, default=4, 91 | help='zoom out of the scene by moving render camera backwards, in meters') 92 | 93 | parser.add_argument('--render_frame_skip', type=int, default=1, 94 | help='skip every xth frame for long and dense query sequences') 95 | 96 | opt = parser.parse_args() 97 | 98 | device = torch.device("cuda") 99 | num_workers = 6 100 | 101 | scene_path = Path(opt.scene) 102 | head_network_path = Path(opt.network) 103 | encoder_path = Path(opt.encoder_path) 104 | session = opt.session 105 | 106 | # Setup dataset. 107 | testset = CamLocDataset( 108 | scene_path / "test", 109 | mode=0, # Default for ACE, we don't need scene coordinates/RGB-D. 110 | image_height=opt.image_resolution, 111 | ) 112 | _logger.info(f'Test images found: {len(testset)}') 113 | 114 | # Setup dataloader. Batch size 1 by default. 115 | testset_loader = DataLoader(testset, shuffle=False, num_workers=6) 116 | 117 | # Load network weights. 118 | encoder_state_dict = torch.load(encoder_path, map_location="cpu") 119 | _logger.info(f"Loaded encoder from: {encoder_path}") 120 | head_state_dict = torch.load(head_network_path, map_location="cpu") 121 | _logger.info(f"Loaded head weights from: {head_network_path}") 122 | 123 | # Create regressor. 124 | network = Regressor.create_from_split_state_dict(encoder_state_dict, head_state_dict) 125 | 126 | # Setup for evaluation. 127 | network = network.to(device) 128 | network.eval() 129 | 130 | # Save the outputs in the same folder as the network being evaluated. 131 | output_dir = head_network_path.parent 132 | scene_name = scene_path.name 133 | # This will contain aggregate scene stats (median translation/rotation errors, and avg processing time per frame). 
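# Note on naming: the session suffix matters for the ensemble workflow, where train_cambridge_ensemble.sh
# passes a per-cluster --session and merge_ensemble_results.py later globs the resulting
# poses_*<session>.txt files to keep the best pose per frame.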
134 | test_log_file = output_dir / f'test_{scene_name}_{opt.session}.txt' 135 | _logger.info(f"Saving test aggregate statistics to: {test_log_file}") 136 | # This will contain each frame's pose (stored as quaternion + translation) and errors. 137 | pose_log_file = output_dir / f'poses_{scene_name}_{opt.session}.txt' 138 | _logger.info(f"Saving per-frame poses and errors to: {pose_log_file}") 139 | 140 | # Setup output files. 141 | test_log = open(test_log_file, 'w', 1) 142 | pose_log = open(pose_log_file, 'w', 1) 143 | 144 | # Metrics of interest. 145 | avg_batch_time = 0 146 | num_batches = 0 147 | 148 | # Keep track of rotation and translation errors for calculation of the median error. 149 | rErrs = [] 150 | tErrs = [] 151 | 152 | # Percentage of frames predicted within certain thresholds from their GT pose. 153 | pct10_5 = 0 154 | pct5 = 0 155 | pct2 = 0 156 | pct1 = 0 157 | 158 | # Generate video of training process 159 | if opt.render_visualization: 160 | # infer rendering folder from map file name 161 | target_path = vutil.get_rendering_target_path( 162 | opt.render_target_path, 163 | opt.network) 164 | ace_visualizer = ACEVisualizer(target_path, 165 | opt.render_flipped_portrait, 166 | opt.render_map_depth_filter, 167 | reloc_vis_error_threshold=opt.render_pose_error_threshold) 168 | 169 | # we need to pass the training set in case the visualiser has to regenerate the map point cloud 170 | trainset = CamLocDataset( 171 | scene_path / "train", 172 | mode=0, # Default for ACE, we don't need scene coordinates/RGB-D. 173 | image_height=opt.image_resolution, 174 | ) 175 | 176 | # Setup dataloader. Batch size 1 by default. 177 | trainset_loader = DataLoader(trainset, shuffle=False, num_workers=6) 178 | 179 | ace_visualizer.setup_reloc_visualisation( 180 | frame_count=len(testset), 181 | data_loader=trainset_loader, 182 | network=network, 183 | camera_z_offset=opt.render_camera_z_offset, 184 | reloc_frame_skip=opt.render_frame_skip) 185 | else: 186 | ace_visualizer = None 187 | 188 | # Testing loop. 189 | testing_start_time = time.time() 190 | with torch.no_grad(): 191 | for image_B1HW, _, gt_pose_B44, _, intrinsics_B33, _, _, filenames in testset_loader: 192 | batch_start_time = time.time() 193 | batch_size = image_B1HW.shape[0] 194 | 195 | image_B1HW = image_B1HW.to(device, non_blocking=True) 196 | 197 | # Predict scene coordinates. 198 | with autocast(enabled=True): 199 | scene_coordinates_B3HW = network(image_B1HW) 200 | 201 | # We need them on the CPU to run RANSAC. 202 | scene_coordinates_B3HW = scene_coordinates_B3HW.float().cpu() 203 | 204 | # Each frame is processed independently. 205 | for frame_idx, (scene_coordinates_3HW, gt_pose_44, intrinsics_33, frame_path) in enumerate( 206 | zip(scene_coordinates_B3HW, gt_pose_B44, intrinsics_B33, filenames)): 207 | 208 | # Extract focal length and principal point from the intrinsics matrix. 209 | focal_length = intrinsics_33[0, 0].item() 210 | ppX = intrinsics_33[0, 2].item() 211 | ppY = intrinsics_33[1, 2].item() 212 | # We support a single focal length. 213 | assert torch.allclose(intrinsics_33[0, 0], intrinsics_33[1, 1]) 214 | 215 | # Remove path from file name 216 | frame_name = Path(frame_path).name 217 | 218 | # Allocate output variable. 219 | out_pose = torch.zeros((4, 4)) 220 | 221 | # Compute the pose via RANSAC. 
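# dsacstar.forward_rgb samples and scores pose hypotheses from the predicted scene
# coordinates (given at the encoder's subsampled resolution, hence network.OUTPUT_SUBSAMPLE),
# fills out_pose in place with the winning pose, and returns its inlier count, which is
# logged per frame and later used by merge_ensemble_results.py to rank ensemble estimates.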
222 | inlier_count = dsacstar.forward_rgb( 223 | scene_coordinates_3HW.unsqueeze(0), 224 | out_pose, 225 | opt.hypotheses, 226 | opt.threshold, 227 | focal_length, 228 | ppX, 229 | ppY, 230 | opt.inlieralpha, 231 | opt.maxpixelerror, 232 | network.OUTPUT_SUBSAMPLE, 233 | ) 234 | 235 | # Calculate translation error. 236 | t_err = float(torch.norm(gt_pose_44[0:3, 3] - out_pose[0:3, 3])) 237 | 238 | # Rotation error. 239 | gt_R = gt_pose_44[0:3, 0:3].numpy() 240 | out_R = out_pose[0:3, 0:3].numpy() 241 | 242 | r_err = np.matmul(out_R, np.transpose(gt_R)) 243 | # Compute angle-axis representation. 244 | r_err = cv2.Rodrigues(r_err)[0] 245 | # Extract the angle. 246 | r_err = np.linalg.norm(r_err) * 180 / math.pi 247 | 248 | _logger.info(f"Rotation Error: {r_err:.2f}deg, Translation Error: {t_err * 100:.1f}cm") 249 | 250 | if ace_visualizer is not None: 251 | ace_visualizer.render_reloc_frame( 252 | query_pose=gt_pose_44.numpy(), 253 | query_file=frame_path, 254 | est_pose=out_pose.numpy(), 255 | est_error=max(r_err, t_err*100), 256 | sparse_query=opt.render_sparse_queries) 257 | 258 | # Save the errors. 259 | rErrs.append(r_err) 260 | tErrs.append(t_err * 100) 261 | 262 | # Check various thresholds. 263 | if r_err < 5 and t_err < 0.1: # 10cm/5deg 264 | pct10_5 += 1 265 | if r_err < 5 and t_err < 0.05: # 5cm/5deg 266 | pct5 += 1 267 | if r_err < 2 and t_err < 0.02: # 2cm/2deg 268 | pct2 += 1 269 | if r_err < 1 and t_err < 0.01: # 1cm/1deg 270 | pct1 += 1 271 | 272 | # Write estimated pose to pose file (inverse). 273 | out_pose = out_pose.inverse() 274 | 275 | # Translation. 276 | t = out_pose[0:3, 3] 277 | 278 | # Rotation to axis angle. 279 | rot, _ = cv2.Rodrigues(out_pose[0:3, 0:3].numpy()) 280 | angle = np.linalg.norm(rot) 281 | axis = rot / angle 282 | 283 | # Axis angle to quaternion. 284 | q_w = math.cos(angle * 0.5) 285 | q_xyz = math.sin(angle * 0.5) * axis 286 | 287 | # Write to output file. All in a single line. 288 | pose_log.write(f"{frame_name} " 289 | f"{q_w} {q_xyz[0].item()} {q_xyz[1].item()} {q_xyz[2].item()} " 290 | f"{t[0]} {t[1]} {t[2]} " 291 | f"{r_err} {t_err} {inlier_count}\n") 292 | 293 | avg_batch_time += time.time() - batch_start_time 294 | num_batches += 1 295 | 296 | total_frames = len(rErrs) 297 | assert total_frames == len(testset) 298 | 299 | # Compute median errors. 300 | tErrs.sort() 301 | rErrs.sort() 302 | median_idx = total_frames // 2 303 | median_rErr = rErrs[median_idx] 304 | median_tErr = tErrs[median_idx] 305 | 306 | # Compute average time. 307 | avg_time = avg_batch_time / num_batches 308 | 309 | # Compute final metrics. 310 | pct10_5 = pct10_5 / total_frames * 100 311 | pct5 = pct5 / total_frames * 100 312 | pct2 = pct2 / total_frames * 100 313 | pct1 = pct1 / total_frames * 100 314 | 315 | _logger.info("===================================================") 316 | _logger.info("Test complete.") 317 | 318 | _logger.info('Accuracy:') 319 | _logger.info(f'\t10cm/5deg: {pct10_5:.1f}%') 320 | _logger.info(f'\t5cm/5deg: {pct5:.1f}%') 321 | _logger.info(f'\t2cm/2deg: {pct2:.1f}%') 322 | _logger.info(f'\t1cm/1deg: {pct1:.1f}%') 323 | 324 | _logger.info(f"Median Error: {median_rErr:.1f}deg, {median_tErr:.1f}cm") 325 | _logger.info(f"Avg. processing time: {avg_time * 1000:4.1f}ms") 326 | 327 | # Write to the test log file as well. 
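# Single whitespace-separated line: median rotation error [deg], median translation error [cm], average processing time per frame [s].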
328 | test_log.write(f"{median_rErr} {median_tErr} {avg_time}\n") 329 | 330 | test_log.close() 331 | pose_log.close() 332 | -------------------------------------------------------------------------------- /train_ace.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright © Niantic, Inc. 2022. 3 | 4 | import argparse 5 | import logging 6 | from distutils.util import strtobool 7 | from pathlib import Path 8 | 9 | from ace_trainer import TrainerACE 10 | 11 | 12 | def _strtobool(x): 13 | return bool(strtobool(x)) 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | # Setup logging levels. 19 | logging.basicConfig(level=logging.INFO) 20 | 21 | parser = argparse.ArgumentParser( 22 | description='Fast training of a scene coordinate regression network.', 23 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 24 | 25 | parser.add_argument('scene', type=Path, 26 | help='path to a scene in the dataset folder, e.g. "datasets/Cambridge_GreatCourt"') 27 | 28 | parser.add_argument('output_map_file', type=Path, 29 | help='target file for the trained network') 30 | 31 | parser.add_argument('--encoder_path', type=Path, default=Path(__file__).parent / "ace_encoder_pretrained.pt", 32 | help='file containing pre-trained encoder weights') 33 | 34 | parser.add_argument('--num_head_blocks', type=int, default=1, 35 | help='depth of the regression head, defines the map size') 36 | 37 | parser.add_argument('--learning_rate_min', type=float, default=0.0005, 38 | help='lowest learning rate of 1 cycle scheduler') 39 | 40 | parser.add_argument('--learning_rate_max', type=float, default=0.005, 41 | help='highest learning rate of 1 cycle scheduler') 42 | 43 | parser.add_argument('--training_buffer_size', type=int, default=8000000, 44 | help='number of patches in the training buffer') 45 | 46 | parser.add_argument('--samples_per_image', type=int, default=1024, 47 | help='number of patches drawn from each image when creating the buffer') 48 | 49 | parser.add_argument('--batch_size', type=int, default=5120, 50 | help='number of patches for each parameter update (has to be a multiple of 512)') 51 | 52 | parser.add_argument('--epochs', type=int, default=16, 53 | help='number of runs through the training buffer') 54 | 55 | parser.add_argument('--repro_loss_hard_clamp', type=int, default=1000, 56 | help='hard clamping threshold for the reprojection losses') 57 | 58 | parser.add_argument('--repro_loss_soft_clamp', type=int, default=50, 59 | help='soft clamping threshold for the reprojection losses') 60 | 61 | parser.add_argument('--repro_loss_soft_clamp_min', type=int, default=1, 62 | help='minimum value of the soft clamping threshold when using a schedule') 63 | 64 | parser.add_argument('--use_half', type=_strtobool, default=True, 65 | help='train with half precision') 66 | 67 | parser.add_argument('--use_homogeneous', type=_strtobool, default=True, 68 | help='use a homogeneous representation for the predicted scene coordinates') 69 | 70 | parser.add_argument('--use_aug', type=_strtobool, default=True, 71 | help='Use any augmentation.') 72 | 73 | parser.add_argument('--aug_rotation', type=int, default=15, 74 | help='max inplane rotation angle') 75 | 76 | parser.add_argument('--aug_scale', type=float, default=1.5, 77 | help='max scale factor') 78 | 79 | parser.add_argument('--image_resolution', type=int, default=480, 80 | help='base image resolution') 81 | 82 | parser.add_argument('--repro_loss_type', type=str, default="dyntanh", 83 | choices=["l1", "l1+sqrt", "l1+log", "tanh", "dyntanh"], 84 |
help='Loss function on the reprojection error. Dyn varies the soft clamping threshold') 85 | 86 | parser.add_argument('--repro_loss_schedule', type=str, default="circle", choices=['circle', 'linear'], 87 | help='How to decrease the softclamp threshold during training, circle is slower first') 88 | 89 | parser.add_argument('--depth_min', type=float, default=0.1, 90 | help='enforce minimum depth of network predictions') 91 | 92 | parser.add_argument('--depth_target', type=float, default=10, 93 | help='default depth to regularize training') 94 | 95 | parser.add_argument('--depth_max', type=float, default=1000, 96 | help='enforce maximum depth of network predictions') 97 | 98 | # Clustering params, for the ensemble training used in the Cambridge experiments. Disabled by default. 99 | parser.add_argument('--num_clusters', type=int, default=None, 100 | help='split the training sequence in this number of clusters. disabled by default') 101 | 102 | parser.add_argument('--cluster_idx', type=int, default=None, 103 | help='train on images part of this cluster. required only if --num_clusters is set.') 104 | 105 | # Params for the visualization. If enabled, it will slow down training considerably. But you get a nice video :) 106 | parser.add_argument('--render_visualization', type=_strtobool, default=False, 107 | help='create a video of the mapping process') 108 | 109 | parser.add_argument('--render_target_path', type=Path, default='renderings', 110 | help='target folder for renderings, visualizer will create a subfolder with the map name') 111 | 112 | parser.add_argument('--render_flipped_portrait', type=_strtobool, default=False, 113 | help='flag for wayspots dataset where images are sideways portrait') 114 | 115 | parser.add_argument('--render_map_error_threshold', type=int, default=10, 116 | help='reprojection error threshold for the visualisation in px') 117 | 118 | parser.add_argument('--render_map_depth_filter', type=int, default=10, 119 | help='to clean up the ACE point cloud remove points too far away') 120 | 121 | parser.add_argument('--render_camera_z_offset', type=int, default=4, 122 | help='zoom out of the scene by moving render camera backwards, in meters') 123 | 124 | options = parser.parse_args() 125 | 126 | trainer = TrainerACE(options) 127 | trainer.train() 128 | --------------------------------------------------------------------------------
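Note: the per-frame pose logs written by test_ace.py are what merge_ensemble_results.py and eval_poses.py consume. As a minimal illustrative sketch (not part of the repository), and assuming the w-first quaternion and inverted-pose convention used when the log is written, one such line can be converted back into a 4x4 camera-to-world pose as follows:

import numpy as np

def pose_from_log_line(line):
    """Parse 'frame q_w q_x q_y q_z t_x t_y t_z r_err t_err inliers' into (frame, 4x4 pose)."""
    tokens = line.split()
    frame_name = tokens[0]
    qw, qx, qy, qz = (float(v) for v in tokens[1:5])
    tx, ty, tz = (float(v) for v in tokens[5:8])

    # Unit quaternion (w, x, y, z) to rotation matrix.
    R = np.array([
        [1 - 2 * (qy * qy + qz * qz), 2 * (qx * qy - qw * qz), 2 * (qx * qz + qw * qy)],
        [2 * (qx * qy + qw * qz), 1 - 2 * (qx * qx + qz * qz), 2 * (qy * qz - qw * qx)],
        [2 * (qx * qz - qw * qy), 2 * (qy * qz + qw * qx), 1 - 2 * (qx * qx + qy * qy)],
    ])

    # The log stores the inverted pose; invert again to get camera-to-world,
    # mirroring what eval_poses.py does with np.linalg.inv.
    pose_inv = np.eye(4)
    pose_inv[:3, :3] = R
    pose_inv[:3, 3] = [tx, ty, tz]
    return frame_name, np.linalg.inv(pose_inv)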