├── .gitignore ├── LICENSE ├── README.md ├── ace_encoder_pretrained.pt ├── ace_loss.py ├── ace_network.py ├── ace_trainer.py ├── ace_util.py ├── ace_vis_util.py ├── ace_visualizer.py ├── dataset.py ├── datasets ├── dataset_util.py ├── setup_12scenes.py ├── setup_7scenes.py ├── setup_cambridge.py └── setup_wayspots.py ├── dsacstar ├── dsacstar.cpp ├── dsacstar_derivative.h ├── dsacstar_loss.h ├── dsacstar_types.h ├── dsacstar_util.h ├── dsacstar_util_rgbd.h ├── setup.py ├── stop_watch.h ├── thread_rand.cpp └── thread_rand.h ├── environment.yml ├── eval_poses.py ├── merge_ensemble_results.py ├── scripts ├── train_12scenes.sh ├── train_12scenes_pgt.sh ├── train_7scenes.sh ├── train_7scenes_pgt.sh ├── train_all.sh ├── train_cambridge.sh ├── train_cambridge_ensemble.sh ├── train_wayspots.sh ├── viz_12scenes.sh ├── viz_7scenes.sh ├── viz_cambridge.sh └── viz_wayspots.sh ├── test_ace.py └── train_ace.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .idea/ 3 | 4 | # Dataset files. 5 | datasets/7scenes_* 6 | datasets/12scenes_* 7 | datasets/Cambridge_* 8 | datasets/wayspots_* 9 | datasets/pgt_* 10 | datasets/visloc_pseudo_gt_limitations 11 | 12 | # Outputs. 13 | output/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | - Main ACE Files: 2 | - ace_encoder_pretrained.pt 3 | - ace_loss.py 4 | - ace_network.py 5 | - ace_trainer.py 6 | - ace_util.py 7 | - ace_visualizer.py 8 | - ace_vis_util.py 9 | - eval_poses.py 10 | - merge_ensemble_results.py 11 | - test_ace.py 12 | - train_ace.py 13 | - Contents of the scripts/ folder. 14 | 15 | Copyright © Niantic, Inc. 2022. Patent Pending. 16 | 17 | All rights reserved. 18 | 19 | 20 | 21 | ================================================================================ 22 | 23 | 24 | 25 | This Software is licensed under the terms of the following ACE license 26 | which allows for non-commercial use only. For any other use of the software not 27 | covered by the terms of this license, please contact partnerships@nianticlabs.com 28 | 29 | 30 | 31 | ================================================================================ 32 | 33 | 34 | 35 | ACE License 36 | 37 | 38 | This Agreement is made by and between the Licensor and the Licensee as 39 | defined and identified below. 40 | 41 | 42 | 1. Definitions. 43 | 44 | In this Agreement (“the Agreement”) the following words shall have the 45 | following meanings: 46 | 47 | "Authors" shall mean E. Brachmann, T. Cavallari, V. Prisacariu 48 | "Licensee" Shall mean the person or organization agreeing to use the 49 | Software in accordance with these terms and conditions. 50 | "Licensor" shall mean Niantic Inc., a company organized and existing under 51 | the laws of Delaware, whose principal place of business is at 1 Ferry Building, 52 | Suite 200, San Francisco, 94111. 53 | "Software" shall mean the ACE Software uploaded by Licensor to the 54 | GitHub repository at https://github.com/nianticlabs/ace 55 | on May 23rd, 2023, in source code or object code form and any 56 | accompanying documentation as well as any modifications or additions uploaded 57 | to the same GitHub repository by Licensor. 58 | 59 | 60 | 2. License. 
61 | 62 | 2.1 The Licensor has all necessary rights to grant a license under: (i) 63 | copyright and rights in the nature of copyright subsisting in the Software; and 64 | (ii) certain patent rights resulting from a patent application(s) filed by the 65 | Licensor in the United States and/or other jurisdictions in connection with the 66 | Software. The Licensor grants the Licensee for the duration of this Agreement, 67 | a free of charge, non-sublicenseable, non-exclusive, non-transferable copyright 68 | and patent license (in consequence of said patent application(s)) to use the 69 | Software for non-commercial purpose only, including teaching and research at 70 | educational institutions and research at not-for-profit research institutions 71 | in accordance with the provisions of this Agreement. Non-commercial use 72 | expressly excludes any profit-making or commercial activities, including without 73 | limitation sale, license, manufacture or development of commercial products, use in 74 | commercially-sponsored research, use at a laboratory or other facility owned or 75 | controlled (whether in whole or in part) by a commercial entity, provision of 76 | consulting service, use for or on behalf of any commercial entity, use in 77 | research where a commercial party obtains rights to research results or any 78 | other benefit, and use of the code in any models, model weights or code 79 | resulting from such procedure in any commercial product. Notwithstanding the 80 | foregoing restrictions, you can use this code for publishing comparison results 81 | for academic papers, including retraining on your own data. Any use of the 82 | Software for any purpose other than pursuant to the license grant set forth 83 | above shall automatically terminate this License. 84 | 85 | 86 | 2.2 The Licensee is permitted to make modifications to the Software 87 | provided that any distribution of such modifications is in accordance with 88 | Clause 3. 89 | 90 | 2.3 Except as expressly permitted by this Agreement and save to the 91 | extent and in the circumstances expressly required to be permitted by law, the 92 | Licensee is not permitted to rent, lease, sell, offer to sell, or loan the 93 | Software or its associated documentation. 94 | 95 | 96 | 3. Redistribution and modifications 97 | 98 | 3.1 The Licensee may reproduce and distribute copies of the Software, with 99 | or without modifications, in source format only and provided that any and every 100 | distribution is accompanied by an unmodified copy of this License, and that the 101 | following copyright notice is always displayed in an obvious manner: 102 | "Copyright © Niantic, Inc. 2023. All rights reserved." 103 | 104 | 3.2 In the case where the Software has been modified, any distribution must 105 | include prominent notices indicating which files have been changed. 106 | 107 | 3.3 The Licensee shall cause any work that it distributes or publishes, 108 | that in whole or in part contains or is derived from the Software or any part 109 | thereof (“Work based on the Software”), to be licensed as a whole at no charge 110 | to all third parties entitled to a license to the Software under the terms of 111 | this License and on the same terms provided in this License. 112 | 113 | 114 | 4. Duration. 115 | 116 | This Agreement is effective until the Licensee terminates it by destroying 117 | the Software, any Work based on the Software, and its documentation together 118 | with all copies. 
It will also terminate automatically if the Licensee fails to 119 | abide by its terms. Upon automatic termination the Licensee agrees to destroy 120 | all copies of the Software, Work based on the Software, and its documentation. 121 | 122 | 123 | 5. Disclaimer of Warranties. 124 | 125 | The Software is provided as is. To the maximum extent permitted by law, 126 | Licensor provides no warranties or conditions of any kind, either express or 127 | implied, including without limitation, any warranties or condition of title, 128 | non-infringement or fitness for a particular purpose. 129 | 130 | 131 | 6. LIMITATION OF LIABILITY. 132 | 133 | IN NO EVENT SHALL THE LICENSOR AND/OR AUTHORS BE LIABLE FOR ANY DIRECT, 134 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING 135 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 136 | DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 137 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 138 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 139 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 140 | 141 | 142 | 7. Indemnity. 143 | 144 | The Licensee shall indemnify the Licensor and/or Authors against all third 145 | party claims that may be asserted against or suffered by the Licensor and/or 146 | Authors and which relate to use of the Software by the Licensee. 147 | 148 | 149 | 8. Intellectual Property. 150 | 151 | 8.1 As between the Licensee and Licensor, copyright and all other 152 | intellectual property rights subsisting in or in connection with the Software 153 | and supporting information shall remain at all times the property of the 154 | Licensor. The Licensee shall acquire no rights in any such material except as 155 | expressly provided in this Agreement. 156 | 157 | 8.2 No permission is granted to use the trademarks or product names of the 158 | Licensor except as required for reasonable and customary use in describing the 159 | origin of the Software and for the purposes of abiding by the terms of Clause 160 | 3.1. 161 | 162 | 8.3 The Licensee shall promptly notify the Licensor of any improvement or 163 | new use of the Software (“Improvements”) in sufficient detail for Licensor to 164 | evaluate the Improvements. The Licensee hereby grants the Licensor and its 165 | affiliates a non-exclusive, fully paid-up, royalty-free, irrevocable and 166 | perpetual license to all Improvements for non-commercial academic research and 167 | teaching purposes upon creation of such improvements. 168 | 169 | 8.4 The Licensee grants an exclusive first option to the Licensor to be 170 | exercised by the Licensor within three (3) years of the date of notification of 171 | an Improvement under Clause 8.3 to use any the Improvement for commercial 172 | purposes on terms to be negotiated and agreed by Licensee and Licensor in good 173 | faith within a period of six (6) months from the date of exercise of the said 174 | option (including without limitation any royalty share in net income from such 175 | commercialization payable to the Licensee, as the case may be). 176 | 177 | 178 | 9. Acknowledgements. 179 | 180 | The Licensee shall acknowledge the Authors and use of the Software in the 181 | publication of any work that uses, or results that are achieved through, the 182 | use of the Software. 
The following citation shall be included in the 183 | acknowledgement: "Accelerated Coordinate Encoding: Learning to Relocalize in 184 | Minutes using RGB and Poses"; by Eric Brachmann, Tommaso Cavallari, and Victor 185 | Adrian Prisacariu; CVPR 2023. 186 | 187 | 188 | 10. Governing Law. 189 | 190 | This Agreement shall be governed by, construed and interpreted in 191 | accordance with English law and the parties submit to the exclusive 192 | jurisdiction of the English courts. 193 | 194 | 195 | 11. Termination. 196 | 197 | Upon termination of this Agreement, the licenses granted hereunder will 198 | terminate and Sections 5, 6, 7, 8, 9, 10 and 11 shall survive any termination 199 | of this Agreement. 200 | 201 | 202 | -------------------------------------------------------------------------------- 203 | - dataset.py 204 | - Contents of the dsacstar/ folder. 205 | 206 | BSD 3-Clause License 207 | 208 | Copyright (c) 2020, ebrach 209 | All rights reserved. 210 | 211 | Redistribution and use in source and binary forms, with or without 212 | modification, are permitted provided that the following conditions are met: 213 | 214 | 1. Redistributions of source code must retain the above copyright notice, this 215 | list of conditions and the following disclaimer. 216 | 217 | 2. Redistributions in binary form must reproduce the above copyright notice, 218 | this list of conditions and the following disclaimer in the documentation 219 | and/or other materials provided with the distribution. 220 | 221 | 3. Neither the name of the copyright holder nor the names of its 222 | contributors may be used to endorse or promote products derived from 223 | this software without specific prior written permission. 224 | 225 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 226 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 227 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 228 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 229 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 230 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 231 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 232 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 233 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 234 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
235 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Accelerated Coordinate Encoding: Learning to Relocalize in Minutes using RGB and Poses 2 | 3 | ---------------------------------------------------------------------------------------- 4 | 5 | This repository contains the code associated to the ACE paper: 6 | > **Accelerated Coordinate Encoding: Learning to Relocalize in Minutes using RGB and Poses** 7 | > 8 | > [Eric Brachmann](https://ebrach.github.io/), [Tommaso Cavallari](https://scholar.google.it/citations?user=r7osSm0AAAAJ&hl=en), and [Victor Adrian Prisacariu](https://www.robots.ox.ac.uk/~victor/) 9 | > 10 | > [CVPR 2023, Highlight](https://openaccess.thecvf.com/content/CVPR2023/papers/Brachmann_Accelerated_Coordinate_Encoding_Learning_to_Relocalize_in_Minutes_Using_RGB_CVPR_2023_paper.pdf) 11 | 12 | For further information please visit: 13 | 14 | - [Project page (with videos, method explanations, dataset details)](https://nianticlabs.github.io/ace) 15 | - [Arxiv](https://arxiv.org/abs/2305.14059) 16 | 17 | Table of contents: 18 | 19 | - [Installation](#installation) 20 | - [Dataset Setup](#datasets) 21 | - [Usage](#usage) 22 | - [ACE Training](#ace-training) 23 | - [ACE Evaluation](#ace-evaluation) 24 | - [Training Scripts](#complete-training-and-evaluation-scripts) 25 | - [Pretrained ACE Networks](#pretrained-ace-networks) 26 | - [Note on the Encoder Training](#encoder-training) 27 | - [References](#publications) 28 | 29 | ## Installation 30 | 31 | This code uses PyTorch to train and evaluate the scene-specific coordinate prediction head networks. It has been tested 32 | on Ubuntu 20.04 with a T4 Nvidia GPU, although it should reasonably run with other Linux distributions and GPUs as well. 33 | 34 | We provide a pre-configured [`conda`](https://docs.conda.io/en/latest/) environment containing all required dependencies 35 | necessary to run our code. 36 | You can re-create and activate the environment with: 37 | 38 | ```shell 39 | conda env create -f environment.yml 40 | conda activate ace 41 | ``` 42 | 43 | **All the following commands in this file need to run in the `ace` environment.** 44 | 45 | The ACE network predicts dense 3D scene coordinates associated to the pixels of the input images. 46 | In order to estimate the 6DoF camera poses, it relies on the RANSAC implementation of the DSAC* paper (Brachmann and 47 | Rother, TPAMI 2021), which is written in C++. 48 | As such, you need to build and install the C++/Python bindings of those functions. 49 | You can do this with: 50 | 51 | ```shell 52 | cd dsacstar 53 | python setup.py install 54 | ``` 55 | 56 | > (Optional) If you want to create videos of the training/evaluation process: 57 | > ```shell 58 | > sudo apt install ffmpeg 59 | > ``` 60 | 61 | Having done the steps above, you are ready to experiment with ACE! 
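As a quick sanity check you can confirm that the environment and the compiled bindings are importable. The snippet below is a hypothetical helper (not part of the repository); `dsacstar` is the module name produced by the build step above:

```python
# check_install.py -- hypothetical helper, not part of the repository.
import torch
import dsacstar  # built by `python setup.py install` in the dsacstar/ folder

print(f"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
print("dsacstar bindings imported successfully.")
```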
62 | 63 | ## Datasets 64 | 65 | The ACE method has been evaluated using multiple published datasets: 66 | 67 | - [Microsoft 7-Scenes](https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/) 68 | - [Stanford 12-Scenes](https://graphics.stanford.edu/projects/reloc/) 69 | - [Cambridge Landmarks](https://www.repository.cam.ac.uk/handle/1810/251342/) 70 | - [Niantic Wayspots](https://nianticlabs.github.io/ace#dataset) 71 | 72 | We provide scripts in the `datasets` folder to automatically download and extract the data in a format that can be 73 | readily used by the ACE scripts. 74 | The format is the same used by the DSAC* codebase, see [here](https://github.com/vislearn/dsacstar#data-structure) for 75 | details. 76 | 77 | > **Important: make sure you have checked the license terms of each dataset before using it.** 78 | 79 | ### {7, 12}-Scenes: 80 | 81 | You can use the `datasets/setup_{7,12}scenes.py` scripts to download the data. 82 | As mentioned in the paper, we experimented with two variants of each of these datasets: one using the original 83 | D-SLAM ground truth camera poses, and one using _Pseudo Ground Truth (PGT)_ camera poses obtained after running SfM on 84 | the scenes 85 | (see 86 | the [ICCV 2021 paper](https://openaccess.thecvf.com/content/ICCV2021/html/Brachmann_On_the_Limits_of_Pseudo_Ground_Truth_in_Visual_Camera_ICCV_2021_paper.html) 87 | , 88 | and [associated code](https://github.com/tsattler/visloc_pseudo_gt_limitations/) for details). 89 | 90 | To download and prepare the datasets using the D-SLAM poses: 91 | 92 | ```shell 93 | cd datasets 94 | # Downloads the data to datasets/7scenes_{chess, fire, ...} 95 | ./setup_7scenes.py 96 | # Downloads the data to datasets/12scenes_{apt1_kitchen, ...} 97 | ./setup_12scenes.py 98 | ``` 99 | 100 | To download and prepare the datasets using the PGT poses: 101 | 102 | ```shell 103 | cd datasets 104 | # Downloads the data to datasets/pgt_7scenes_{chess, fire, ...} 105 | ./setup_7scenes.py --poses pgt 106 | # Downloads the data to datasets/pgt_12scenes_{apt1_kitchen, ...} 107 | ./setup_12scenes.py --poses pgt 108 | ``` 109 | 110 | ### Cambridge Landmarks / Niantic Wayspots: 111 | 112 | We used a single variant of these datasets. Simply run: 113 | 114 | ```shell 115 | cd datasets 116 | # Downloads the data to datasets/Cambridge_{GreatCourt, KingsCollege, ...} 117 | ./setup_cambridge.py 118 | # Downloads the data to datasets/wayspots_{bears, cubes, ...} 119 | ./setup_wayspots.py 120 | ``` 121 | 122 | ## Usage 123 | 124 | We provide scripts to train and evaluate ACE scene coordinate regression networks. 125 | In the following sections we'll detail some of the main command line options that can be used to customize the 126 | behavior of both the training and the pose estimation script. 127 | 128 | ### ACE Training 129 | 130 | The ACE scene-specific coordinate regression head for a scene can be trained using the `train_ace.py` script. 131 | Basic usage: 132 | 133 | ```shell 134 | ./train_ace.py 135 | # Example: 136 | ./train_ace.py datasets/7scenes_chess output/7scenes_chess.pt 137 | ``` 138 | 139 | The output map file contains just the weights of the scene-specific head network -- encoded as half-precision floating 140 | point -- for a size of ~4MB when using default options, as mentioned in the paper. The testing script will use these 141 | weights, together with the scene-agnostic pretrained encoder (`ace_encoder_pretrained.pt`) we provide, to estimate 6DoF 142 | poses for the query images. 
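For reference, the sketch below shows roughly how the two files are combined at evaluation time, using `Regressor.create_from_split_state_dict` from `ace_network.py`; the paths are examples only:

```python
import torch

from ace_network import Regressor

# Scene-agnostic encoder (shipped with the repository) and a scene-specific head
# produced by train_ace.py (stored as half-precision floats, ~4MB).
encoder_state_dict = torch.load("ace_encoder_pretrained.pt", map_location="cpu")
head_state_dict = torch.load("output/7scenes_chess.pt", map_location="cpu")

# Merge the two state dicts and instantiate the full scene coordinate regressor.
network = Regressor.create_from_split_state_dict(encoder_state_dict, head_state_dict)
network = network.eval()
```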
143 | 144 | **Additional parameters** that can be passed to the training script to alter its behavior: 145 | 146 | - `--training_buffer_size`: Changes the size of the training buffer containing decorrelated image features (see paper), 147 | that is created at the beginning of the training process. The default size is 8M. 148 | - `--samples_per_image`: How many features to sample from each image during the buffer generation phase. This affects 149 | the amount of time necessary to fill the training buffer, but also affects the amount of decorrelation in the features 150 | present in the buffer. The default is 1024 samples per image. 151 | - `--epochs`: How many full passes over the training buffer are performed during the training. This directly affects the 152 | training time. Default is 16. 153 | - `--num_head_blocks`: The depth of the head network. Specifically, the number of extra 3-layer residual blocks to add 154 | to the default head depth. Default value is 1, which results in a head network composed of 9 layers, for a total of 155 | 4MB weights. 156 | 157 | **Clustering parameters:** these are used for the ensemble experiments (ACE Poker variant) we ran on the Cambridge 158 | Landmarks 159 | dataset. They are used to split the input scene into multiple independent clusters, and training the head network on one 160 | of them (see Section 4.2 of the main paper and Section 1.3 of the supplementary material for details). 161 | 162 | - `--num_clusters`: How many clusters to split the training scene in. Default `None` (disabled). 163 | - `--cluster_idx`: Selects a specific cluster for training. 164 | 165 | **Visualization parameters:** these are used to generate the videos available in the project page (they actually 166 | generate 167 | individual frames that can be collated into a video later). _Note: enabling them will significantly slow down the 168 | training._ 169 | 170 | - `--render_visualization`: Set to `True` to enable generating frames showing the training process. Default `False`. 171 | - `--render_target_path`: Base folder where the frames will be saved. The script automatically appends the current map 172 | name to the folder. Default is `renderings`. 173 | 174 | There are other options available, they can be discovered by running the script with the `--help` flag. 175 | 176 | ### ACE Evaluation 177 | 178 | The pose estimation for a testing scene can be performed using the `test_ace.py` script. 179 | Basic usage: 180 | 181 | ```shell 182 | ./test_ace.py 183 | # Example: 184 | ./test_ace.py datasets/7scenes_chess output/7scenes_chess.pt 185 | ``` 186 | 187 | The script loads (a) the scene-specific ACE head network and (b) the pre-trained scene-agnostic encoder and, for each 188 | testing frame: 189 | 190 | - Computes its per-pixel 3D scene coordinates, resulting in a set of 2D-3D correspondences. 191 | - The correspondences are then passed to a RANSAC algorithm that is able to estimate a 6DoF camera pose. 192 | - The camera poses are compared with the ground truth, and various cumulative metrics are then computed and printed 193 | at the end of the script. 194 | 195 | The metrics include: %-age of frames within certain translation/angle thresholds of the ground truth, 196 | median translation, median rotation error. 197 | 198 | The script also creates a file containing per-frame results so that they can be parsed by other tools or analyzed 199 | separately. 200 | The output file is located alongside the head network and is named: `poses__.txt`. 
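Such a file can be post-processed with a few lines of Python, for example to recompute the median errors. The sketch below assumes the whitespace-separated column layout documented just after it; the file name is a placeholder:

```python
import numpy as np

# Placeholder file name; use the poses_*.txt file written next to your head network.
with open("output/poses_example_session.txt") as f:
    rows = [line.split() for line in f if line.strip()]

rot_errs_deg = np.array([float(r[8]) for r in rows])  # rotation error in degrees
tr_errs_m = np.array([float(r[9]) for r in rows])     # translation error in metres

print(f"Median rotation error: {np.median(rot_errs_deg):.2f} deg")
print(f"Median translation error: {np.median(tr_errs_m) * 100:.1f} cm")
```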
201 | 202 | Each line in the output file contains the results for an individual query frame, in this format: 203 | 204 | ``` 205 | file_name rot_quaternion_w rot_quaternion_x rot_quaternion_y rot_quaternion_z translation_x translation_y translation_z rot_err_deg tr_err_m inlier_count 206 | ``` 207 | 208 | There are some parameters that can be passed to the script to customize the RANSAC behavior: 209 | 210 | - `--session`: Custom suffix to append to the name of the file containing the estimated camera poses (see paragraph 211 | above). 212 | - `--hypotheses`: How many pose hypotheses to generate and evaluate (i.e. the number of RANSAC iterations). Default is 213 | 64. 214 | - `--threshold`: Inlier threshold (in pixels) to consider a 2D-3D correspondence as valid. 215 | - `--render_visualization`: Set to `True` to enable generating frames showing the evaluation process. Will slow down the 216 | testing significantly if enabled. Default `False`. 217 | - `--render_target_path`: Base folder where the frames will be saved. The script automatically appends the current map 218 | name to the folder. Default is `renderings`. 219 | 220 | There are other options available, they can be discovered by running the script with the `--help` flag. 221 | 222 | #### Ensemble Evaluation 223 | 224 | To deploy the ensemble variants (such as the 4-cluster ACE Poker variant), we simply need to run the training script 225 | multiple times (once per cluster), thus training multiple head networks. 226 | 227 | At localization time we run the testing script once for each trained head and save the per-frame `poses_...` files, 228 | passing the `--session` parameter to the test script to tag each file according to the network used to generate it. 229 | 230 | We provide two more scripts: 231 | 232 | 1. `merge_ensemble_results.py`: Merge multiple `poses_` files -- choosing for each frame the pose that resulted in the 233 | best inlier count. 234 | 2. `eval_poses.py`: Compute the overall accuracy metrics (%-age of poses within threshold and median errors) that we showed 235 | in the paper. 236 | 237 | ACE Poker example for a scene in the Cambridge dataset: 238 | 239 | ```shell 240 | mkdir -p output/Cambridge_GreatCourt 241 | 242 | # Head training: 243 | ./train_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/0_4.pt --num_clusters 4 --cluster_idx 0 244 | ./train_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/1_4.pt --num_clusters 4 --cluster_idx 1 245 | ./train_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/2_4.pt --num_clusters 4 --cluster_idx 2 246 | ./train_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/3_4.pt --num_clusters 4 --cluster_idx 3 247 | 248 | # Per-cluster evaluation: 249 | ./test_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/0_4.pt --session 0_4 250 | ./test_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/1_4.pt --session 1_4 251 | ./test_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/2_4.pt --session 2_4 252 | ./test_ace.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/3_4.pt --session 3_4 253 | 254 | # Merging results and computing metrics. 255 | 256 | # The merging script takes a --poses_suffix argument that's used to select only the 257 | # poses generated for the requested number of clusters. 
258 | ./merge_ensemble_results.py output/Cambridge_GreatCourt output/Cambridge_GreatCourt/merged_poses_4.txt --poses_suffix "_4.txt" 259 | 260 | # The output poses output by the previous script are then evaluated against the scene ground truth data. 261 | ./eval_poses.py datasets/Cambridge_GreatCourt output/Cambridge_GreatCourt/merged_poses_4.txt 262 | ``` 263 | 264 | ### Complete training and evaluation scripts 265 | 266 | We provide several scripts to run training and evaluation on the various datasets we tested our method with. 267 | These allow replicating the results we showcased in the paper. 268 | They are located under the `scripts` folder: `scripts/train_*.sh`. 269 | 270 | In the same folder we also provide scripts to generate videos of the training/testing protocol, as can be seen in the 271 | project page. They are under `scripts/viz_*.sh`. 272 | 273 | ### Pretrained ACE Networks 274 | 275 | We also make available the set of pretrained ACE Heads we used for the experiments in the paper. 276 | Each head was trained for 5 minutes on one of the scenes in the various datasets, and was used to compute the accuracy 277 | metrics we showed in the main text. 278 | 279 | Each network can be passed directly to the `test_ace.py` script, together with the path to its dataset scene, to run 280 | camera relocalization on the images of the testing split and compute the accuracy metrics, like this: 281 | 282 | ```shell 283 | ./test_ace.py datasets/7scenes_chess /7Scenes/7scenes_chess.pt 284 | ``` 285 | 286 | **The data is available 287 | at [this location](https://storage.googleapis.com/niantic-lon-static/research/ace/ace_models.tar.gz).** 288 | 289 | ### Encoder Training 290 | 291 | As mentioned above, in this repository we provide a set of weights for the pretrained feature extraction 292 | backbone (`ace_encoder_pretrained.pt`) that was used in our experiments. 293 | You are welcome to use them to experiment with ACE in novel scenes. 294 | Both indoor and outdoor environments perform reasonably well with this set of weights, as shown in the paper. 295 | 296 | **Unfortunately, we cannot provide the code to train the encoder as part of this release.** 297 | 298 | The feature extractor has been trained on 100 scenes from [ScanNet](http://www.scan-net.org/), in parallel, for ~1 week, 299 | as described in Section 3.3 of the paper and Section 1.1 of the supplementary material. 300 | It is possible to reimplement the encoder training protocol following those instructions. 301 | 302 | ## Publications 303 | 304 | If you use ACE or parts of its code in your own work, please cite: 305 | 306 | ``` 307 | @inproceedings{brachmann2023ace, 308 | title={Accelerated Coordinate Encoding: Learning to Relocalize in Minutes using RGB and Poses}, 309 | author={Brachmann, Eric and Cavallari, Tommaso and Prisacariu, Victor Adrian}, 310 | booktitle={CVPR}, 311 | year={2023}, 312 | } 313 | ``` 314 | 315 | This code builds on previous camera relocalization pipelines, namely DSAC, DSAC++, and DSAC*. 
Please consider citing: 316 | 317 | ``` 318 | @inproceedings{brachmann2017dsac, 319 | title={{DSAC}-{Differentiable RANSAC} for Camera Localization}, 320 | author={Brachmann, Eric and Krull, Alexander and Nowozin, Sebastian and Shotton, Jamie and Michel, Frank and Gumhold, Stefan and Rother, Carsten}, 321 | booktitle={CVPR}, 322 | year={2017} 323 | } 324 | 325 | @inproceedings{brachmann2018lessmore, 326 | title={Learning less is more - {6D} camera localization via {3D} surface regression}, 327 | author={Brachmann, Eric and Rother, Carsten}, 328 | booktitle={CVPR}, 329 | year={2018} 330 | } 331 | 332 | @article{brachmann2021dsacstar, 333 | title={Visual Camera Re-Localization from {RGB} and {RGB-D} Images Using {DSAC}}, 334 | author={Brachmann, Eric and Rother, Carsten}, 335 | journal={TPAMI}, 336 | year={2021} 337 | } 338 | ``` 339 | 340 | ## License 341 | 342 | Copyright © Niantic, Inc. 2023. Patent Pending. 343 | All rights reserved. 344 | Please see the [license file](LICENSE) for terms. -------------------------------------------------------------------------------- /ace_encoder_pretrained.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nianticlabs/ace/e9e90f2d02ee92c348bf411a5a60e230af6c315e/ace_encoder_pretrained.pt -------------------------------------------------------------------------------- /ace_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright © Niantic, Inc. 2022. 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def weighted_tanh(repro_errs, weight): 8 | return weight * torch.tanh(repro_errs / weight).sum() 9 | 10 | 11 | class ReproLoss: 12 | """ 13 | Compute per-pixel reprojection loss using different configurable approaches. 14 | 15 | - tanh: tanh loss with a constant scale factor given by the `soft_clamp` parameter (when a pixel's reprojection 16 | error is equal to `soft_clamp`, its loss is equal to `soft_clamp * tanh(1)`). 17 | - dyntanh: Used in the paper, similar to the tanh loss above, but the scaling factor decreases during the course of 18 | the training from `soft_clamp` to `soft_clamp_min`. The decrease is linear, unless `circle_schedule` 19 | is True (default), in which case it applies a circular scheduling. See paper for details. 20 | - l1: Standard L1 loss, computed only on those pixels having an error lower than `soft_clamp` 21 | - l1+sqrt: L1 loss for pixels with reprojection error smaller than `soft_clamp` and 22 | `sqrt(soft_clamp * reprojection_error)` for pixels with a higher error. 23 | - l1+logl1: Similar to the above, but using log L1 for pixels with high reprojection error. 24 | """ 25 | 26 | def __init__(self, 27 | total_iterations, 28 | soft_clamp, 29 | soft_clamp_min, 30 | type='dyntanh', 31 | circle_schedule=True): 32 | 33 | self.total_iterations = total_iterations 34 | self.soft_clamp = soft_clamp 35 | self.soft_clamp_min = soft_clamp_min 36 | self.type = type 37 | self.circle_schedule = circle_schedule 38 | 39 | def compute(self, repro_errs_b1N, iteration): 40 | if repro_errs_b1N.nelement() == 0: 41 | return 0 42 | 43 | if self.type == "tanh": 44 | return weighted_tanh(repro_errs_b1N, self.soft_clamp) 45 | 46 | elif self.type == "dyntanh": 47 | # Compute the progress over the training process. 48 | schedule_weight = iteration / self.total_iterations 49 | 50 | if self.circle_schedule: 51 | # Optionally scale it using the circular schedule. 
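                # With t = iteration / total_iterations in [0, 1], the mapping 1 - sqrt(1 - t^2)
                # stays close to 0 for most of training and only rises steeply towards 1 near the
                # end, so the tanh scale computed below shrinks slowly at first and quickly in the
                # final epochs (from ~soft_clamp down to soft_clamp_min).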
52 | schedule_weight = 1 - np.sqrt(1 - schedule_weight ** 2) 53 | 54 | # Compute the weight to use in the tanh loss. 55 | loss_weight = (1 - schedule_weight) * self.soft_clamp + self.soft_clamp_min 56 | 57 | # Compute actual loss. 58 | return weighted_tanh(repro_errs_b1N, loss_weight) 59 | 60 | elif self.type == "l1": 61 | # L1 loss on all pixels with small-enough error. 62 | softclamp_mask_b1 = repro_errs_b1N > self.soft_clamp 63 | return repro_errs_b1N[~softclamp_mask_b1].sum() 64 | 65 | elif self.type == "l1+sqrt": 66 | # L1 loss on pixels with small errors and sqrt for the others. 67 | softclamp_mask_b1 = repro_errs_b1N > self.soft_clamp 68 | loss_l1 = repro_errs_b1N[~softclamp_mask_b1].sum() 69 | loss_sqrt = torch.sqrt(self.soft_clamp * repro_errs_b1N[softclamp_mask_b1]).sum() 70 | 71 | return loss_l1 + loss_sqrt 72 | 73 | else: 74 | # l1+logl1: same as above, but use log(L1) for pixels with a larger error. 75 | softclamp_mask_b1 = repro_errs_b1N > self.soft_clamp 76 | loss_l1 = repro_errs_b1N[~softclamp_mask_b1].sum() 77 | loss_logl1 = torch.log(1 + (self.soft_clamp * repro_errs_b1N[softclamp_mask_b1])).sum() 78 | 79 | return loss_l1 + loss_logl1 80 | -------------------------------------------------------------------------------- /ace_network.py: -------------------------------------------------------------------------------- 1 | # Copyright © Niantic, Inc. 2022. 2 | 3 | import logging 4 | import math 5 | import re 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | _logger = logging.getLogger(__name__) 12 | 13 | 14 | class Encoder(nn.Module): 15 | """ 16 | FCN encoder, used to extract features from the input images. 17 | 18 | The number of output channels is configurable, the default used in the paper is 512. 19 | """ 20 | 21 | def __init__(self, out_channels=512): 22 | super(Encoder, self).__init__() 23 | 24 | self.out_channels = out_channels 25 | 26 | self.conv1 = nn.Conv2d(1, 32, 3, 1, 1) 27 | self.conv2 = nn.Conv2d(32, 64, 3, 2, 1) 28 | self.conv3 = nn.Conv2d(64, 128, 3, 2, 1) 29 | self.conv4 = nn.Conv2d(128, 256, 3, 2, 1) 30 | 31 | self.res1_conv1 = nn.Conv2d(256, 256, 3, 1, 1) 32 | self.res1_conv2 = nn.Conv2d(256, 256, 1, 1, 0) 33 | self.res1_conv3 = nn.Conv2d(256, 256, 3, 1, 1) 34 | 35 | self.res2_conv1 = nn.Conv2d(256, 512, 3, 1, 1) 36 | self.res2_conv2 = nn.Conv2d(512, 512, 1, 1, 0) 37 | self.res2_conv3 = nn.Conv2d(512, self.out_channels, 3, 1, 1) 38 | 39 | self.res2_skip = nn.Conv2d(256, self.out_channels, 1, 1, 0) 40 | 41 | def forward(self, x): 42 | x = F.relu(self.conv1(x)) 43 | x = F.relu(self.conv2(x)) 44 | x = F.relu(self.conv3(x)) 45 | res = F.relu(self.conv4(x)) 46 | 47 | x = F.relu(self.res1_conv1(res)) 48 | x = F.relu(self.res1_conv2(x)) 49 | x = F.relu(self.res1_conv3(x)) 50 | 51 | res = res + x 52 | 53 | x = F.relu(self.res2_conv1(res)) 54 | x = F.relu(self.res2_conv2(x)) 55 | x = F.relu(self.res2_conv3(x)) 56 | 57 | x = self.res2_skip(res) + x 58 | 59 | return x 60 | 61 | 62 | class Head(nn.Module): 63 | """ 64 | MLP network predicting per-pixel scene coordinates given a feature vector. All layers are 1x1 convolutions. 65 | """ 66 | 67 | def __init__(self, 68 | mean, 69 | num_head_blocks, 70 | use_homogeneous, 71 | homogeneous_min_scale=0.01, 72 | homogeneous_max_scale=4.0, 73 | in_channels=512): 74 | super(Head, self).__init__() 75 | 76 | self.use_homogeneous = use_homogeneous 77 | self.in_channels = in_channels # Number of encoder features. 78 | self.head_channels = 512 # Hardcoded. 
79 | 80 | # We may need a skip layer if the number of features output by the encoder is different. 81 | self.head_skip = nn.Identity() if self.in_channels == self.head_channels else nn.Conv2d(self.in_channels, 82 | self.head_channels, 1, 83 | 1, 0) 84 | 85 | self.res3_conv1 = nn.Conv2d(self.in_channels, self.head_channels, 1, 1, 0) 86 | self.res3_conv2 = nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0) 87 | self.res3_conv3 = nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0) 88 | 89 | self.res_blocks = [] 90 | 91 | for block in range(num_head_blocks): 92 | self.res_blocks.append(( 93 | nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0), 94 | nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0), 95 | nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0), 96 | )) 97 | 98 | super(Head, self).add_module(str(block) + 'c0', self.res_blocks[block][0]) 99 | super(Head, self).add_module(str(block) + 'c1', self.res_blocks[block][1]) 100 | super(Head, self).add_module(str(block) + 'c2', self.res_blocks[block][2]) 101 | 102 | self.fc1 = nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0) 103 | self.fc2 = nn.Conv2d(self.head_channels, self.head_channels, 1, 1, 0) 104 | 105 | if self.use_homogeneous: 106 | self.fc3 = nn.Conv2d(self.head_channels, 4, 1, 1, 0) 107 | 108 | # Use buffers because they need to be saved in the state dict. 109 | self.register_buffer("max_scale", torch.tensor([homogeneous_max_scale])) 110 | self.register_buffer("min_scale", torch.tensor([homogeneous_min_scale])) 111 | self.register_buffer("max_inv_scale", 1. / self.max_scale) 112 | self.register_buffer("h_beta", math.log(2) / (1. - self.max_inv_scale)) 113 | self.register_buffer("min_inv_scale", 1. / self.min_scale) 114 | else: 115 | self.fc3 = nn.Conv2d(self.head_channels, 3, 1, 1, 0) 116 | 117 | # Learn scene coordinates relative to a mean coordinate (e.g. center of the scene). 118 | self.register_buffer("mean", mean.clone().detach().view(1, 3, 1, 1)) 119 | 120 | def forward(self, res): 121 | 122 | x = F.relu(self.res3_conv1(res)) 123 | x = F.relu(self.res3_conv2(x)) 124 | x = F.relu(self.res3_conv3(x)) 125 | 126 | res = self.head_skip(res) + x 127 | 128 | for res_block in self.res_blocks: 129 | x = F.relu(res_block[0](res)) 130 | x = F.relu(res_block[1](x)) 131 | x = F.relu(res_block[2](x)) 132 | 133 | res = res + x 134 | 135 | sc = F.relu(self.fc1(res)) 136 | sc = F.relu(self.fc2(sc)) 137 | sc = self.fc3(sc) 138 | 139 | if self.use_homogeneous: 140 | # Dehomogenize coords: 141 | # Softplus ensures we have a smooth homogeneous parameter with a minimum value = self.max_inv_scale. 142 | h_slice = F.softplus(sc[:, 3, :, :].unsqueeze(1), beta=self.h_beta.item()) + self.max_inv_scale 143 | h_slice.clamp_(max=self.min_inv_scale) 144 | sc = sc[:, :3] / h_slice 145 | 146 | # Add the mean to the predicted coordinates. 147 | sc += self.mean 148 | 149 | return sc 150 | 151 | 152 | class Regressor(nn.Module): 153 | """ 154 | FCN architecture for scene coordinate regression. 155 | 156 | The network predicts a 3d scene coordinates, the output is subsampled by a factor of 8 compared to the input. 157 | """ 158 | 159 | OUTPUT_SUBSAMPLE = 8 160 | 161 | def __init__(self, mean, num_head_blocks, use_homogeneous, num_encoder_features=512): 162 | """ 163 | Constructor. 164 | 165 | mean: Learn scene coordinates relative to a mean coordinate (e.g. the center of the scene). 166 | num_head_blocks: How many extra residual blocks to use in the head (one is always used). 
167 | use_homogeneous: Whether to learn homogeneous or 3D coordinates. 168 | num_encoder_features: Number of channels output of the encoder network. 169 | """ 170 | super(Regressor, self).__init__() 171 | 172 | self.feature_dim = num_encoder_features 173 | 174 | self.encoder = Encoder(out_channels=self.feature_dim) 175 | self.heads = Head(mean, num_head_blocks, use_homogeneous, in_channels=self.feature_dim) 176 | 177 | @classmethod 178 | def create_from_encoder(cls, encoder_state_dict, mean, num_head_blocks, use_homogeneous): 179 | """ 180 | Create a regressor using a pretrained encoder, loading encoder-specific parameters from the state dict. 181 | 182 | encoder_state_dict: pretrained encoder state dictionary. 183 | mean: Learn scene coordinates relative to a mean coordinate (e.g. the center of the scene). 184 | num_head_blocks: How many extra residual blocks to use in the head (one is always used). 185 | use_homogeneous: Whether to learn homogeneous or 3D coordinates. 186 | """ 187 | 188 | # Number of output channels of the last encoder layer. 189 | num_encoder_features = encoder_state_dict['res2_conv3.weight'].shape[0] 190 | 191 | # Create a regressor. 192 | _logger.info(f"Creating Regressor using pretrained encoder with {num_encoder_features} feature size.") 193 | regressor = cls(mean, num_head_blocks, use_homogeneous, num_encoder_features) 194 | 195 | # Load encoder weights. 196 | regressor.encoder.load_state_dict(encoder_state_dict) 197 | 198 | # Done. 199 | return regressor 200 | 201 | @classmethod 202 | def create_from_state_dict(cls, state_dict): 203 | """ 204 | Instantiate a regressor from a pretrained state dictionary. 205 | 206 | state_dict: pretrained state dictionary. 207 | """ 208 | # Mean is zero (will be loaded from the state dict). 209 | mean = torch.zeros((3,)) 210 | 211 | # Count how many head blocks are in the dictionary. 212 | pattern = re.compile(r"^heads\.\d+c0\.weight$") 213 | num_head_blocks = sum(1 for k in state_dict.keys() if pattern.match(k)) 214 | 215 | # Whether the network uses homogeneous coordinates. 216 | use_homogeneous = state_dict["heads.fc3.weight"].shape[0] == 4 217 | 218 | # Number of output channels of the last encoder layer. 219 | num_encoder_features = state_dict['encoder.res2_conv3.weight'].shape[0] 220 | 221 | # Create a regressor. 222 | _logger.info(f"Creating regressor from pretrained state_dict:" 223 | f"\n\tNum head blocks: {num_head_blocks}" 224 | f"\n\tHomogeneous coordinates: {use_homogeneous}" 225 | f"\n\tEncoder feature size: {num_encoder_features}") 226 | regressor = cls(mean, num_head_blocks, use_homogeneous, num_encoder_features) 227 | 228 | # Load all weights. 229 | regressor.load_state_dict(state_dict) 230 | 231 | # Done. 232 | return regressor 233 | 234 | @classmethod 235 | def create_from_split_state_dict(cls, encoder_state_dict, head_state_dict): 236 | """ 237 | Instantiate a regressor from a pretrained encoder (scene-agnostic) and a scene-specific head. 238 | 239 | encoder_state_dict: encoder state dictionary 240 | head_state_dict: scene-specific head state dictionary 241 | """ 242 | # We simply merge the dictionaries and call the other constructor. 
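        # For example, an encoder key "res2_conv3.weight" becomes "encoder.res2_conv3.weight"
        # and a head key "fc3.weight" becomes "heads.fc3.weight", matching the "encoder" and
        # "heads" attribute names of the Regressor so that create_from_state_dict can load them.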
243 | merged_state_dict = {} 244 | 245 | for k, v in encoder_state_dict.items(): 246 | merged_state_dict[f"encoder.{k}"] = v 247 | 248 | for k, v in head_state_dict.items(): 249 | merged_state_dict[f"heads.{k}"] = v 250 | 251 | return cls.create_from_state_dict(merged_state_dict) 252 | 253 | def load_encoder(self, encoder_dict_file): 254 | """ 255 | Load weights into the encoder network. 256 | """ 257 | self.encoder.load_state_dict(torch.load(encoder_dict_file)) 258 | 259 | def get_features(self, inputs): 260 | return self.encoder(inputs) 261 | 262 | def get_scene_coordinates(self, features): 263 | return self.heads(features) 264 | 265 | def forward(self, inputs): 266 | """ 267 | Forward pass. 268 | """ 269 | features = self.get_features(inputs) 270 | return self.get_scene_coordinates(features) 271 | -------------------------------------------------------------------------------- /ace_trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright © Niantic, Inc. 2022. 2 | 3 | import logging 4 | import random 5 | import time 6 | 7 | import numpy as np 8 | import torch 9 | import torch.optim as optim 10 | import torchvision.transforms.functional as TF 11 | from torch.cuda.amp import GradScaler, autocast 12 | from torch.utils.data import DataLoader 13 | from torch.utils.data import sampler 14 | 15 | from ace_util import get_pixel_grid, to_homogeneous 16 | from ace_loss import ReproLoss 17 | from ace_network import Regressor 18 | from dataset import CamLocDataset 19 | 20 | import ace_vis_util as vutil 21 | from ace_visualizer import ACEVisualizer 22 | 23 | _logger = logging.getLogger(__name__) 24 | 25 | 26 | def set_seed(seed): 27 | """ 28 | Seed all sources of randomness. 29 | """ 30 | torch.manual_seed(seed) 31 | np.random.seed(seed) 32 | random.seed(seed) 33 | 34 | class TrainerACE: 35 | def __init__(self, options): 36 | self.options = options 37 | 38 | self.device = torch.device('cuda') 39 | 40 | # The flag below controls whether to allow TF32 on matmul. This flag defaults to True. 41 | # torch.backends.cuda.matmul.allow_tf32 = False 42 | 43 | # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True. 44 | # torch.backends.cudnn.allow_tf32 = False 45 | 46 | # Setup randomness for reproducibility. 47 | self.base_seed = 2089 48 | set_seed(self.base_seed) 49 | 50 | # Used to generate batch indices. 51 | self.batch_generator = torch.Generator() 52 | self.batch_generator.manual_seed(self.base_seed + 1023) 53 | 54 | # Dataloader generator, used to seed individual workers by the dataloader. 55 | self.loader_generator = torch.Generator() 56 | self.loader_generator.manual_seed(self.base_seed + 511) 57 | 58 | # Generator used to sample random features (runs on the GPU). 59 | self.sampling_generator = torch.Generator(device=self.device) 60 | self.sampling_generator.manual_seed(self.base_seed + 4095) 61 | 62 | # Generator used to permute the feature indices during each training epoch. 63 | self.training_generator = torch.Generator() 64 | self.training_generator.manual_seed(self.base_seed + 8191) 65 | 66 | self.iteration = 0 67 | self.training_start = None 68 | self.num_data_loader_workers = 12 69 | 70 | # Create dataset. 71 | self.dataset = CamLocDataset( 72 | root_dir=self.options.scene / "train", 73 | mode=0, # Default for ACE, we don't need scene coordinates/RGB-D. 
74 | use_half=self.options.use_half, 75 | image_height=self.options.image_resolution, 76 | augment=self.options.use_aug, 77 | aug_rotation=self.options.aug_rotation, 78 | aug_scale_max=self.options.aug_scale, 79 | aug_scale_min=1 / self.options.aug_scale, 80 | num_clusters=self.options.num_clusters, # Optional clustering for Cambridge experiments. 81 | cluster_idx=self.options.cluster_idx, # Optional clustering for Cambridge experiments. 82 | ) 83 | 84 | _logger.info("Loaded training scan from: {} -- {} images, mean: {:.2f} {:.2f} {:.2f}".format( 85 | self.options.scene, 86 | len(self.dataset), 87 | self.dataset.mean_cam_center[0], 88 | self.dataset.mean_cam_center[1], 89 | self.dataset.mean_cam_center[2])) 90 | 91 | # Create network using the state dict of the pretrained encoder. 92 | encoder_state_dict = torch.load(self.options.encoder_path, map_location="cpu") 93 | self.regressor = Regressor.create_from_encoder( 94 | encoder_state_dict, 95 | mean=self.dataset.mean_cam_center, 96 | num_head_blocks=self.options.num_head_blocks, 97 | use_homogeneous=self.options.use_homogeneous 98 | ) 99 | _logger.info(f"Loaded pretrained encoder from: {self.options.encoder_path}") 100 | 101 | self.regressor = self.regressor.to(self.device) 102 | self.regressor.train() 103 | 104 | # Setup optimization parameters. 105 | self.optimizer = optim.AdamW(self.regressor.parameters(), lr=self.options.learning_rate_min) 106 | 107 | # Setup learning rate scheduler. 108 | steps_per_epoch = self.options.training_buffer_size // self.options.batch_size 109 | self.scheduler = optim.lr_scheduler.OneCycleLR(self.optimizer, 110 | max_lr=self.options.learning_rate_max, 111 | epochs=self.options.epochs, 112 | steps_per_epoch=steps_per_epoch, 113 | cycle_momentum=False) 114 | 115 | # Gradient scaler in case we train with half precision. 116 | self.scaler = GradScaler(enabled=self.options.use_half) 117 | 118 | # Generate grid of target reprojection pixel positions. 119 | pixel_grid_2HW = get_pixel_grid(self.regressor.OUTPUT_SUBSAMPLE) 120 | self.pixel_grid_2HW = pixel_grid_2HW.to(self.device) 121 | 122 | # Compute total number of iterations. 123 | self.iterations = self.options.epochs * self.options.training_buffer_size // self.options.batch_size 124 | self.iterations_output = 100 # print loss every n iterations, and (optionally) write a visualisation frame 125 | 126 | # Setup reprojection loss function. 127 | self.repro_loss = ReproLoss( 128 | total_iterations=self.iterations, 129 | soft_clamp=self.options.repro_loss_soft_clamp, 130 | soft_clamp_min=self.options.repro_loss_soft_clamp_min, 131 | type=self.options.repro_loss_type, 132 | circle_schedule=(self.options.repro_loss_schedule == 'circle') 133 | ) 134 | 135 | # Will be filled at the beginning of the training process. 136 | self.training_buffer = None 137 | 138 | # Generate video of training process 139 | if self.options.render_visualization: 140 | # infer rendering folder from map file name 141 | target_path = vutil.get_rendering_target_path( 142 | self.options.render_target_path, 143 | self.options.output_map_file) 144 | self.ace_visualizer = ACEVisualizer( 145 | target_path, 146 | self.options.render_flipped_portrait, 147 | self.options.render_map_depth_filter, 148 | mapping_vis_error_threshold=self.options.render_map_error_threshold) 149 | else: 150 | self.ace_visualizer = None 151 | 152 | def train(self): 153 | """ 154 | Main training method. 155 | 156 | Fills a feature buffer using the pretrained encoder and subsequently trains a scene coordinate regression head. 
157 | """ 158 | 159 | if self.ace_visualizer is not None: 160 | 161 | # Setup the ACE render pipeline. 162 | self.ace_visualizer.setup_mapping_visualisation( 163 | self.dataset.pose_files, 164 | self.dataset.rgb_files, 165 | self.iterations // self.iterations_output + 1, 166 | self.options.render_camera_z_offset 167 | ) 168 | 169 | creating_buffer_time = 0. 170 | training_time = 0. 171 | 172 | self.training_start = time.time() 173 | 174 | # Create training buffer. 175 | buffer_start_time = time.time() 176 | self.create_training_buffer() 177 | buffer_end_time = time.time() 178 | creating_buffer_time += buffer_end_time - buffer_start_time 179 | _logger.info(f"Filled training buffer in {buffer_end_time - buffer_start_time:.1f}s.") 180 | 181 | # Train the regression head. 182 | for self.epoch in range(self.options.epochs): 183 | epoch_start_time = time.time() 184 | self.run_epoch() 185 | training_time += time.time() - epoch_start_time 186 | 187 | # Save trained model. 188 | self.save_model() 189 | 190 | end_time = time.time() 191 | _logger.info(f'Done without errors. ' 192 | f'Creating buffer time: {creating_buffer_time:.1f} seconds. ' 193 | f'Training time: {training_time:.1f} seconds. ' 194 | f'Total time: {end_time - self.training_start:.1f} seconds.') 195 | 196 | if self.ace_visualizer is not None: 197 | 198 | # Finalize the rendering by animating the fully trained map. 199 | vis_dataset = CamLocDataset( 200 | root_dir=self.options.scene / "train", 201 | mode=0, 202 | use_half=self.options.use_half, 203 | image_height=self.options.image_resolution, 204 | augment=False) # No data augmentation when visualizing the map 205 | 206 | vis_dataset_loader = torch.utils.data.DataLoader( 207 | vis_dataset, 208 | shuffle=False, # Process data in order for a growing effect later when rendering 209 | num_workers=self.num_data_loader_workers) 210 | 211 | self.ace_visualizer.finalize_mapping(self.regressor, vis_dataset_loader) 212 | 213 | def create_training_buffer(self): 214 | # Disable benchmarking, since we have variable tensor sizes. 215 | torch.backends.cudnn.benchmark = False 216 | 217 | # Sampler. 218 | batch_sampler = sampler.BatchSampler(sampler.RandomSampler(self.dataset, generator=self.batch_generator), 219 | batch_size=1, 220 | drop_last=False) 221 | 222 | # Used to seed workers in a reproducible manner. 223 | def seed_worker(worker_id): 224 | # Different seed per epoch. Initial seed is generated by the main process consuming one random number from 225 | # the dataloader generator. 226 | worker_seed = torch.initial_seed() % 2 ** 32 227 | np.random.seed(worker_seed) 228 | random.seed(worker_seed) 229 | 230 | # Batching is handled at the dataset level (the dataset __getitem__ receives a list of indices, because we 231 | # need to rescale all images in the batch to the same size). 232 | training_dataloader = DataLoader(dataset=self.dataset, 233 | sampler=batch_sampler, 234 | batch_size=None, 235 | worker_init_fn=seed_worker, 236 | generator=self.loader_generator, 237 | pin_memory=True, 238 | num_workers=self.num_data_loader_workers, 239 | persistent_workers=self.num_data_loader_workers > 0, 240 | timeout=60 if self.num_data_loader_workers > 0 else 0, 241 | ) 242 | 243 | _logger.info("Starting creation of the training buffer.") 244 | 245 | # Create a training buffer that lives on the GPU. 
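        # Each buffer entry below holds one row per sampled feature: the encoder feature itself
        # (stored as float16 when use_half is enabled), the 2D pixel position it was sampled at,
        # the inverse ground-truth pose of its source image, and that image's intrinsics together
        # with their inverse.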
246 | self.training_buffer = { 247 | 'features': torch.empty((self.options.training_buffer_size, self.regressor.feature_dim), 248 | dtype=(torch.float32, torch.float16)[self.options.use_half], device=self.device), 249 | 'target_px': torch.empty((self.options.training_buffer_size, 2), dtype=torch.float32, device=self.device), 250 | 'gt_poses_inv': torch.empty((self.options.training_buffer_size, 3, 4), dtype=torch.float32, 251 | device=self.device), 252 | 'intrinsics': torch.empty((self.options.training_buffer_size, 3, 3), dtype=torch.float32, 253 | device=self.device), 254 | 'intrinsics_inv': torch.empty((self.options.training_buffer_size, 3, 3), dtype=torch.float32, 255 | device=self.device) 256 | } 257 | 258 | # Features are computed in evaluation mode. 259 | self.regressor.eval() 260 | 261 | # The encoder is pretrained, so we don't compute any gradient. 262 | with torch.no_grad(): 263 | # Iterate until the training buffer is full. 264 | buffer_idx = 0 265 | dataset_passes = 0 266 | 267 | while buffer_idx < self.options.training_buffer_size: 268 | dataset_passes += 1 269 | for image_B1HW, image_mask_B1HW, gt_pose_B44, gt_pose_inv_B44, intrinsics_B33, intrinsics_inv_B33, _, _ in training_dataloader: 270 | 271 | # Copy to device. 272 | image_B1HW = image_B1HW.to(self.device, non_blocking=True) 273 | image_mask_B1HW = image_mask_B1HW.to(self.device, non_blocking=True) 274 | gt_pose_inv_B44 = gt_pose_inv_B44.to(self.device, non_blocking=True) 275 | intrinsics_B33 = intrinsics_B33.to(self.device, non_blocking=True) 276 | intrinsics_inv_B33 = intrinsics_inv_B33.to(self.device, non_blocking=True) 277 | 278 | # Compute image features. 279 | with autocast(enabled=self.options.use_half): 280 | features_BCHW = self.regressor.get_features(image_B1HW) 281 | 282 | # Dimensions after the network's downsampling. 283 | B, C, H, W = features_BCHW.shape 284 | 285 | # The image_mask needs to be downsampled to the actual output resolution and cast to bool. 286 | image_mask_B1HW = TF.resize(image_mask_B1HW, [H, W], interpolation=TF.InterpolationMode.NEAREST) 287 | image_mask_B1HW = image_mask_B1HW.bool() 288 | 289 | # If the current mask has no valid pixels, continue. 290 | if image_mask_B1HW.sum() == 0: 291 | continue 292 | 293 | # Create a tensor with the pixel coordinates of every feature vector. 294 | pixel_positions_B2HW = self.pixel_grid_2HW[:, :H, :W].clone() # It's 2xHxW (actual H and W) now. 295 | pixel_positions_B2HW = pixel_positions_B2HW[None] # 1x2xHxW 296 | pixel_positions_B2HW = pixel_positions_B2HW.expand(B, 2, H, W) # Bx2xHxW 297 | 298 | # Bx3x4 -> Nx3x4 (for each image, repeat pose per feature) 299 | gt_pose_inv = gt_pose_inv_B44[:, :3] 300 | gt_pose_inv = gt_pose_inv.unsqueeze(1).expand(B, H * W, 3, 4).reshape(-1, 3, 4) 301 | 302 | # Bx3x3 -> Nx3x3 (for each image, repeat intrinsics per feature) 303 | intrinsics = intrinsics_B33.unsqueeze(1).expand(B, H * W, 3, 3).reshape(-1, 3, 3) 304 | intrinsics_inv = intrinsics_inv_B33.unsqueeze(1).expand(B, H * W, 3, 3).reshape(-1, 3, 3) 305 | 306 | def normalize_shape(tensor_in): 307 | """Bring tensor from shape BxCxHxW to NxC""" 308 | return tensor_in.transpose(0, 1).flatten(1).transpose(0, 1) 309 | 310 | batch_data = { 311 | 'features': normalize_shape(features_BCHW), 312 | 'target_px': normalize_shape(pixel_positions_B2HW), 313 | 'gt_poses_inv': gt_pose_inv, 314 | 'intrinsics': intrinsics, 315 | 'intrinsics_inv': intrinsics_inv 316 | } 317 | 318 | # Turn image mask into sampling weights (all equal). 
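                    # torch.multinomial below treats these values as unnormalised sampling weights:
                    # masked-out pixels (weight 0) are never drawn, while all valid pixels (weight 1)
                    # are equally likely to be drawn, with replacement.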
319 | image_mask_B1HW = image_mask_B1HW.float() 320 | image_mask_N1 = normalize_shape(image_mask_B1HW) 321 | 322 | # Over-sample according to image mask. 323 | features_to_select = self.options.samples_per_image * B 324 | features_to_select = min(features_to_select, self.options.training_buffer_size - buffer_idx) 325 | 326 | # Sample indices uniformly, with replacement. 327 | sample_idxs = torch.multinomial(image_mask_N1.view(-1), 328 | features_to_select, 329 | replacement=True, 330 | generator=self.sampling_generator) 331 | 332 | # Select the data to put in the buffer. 333 | for k in batch_data: 334 | batch_data[k] = batch_data[k][sample_idxs] 335 | 336 | # Write to training buffer. Start at buffer_idx and end at buffer_offset - 1. 337 | buffer_offset = buffer_idx + features_to_select 338 | for k in batch_data: 339 | self.training_buffer[k][buffer_idx:buffer_offset] = batch_data[k] 340 | 341 | buffer_idx = buffer_offset 342 | if buffer_idx >= self.options.training_buffer_size: 343 | break 344 | 345 | buffer_memory = sum([v.element_size() * v.nelement() for k, v in self.training_buffer.items()]) 346 | buffer_memory /= 1024 * 1024 * 1024 347 | 348 | _logger.info(f"Created buffer of {buffer_memory:.2f}GB with {dataset_passes} passes over the training data.") 349 | self.regressor.train() 350 | 351 | def run_epoch(self): 352 | """ 353 | Run one epoch of training, shuffling the feature buffer and iterating over it. 354 | """ 355 | # Enable benchmarking since all operations work on the same tensor size. 356 | torch.backends.cudnn.benchmark = True 357 | 358 | # Shuffle indices. 359 | random_indices = torch.randperm(self.options.training_buffer_size, generator=self.training_generator) 360 | 361 | # Iterate with mini batches. 362 | for batch_start in range(0, self.options.training_buffer_size, self.options.batch_size): 363 | batch_end = batch_start + self.options.batch_size 364 | 365 | # Drop last batch if not full. 366 | if batch_end > self.options.training_buffer_size: 367 | continue 368 | 369 | # Sample indices. 370 | random_batch_indices = random_indices[batch_start:batch_end] 371 | 372 | # Call the training step with the sampled features and relevant metadata. 373 | self.training_step( 374 | self.training_buffer['features'][random_batch_indices].contiguous(), 375 | self.training_buffer['target_px'][random_batch_indices].contiguous(), 376 | self.training_buffer['gt_poses_inv'][random_batch_indices].contiguous(), 377 | self.training_buffer['intrinsics'][random_batch_indices].contiguous(), 378 | self.training_buffer['intrinsics_inv'][random_batch_indices].contiguous() 379 | ) 380 | self.iteration += 1 381 | 382 | def training_step(self, features_bC, target_px_b2, gt_inv_poses_b34, Ks_b33, invKs_b33): 383 | """ 384 | Run one iteration of training, computing the reprojection error and minimising it. 385 | """ 386 | batch_size = features_bC.shape[0] 387 | channels = features_bC.shape[1] 388 | 389 | # Reshape to a "fake" BCHW shape, since it's faster to run through the network compared to the original shape. 390 | features_bCHW = features_bC[None, None, ...].view(-1, 16, 32, channels).permute(0, 3, 1, 2) 391 | with autocast(enabled=self.options.use_half): 392 | pred_scene_coords_b3HW = self.regressor.get_scene_coordinates(features_bCHW) 393 | 394 | # Back to the original shape. Convert to float32 as well. 395 | pred_scene_coords_b31 = pred_scene_coords_b3HW.permute(0, 2, 3, 1).flatten(0, 2).unsqueeze(-1).float() 396 | 397 | # Make 3D points homogeneous so that we can easily matrix-multiply them. 
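        # Shape convention: the suffix encodes the tensor dimensions, e.g. pred_scene_coords_b31
        # is (batch, 3, 1). to_homogeneous appends a row of ones, giving (batch, 4, 1), so the
        # batched matrix product with the (batch, 3, 4) inverse poses yields camera-space points.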
398 | pred_scene_coords_b41 = to_homogeneous(pred_scene_coords_b31) 399 | 400 | # Scene coordinates to camera coordinates. 401 | pred_cam_coords_b31 = torch.bmm(gt_inv_poses_b34, pred_scene_coords_b41) 402 | 403 | # Project scene coordinates. 404 | pred_px_b31 = torch.bmm(Ks_b33, pred_cam_coords_b31) 405 | 406 | # Avoid division by zero. 407 | # Note: negative values are also clamped at +self.options.depth_min. The predicted pixel would be wrong, 408 | # but that's fine since we mask them out later. 409 | pred_px_b31[:, 2].clamp_(min=self.options.depth_min) 410 | 411 | # Dehomogenise. 412 | pred_px_b21 = pred_px_b31[:, :2] / pred_px_b31[:, 2, None] 413 | 414 | # Measure reprojection error. 415 | reprojection_error_b2 = pred_px_b21.squeeze() - target_px_b2 416 | reprojection_error_b1 = torch.norm(reprojection_error_b2, dim=1, keepdim=True, p=1) 417 | 418 | # 419 | # Compute masks used to ignore invalid pixels. 420 | # 421 | # Predicted coordinates behind or close to camera plane. 422 | invalid_min_depth_b1 = pred_cam_coords_b31[:, 2] < self.options.depth_min 423 | # Very large reprojection errors. 424 | invalid_repro_b1 = reprojection_error_b1 > self.options.repro_loss_hard_clamp 425 | # Predicted coordinates beyond max distance. 426 | invalid_max_depth_b1 = pred_cam_coords_b31[:, 2] > self.options.depth_max 427 | 428 | # Invalid mask is the union of all these. Valid mask is the opposite. 429 | invalid_mask_b1 = (invalid_min_depth_b1 | invalid_repro_b1 | invalid_max_depth_b1) 430 | valid_mask_b1 = ~invalid_mask_b1 431 | 432 | # Reprojection error for all valid scene coordinates. 433 | valid_reprojection_error_b1 = reprojection_error_b1[valid_mask_b1] 434 | # Compute the loss for valid predictions. 435 | loss_valid = self.repro_loss.compute(valid_reprojection_error_b1, self.iteration) 436 | 437 | # Handle the invalid predictions: generate proxy coordinate targets with constant depth assumption. 438 | pixel_grid_crop_b31 = to_homogeneous(target_px_b2.unsqueeze(2)) 439 | target_camera_coords_b31 = self.options.depth_target * torch.bmm(invKs_b33, pixel_grid_crop_b31) 440 | 441 | # Compute the distance to target camera coordinates. 442 | invalid_mask_b11 = invalid_mask_b1.unsqueeze(2) 443 | loss_invalid = torch.abs(target_camera_coords_b31 - pred_cam_coords_b31).masked_select(invalid_mask_b11).sum() 444 | 445 | # Final loss is the sum of all 2. 446 | loss = loss_valid + loss_invalid 447 | loss /= batch_size 448 | 449 | # We need to check if the step actually happened, since the scaler might skip optimisation steps. 450 | old_optimizer_step = self.optimizer._step_count 451 | 452 | # Optimization steps. 453 | self.optimizer.zero_grad(set_to_none=True) 454 | self.scaler.scale(loss).backward() 455 | self.scaler.step(self.optimizer) 456 | self.scaler.update() 457 | 458 | if self.iteration % self.iterations_output == 0: 459 | # Print status. 
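# [Editor's sketch - not part of ace_trainer.py] The loss above projects each predicted
# scene coordinate with the ground-truth inverse pose and the intrinsics, and only applies
# the reprojection loss where the point lies in front of the camera, not too far away, and
# is not an extreme reprojection outlier. Single-point version of that test; the threshold
# values are illustrative stand-ins for the depth_min/depth_max/repro_loss_hard_clamp options:
import torch

depth_min, depth_max, repro_clamp = 0.1, 1000.0, 1000.0

X_h = torch.tensor([1.0, 2.0, 5.0, 1.0])              # predicted scene point, homogeneous
T_inv = torch.eye(4)[:3]                              # 3x4 world-to-camera pose
K = torch.tensor([[500.0, 0.0, 320.0],
                  [0.0, 500.0, 240.0],
                  [0.0, 0.0, 1.0]])
target_px = torch.tensor([420.0, 440.0])              # pixel the feature came from

cam = T_inv @ X_h                                     # camera coordinates (3,)
px_h = K @ cam
px = px_h[:2] / px_h[2].clamp(min=depth_min)          # dehomogenise, avoid div-by-zero
repro_error = (px - target_px).abs().sum()            # L1 reprojection error in pixels
valid = (cam[2] > depth_min) & (cam[2] < depth_max) & (repro_error <= repro_clamp)
print(f"pixel={px.tolist()}, error={repro_error.item():.1f}px, valid={bool(valid)}")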
460 | time_since_start = time.time() - self.training_start 461 | fraction_valid = float(valid_mask_b1.sum() / batch_size) 462 | # median_depth = float(pred_cam_coords_b31[:, 2].median()) 463 | 464 | _logger.info(f'Iteration: {self.iteration:6d} / Epoch {self.epoch:03d}|{self.options.epochs:03d}, ' 465 | f'Loss: {loss:.1f}, Valid: {fraction_valid * 100:.1f}%, Time: {time_since_start:.2f}s') 466 | 467 | if self.ace_visualizer is not None: 468 | vis_scene_coords = pred_scene_coords_b31.detach().cpu().squeeze().numpy() 469 | vis_errors = reprojection_error_b1.detach().cpu().squeeze().numpy() 470 | self.ace_visualizer.render_mapping_frame(vis_scene_coords, vis_errors) 471 | 472 | # Only step if the optimizer stepped and if we're not over-stepping the total_steps supported by the scheduler. 473 | if old_optimizer_step < self.optimizer._step_count < self.scheduler.total_steps: 474 | self.scheduler.step() 475 | 476 | def save_model(self): 477 | # NOTE: This would save the whole regressor (encoder weights included) in full precision floats (~30MB). 478 | # torch.save(self.regressor.state_dict(), self.options.output_map_file) 479 | 480 | # This saves just the head weights as half-precision floating point numbers for a total of ~4MB, as mentioned 481 | # in the paper. The scene-agnostic encoder weights can then be loaded from the pretrained encoder file. 482 | head_state_dict = self.regressor.heads.state_dict() 483 | for k, v in head_state_dict.items(): 484 | head_state_dict[k] = head_state_dict[k].half() 485 | torch.save(head_state_dict, self.options.output_map_file) 486 | _logger.info(f"Saved trained head weights to: {self.options.output_map_file}") -------------------------------------------------------------------------------- /ace_util.py: -------------------------------------------------------------------------------- 1 | # Copyright © Niantic, Inc. 2022. 2 | 3 | import numpy as np 4 | import torch 5 | 6 | def get_pixel_grid(subsampling_factor): 7 | """ 8 | Generate target pixel positions according to a subsampling factor, assuming prediction at center pixel. 9 | """ 10 | pix_range = torch.arange(np.ceil(5000 / subsampling_factor), dtype=torch.float32) 11 | yy, xx = torch.meshgrid(pix_range, pix_range, indexing='ij') 12 | return subsampling_factor * (torch.stack([xx, yy]) + 0.5) 13 | 14 | def to_homogeneous(input_tensor, dim=1): 15 | """ 16 | Converts tensor to homogeneous coordinates by adding ones to the specified dimension 17 | """ 18 | ones = torch.ones_like(input_tensor.select(dim, 0).unsqueeze(dim)) 19 | output = torch.cat([input_tensor, ones], dim=dim) 20 | return output -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import random 4 | from pathlib import Path 5 | 6 | import cv2 7 | import numpy as np 8 | import torch 9 | import torch.nn.functional as F 10 | import torchvision.transforms.functional as TF 11 | from skimage import color 12 | from skimage import io 13 | from skimage.transform import rotate, resize 14 | from torch.utils.data import Dataset 15 | from torch.utils.data.dataloader import default_collate 16 | from torchvision import transforms 17 | 18 | from ace_network import Regressor 19 | 20 | _logger = logging.getLogger(__name__) 21 | 22 | 23 | class CamLocDataset(Dataset): 24 | """Camera localization dataset. 25 | 26 | Access to image, calibration and ground truth data given a dataset directory. 
27 | """ 28 | 29 | def __init__(self, 30 | root_dir, 31 | mode=0, 32 | sparse=False, 33 | augment=False, 34 | aug_rotation=15, 35 | aug_scale_min=2 / 3, 36 | aug_scale_max=3 / 2, 37 | aug_black_white=0.1, 38 | aug_color=0.3, 39 | image_height=480, 40 | use_half=True, 41 | num_clusters=None, 42 | cluster_idx=None, 43 | ): 44 | """Constructor. 45 | 46 | Parameters: 47 | root_dir: Folder of the data (training or test). 48 | mode: 49 | 0 = RGB only, load no initialization targets. Default for the ACE paper. 50 | 1 = RGB + ground truth scene coordinates, load or generate ground truth scene coordinate targets 51 | 2 = RGB-D, load camera coordinates instead of scene coordinates 52 | sparse: for mode = 1 (RGB+GT SC), load sparse initialization targets when True, load dense depth maps and 53 | generate initialization targets when False 54 | augment: Use random data augmentation, note: not supported for mode = 2 (RGB-D) since pre-generated eye 55 | coordinates cannot be augmented 56 | aug_rotation: Max 2D image rotation angle, sampled uniformly around 0, both directions, degrees. 57 | aug_scale_min: Lower limit of image scale factor for uniform sampling 58 | aug_scale_min: Upper limit of image scale factor for uniform sampling 59 | aug_black_white: Max relative scale factor for image brightness/contrast sampling, e.g. 0.1 -> [0.9,1.1] 60 | aug_color: Max relative scale factor for image saturation/hue sampling, e.g. 0.1 -> [0.9,1.1] 61 | image_height: RGB images are rescaled to this maximum height (if augmentation is disabled, and in the range 62 | [aug_scale_min * image_height, aug_scale_max * image_height] otherwise). 63 | use_half: Enabled if training with half-precision floats. 64 | num_clusters: split the input frames into disjoint clusters using hierarchical clustering in order to train 65 | an ensemble model. Clustering is deterministic, so multiple training calls with the same number of 66 | target clusters will result in the same split. See the paper for details of the approach. Disabled by 67 | default. 68 | cluster_idx: If num_clusters is not None, then use this parameter to choose the cluster used for training. 69 | """ 70 | 71 | self.use_half = use_half 72 | 73 | self.init = (mode == 1) 74 | self.sparse = sparse 75 | self.eye = (mode == 2) 76 | 77 | self.image_height = image_height 78 | 79 | self.augment = augment 80 | self.aug_rotation = aug_rotation 81 | self.aug_scale_min = aug_scale_min 82 | self.aug_scale_max = aug_scale_max 83 | self.aug_black_white = aug_black_white 84 | self.aug_color = aug_color 85 | 86 | self.num_clusters = num_clusters 87 | self.cluster_idx = cluster_idx 88 | 89 | if self.num_clusters is not None: 90 | if self.num_clusters < 1: 91 | raise ValueError("num_clusters must be at least 1") 92 | 93 | if self.cluster_idx is None: 94 | raise ValueError("cluster_idx needs to be specified when num_clusters is set") 95 | 96 | if self.cluster_idx < 0 or self.cluster_idx >= self.num_clusters: 97 | raise ValueError(f"cluster_idx needs to be between 0 and {self.num_clusters - 1}") 98 | 99 | if self.eye and self.augment and (self.aug_rotation > 0 or self.aug_scale_min != 1 or self.aug_scale_max != 1): 100 | # pre-generated eye coordinates cannot be augmented 101 | _logger.warning("WARNING: Check your augmentation settings. Camera coordinates will not be augmented.") 102 | 103 | # Setup data paths. 104 | root_dir = Path(root_dir) 105 | 106 | # Main folders. 
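# [Editor's sketch - not part of dataset.py] One plausible way to instantiate the dataset
# for ACE mapping: RGB-only mode (mode=0) with augmentation, matching the defaults
# documented in the constructor above. It requires a scene folder prepared by one of the
# datasets/setup_*.py scripts; the path below is a placeholder, and the real dataloader
# wiring lives in ace_trainer.py and may differ in detail.
from dataset import CamLocDataset

mapping_data = CamLocDataset(
    root_dir="datasets/7scenes_chess/train",   # placeholder scene folder
    mode=0,                                    # RGB only, no scene-coordinate targets
    augment=True,
    aug_rotation=15,
    aug_scale_min=2 / 3,
    aug_scale_max=3 / 2,
    use_half=True,
)
image, mask, pose, pose_inv, K, K_inv, _, rgb_file = mapping_data[0]
print(image.shape, K)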
107 | rgb_dir = root_dir / 'rgb' 108 | pose_dir = root_dir / 'poses' 109 | calibration_dir = root_dir / 'calibration' 110 | 111 | # Optional folders. Unused in ACE. 112 | if self.eye: 113 | coord_dir = root_dir / 'eye' 114 | elif self.sparse: 115 | coord_dir = root_dir / 'init' 116 | else: 117 | coord_dir = root_dir / 'depth' 118 | 119 | # Find all images. The assumption is that it only contains image files. 120 | self.rgb_files = sorted(rgb_dir.iterdir()) 121 | 122 | # Find all ground truth pose files. One per image. 123 | self.pose_files = sorted(pose_dir.iterdir()) 124 | 125 | # Load camera calibrations. One focal length per image. 126 | self.calibration_files = sorted(calibration_dir.iterdir()) 127 | 128 | if self.init or self.eye: 129 | # Load GT scene coordinates. 130 | self.coord_files = sorted(coord_dir.iterdir()) 131 | else: 132 | self.coord_files = None 133 | 134 | if len(self.rgb_files) != len(self.pose_files): 135 | raise RuntimeError('RGB file count does not match pose file count!') 136 | 137 | if len(self.rgb_files) != len(self.calibration_files): 138 | raise RuntimeError('RGB file count does not match calibration file count!') 139 | 140 | if self.coord_files and len(self.rgb_files) != len(self.coord_files): 141 | raise RuntimeError('RGB file count does not match coordinate file count!') 142 | 143 | # Create grid of 2D pixel positions used when generating scene coordinates from depth. 144 | if self.init and not self.sparse: 145 | self.prediction_grid = self._create_prediction_grid() 146 | else: 147 | self.prediction_grid = None 148 | 149 | # Image transformations. Excluding scale since that can vary batch-by-batch. 150 | if self.augment: 151 | self.image_transform = transforms.Compose([ 152 | # transforms.ToPILImage(), 153 | # transforms.Resize(int(self.image_height * scale_factor)), 154 | transforms.Grayscale(), 155 | transforms.ColorJitter(brightness=self.aug_black_white, contrast=self.aug_black_white), 156 | # saturation=self.aug_color, hue=self.aug_color), # Disable colour augmentation. 157 | transforms.ToTensor(), 158 | transforms.Normalize( 159 | mean=[0.4], # statistics calculated over 7scenes training set, should generalize fairly well 160 | std=[0.25] 161 | ), 162 | ]) 163 | else: 164 | self.image_transform = transforms.Compose([ 165 | # transforms.ToPILImage(), 166 | # transforms.Resize(self.image_height), 167 | transforms.Grayscale(), 168 | transforms.ToTensor(), 169 | transforms.Normalize( 170 | mean=[0.4], # statistics calculated over 7scenes training set, should generalize fairly well 171 | std=[0.25] 172 | ), 173 | ]) 174 | 175 | # We use this to iterate over all frames. If clustering is enabled this is used to filter them. 176 | self.valid_file_indices = np.arange(len(self.rgb_files)) 177 | 178 | # If clustering is enabled. 179 | if self.num_clusters is not None: 180 | _logger.info(f"Clustering the {len(self.rgb_files)} into {num_clusters} clusters.") 181 | _, _, cluster_labels = self._cluster(num_clusters) 182 | 183 | self.valid_file_indices = np.flatnonzero(cluster_labels == cluster_idx) 184 | _logger.info(f"After clustering, chosen cluster: {cluster_idx}, Using {len(self.valid_file_indices)} images.") 185 | 186 | # Calculate mean camera center (using the valid frames only). 187 | self.mean_cam_center = self._compute_mean_camera_center() 188 | 189 | @staticmethod 190 | def _create_prediction_grid(): 191 | # Assumes all input images have a resolution smaller than 5000x5000. 
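# [Editor's sketch - not part of dataset.py] The nested loop in _create_prediction_grid
# below simply tabulates, for every output cell (y, x), the corresponding input-pixel
# position: value = index * OUTPUT_SUBSAMPLE. A vectorised equivalent, with the network's
# subsampling factor replaced by a stand-in constant:
import math
import numpy as np

OUTPUT_SUBSAMPLE = 8                      # stand-in for Regressor.OUTPUT_SUBSAMPLE
size = math.ceil(5000 / OUTPUT_SUBSAMPLE)
xx, yy = np.meshgrid(np.arange(size), np.arange(size))   # xx varies along columns
grid = np.stack([xx, yy]).astype(np.float64) * OUTPUT_SUBSAMPLE
assert grid[0, 3, 7] == 7 * OUTPUT_SUBSAMPLE and grid[1, 3, 7] == 3 * OUTPUT_SUBSAMPLE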
192 | prediction_grid = np.zeros((2, 193 | math.ceil(5000 / Regressor.OUTPUT_SUBSAMPLE), 194 | math.ceil(5000 / Regressor.OUTPUT_SUBSAMPLE))) 195 | 196 | for x in range(0, prediction_grid.shape[2]): 197 | for y in range(0, prediction_grid.shape[1]): 198 | prediction_grid[0, y, x] = x * Regressor.OUTPUT_SUBSAMPLE 199 | prediction_grid[1, y, x] = y * Regressor.OUTPUT_SUBSAMPLE 200 | 201 | return prediction_grid 202 | 203 | @staticmethod 204 | def _resize_image(image, image_height): 205 | # Resize a numpy image as PIL. Works slightly better than resizing the tensor using torch's internal function. 206 | image = TF.to_pil_image(image) 207 | image = TF.resize(image, image_height) 208 | return image 209 | 210 | @staticmethod 211 | def _rotate_image(image, angle, order, mode='constant'): 212 | # Image is a torch tensor (CxHxW), convert it to numpy as HxWxC. 213 | image = image.permute(1, 2, 0).numpy() 214 | # Apply rotation. 215 | image = rotate(image, angle, order=order, mode=mode) 216 | # Back to torch tensor. 217 | image = torch.from_numpy(image).permute(2, 0, 1).float() 218 | return image 219 | 220 | def _cluster(self, num_clusters): 221 | """ 222 | Clusters the dataset using hierarchical kMeans. 223 | Initialization: 224 | Put all images in one cluster. 225 | Interate: 226 | Pick largest cluster. 227 | Split with kMeans and k=2. 228 | Input for kMeans is the 3D median scene coordiante per image. 229 | Terminate: 230 | When number of target clusters has been reached. 231 | Returns: 232 | cam_centers: For each cluster the mean (not median) scene coordinate 233 | labels: For each image the cluster ID 234 | """ 235 | num_images = len(self.pose_files) 236 | _logger.info(f'Clustering a dataset with {num_images} frames into {num_clusters} clusters.') 237 | 238 | # A tensor holding all camera centers used for clustering. 239 | cam_centers = np.zeros((num_images, 3), dtype=np.float32) 240 | for i in range(num_images): 241 | pose = self._load_pose(i) 242 | cam_centers[i] = pose[:3, 3] 243 | 244 | # Setup kMEans 245 | criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.1) 246 | flags = cv2.KMEANS_PP_CENTERS 247 | 248 | # Label of next cluster. 249 | label_counter = 0 250 | 251 | # Initialise list of clusters with all images. 252 | clusters = [] 253 | clusters.append((cam_centers, label_counter, np.zeros(3))) 254 | 255 | # All images belong to cluster 0. 256 | labels = np.zeros(num_images) 257 | 258 | # iterate kMeans with k=2 259 | while len(clusters) < num_clusters: 260 | # Select largest cluster (list is sorted). 261 | cur_cluster = clusters.pop(0) 262 | label_counter += 1 263 | 264 | # Split cluster. 265 | cur_error, cur_labels, cur_centroids = cv2.kmeans(cur_cluster[0], 2, None, criteria, 10, flags) 266 | 267 | # Update cluster list. 268 | cur_mask = (cur_labels == 0)[:, 0] 269 | cur_cam_centers0 = cur_cluster[0][cur_mask, :] 270 | clusters.append((cur_cam_centers0, cur_cluster[1], cur_centroids[0])) 271 | 272 | cur_mask = (cur_labels == 1)[:, 0] 273 | cur_cam_centers1 = cur_cluster[0][cur_mask, :] 274 | clusters.append((cur_cam_centers1, label_counter, cur_centroids[1])) 275 | 276 | cluster_labels = labels[labels == cur_cluster[1]] 277 | cluster_labels[cur_mask] = label_counter 278 | labels[labels == cur_cluster[1]] = cluster_labels 279 | 280 | # Sort updated list. 
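# [Editor's sketch - not part of dataset.py] Stripped-down version of the hierarchical
# clustering in _cluster() above, without the label bookkeeping: keep splitting the
# currently largest cluster with k=2 k-means until the requested number of clusters is
# reached. Toy camera centres, same OpenCV call as in the method:
import cv2
import numpy as np

rng = np.random.default_rng(0)
cam_centers = rng.normal(size=(200, 3)).astype(np.float32)   # toy camera centres
num_clusters = 4

criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.1)
clusters = [cam_centers]
while len(clusters) < num_clusters:
    clusters.sort(key=len, reverse=True)          # always split the largest cluster
    largest = clusters.pop(0)
    _, labels, _ = cv2.kmeans(largest, 2, None, criteria, 10, cv2.KMEANS_PP_CENTERS)
    labels = labels.ravel()
    clusters += [largest[labels == 0], largest[labels == 1]]

print([len(c) for c in clusters])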
281 | clusters = sorted(clusters, key=lambda cluster: cluster[0].shape[0], reverse=True) 282 | 283 | # clusters are sorted but cluster indices are random, remap cluster indices to sorted indices 284 | remapped_labels = np.zeros(num_images) 285 | remapped_clusters = [] 286 | 287 | for cluster_idx_new, cluster in enumerate(clusters): 288 | cluster_idx_old = cluster[1] 289 | remapped_labels[labels == cluster_idx_old] = cluster_idx_new 290 | remapped_clusters.append((cluster[0], cluster_idx_new, cluster[2])) 291 | 292 | labels = remapped_labels 293 | clusters = remapped_clusters 294 | 295 | cluster_centers = np.zeros((num_clusters, 3)) 296 | cluster_sizes = np.zeros((num_clusters, 1)) 297 | 298 | for cluster in clusters: 299 | # Compute distance of each cam to the center of the cluster. 300 | cam_num = cluster[0].shape[0] 301 | cam_data = np.zeros((cam_num, 3)) 302 | cam_count = 0 303 | 304 | # First compute the center of the cluster (mean). 305 | for i, cam_center in enumerate(cam_centers): 306 | if labels[i] == cluster[1]: 307 | cam_data[cam_count] = cam_center 308 | cam_count += 1 309 | 310 | cluster_centers[cluster[1]] = cam_data.mean(0) 311 | 312 | # Compute the distance of each cam from the cluster center. Then average and square. 313 | cam_dists = np.broadcast_to(cluster_centers[cluster[1]][np.newaxis, :], (cam_num, 3)) 314 | cam_dists = cam_data - cam_dists 315 | cam_dists = np.linalg.norm(cam_dists, axis=1) 316 | cam_dists = cam_dists ** 2 317 | 318 | cluster_sizes[cluster[1]] = cam_dists.mean() 319 | 320 | _logger.info("Cluster %i: %.1fm, %.1fm, %.1fm, images: %i, mean squared dist: %f" % ( 321 | cluster[1], cluster_centers[cluster[1]][0], cluster_centers[cluster[1]][1], cluster_centers[cluster[1]][2], 322 | cluster[0].shape[0], cluster_sizes[cluster[1]])) 323 | 324 | _logger.info('Clustering done.') 325 | 326 | return cluster_centers, cluster_sizes, labels 327 | 328 | def _compute_mean_camera_center(self): 329 | mean_cam_center = torch.zeros((3,)) 330 | 331 | for idx in self.valid_file_indices: 332 | pose = self._load_pose(idx) 333 | 334 | # Get the translation component. 335 | mean_cam_center += pose[0:3, 3] 336 | 337 | # Avg. 338 | mean_cam_center /= len(self) 339 | return mean_cam_center 340 | 341 | def _load_image(self, idx): 342 | image = io.imread(self.rgb_files[idx]) 343 | 344 | if len(image.shape) < 3: 345 | # Convert to RGB if needed. 346 | image = color.gray2rgb(image) 347 | 348 | return image 349 | 350 | def _load_pose(self, idx): 351 | # Stored as a 4x4 matrix. 352 | pose = np.loadtxt(self.pose_files[idx]) 353 | pose = torch.from_numpy(pose).float() 354 | 355 | return pose 356 | 357 | def _get_single_item(self, idx, image_height): 358 | # Apply index indirection. 359 | idx = self.valid_file_indices[idx] 360 | 361 | # Load image. 362 | image = self._load_image(idx) 363 | 364 | # Load intrinsics. 365 | k = np.loadtxt(self.calibration_files[idx]) 366 | if k.size == 1: 367 | focal_length = float(k) 368 | centre_point = None 369 | elif k.shape == (3, 3): 370 | k = k.tolist() 371 | focal_length = [k[0][0], k[1][1]] 372 | centre_point = [k[0][2], k[1][2]] 373 | else: 374 | raise Exception("Calibration file must contain either a 3x3 camera \ 375 | intrinsics matrix or a single float giving the focal length \ 376 | of the camera.") 377 | 378 | # The image will be scaled to image_height, adjust focal length as well. 
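# [Editor's sketch - not part of dataset.py] When an image is rescaled to image_height,
# the intrinsics must be rescaled by the same factor: focal length(s) and principal point
# all get multiplied by new_height / old_height. Standalone version of what the scaling
# below plus the intrinsics assembly at the end of _get_single_item computes, for a full
# 3x3 calibration matrix with illustrative numbers:
import torch

K_orig = torch.tensor([[1000.0, 0.0, 960.0],
                       [0.0, 1000.0, 540.0],
                       [0.0, 0.0, 1.0]])   # e.g. a 1920x1080 camera
old_h, new_h = 1080, 480
s = new_h / old_h

K_scaled = K_orig.clone()
K_scaled[:2] *= s        # scales fx, fy, cx, cy; last row stays [0, 0, 1]
print(K_scaled)          # fx = fy ~ 444.4, cx ~ 426.7, cy = 240.0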
379 | f_scale_factor = image_height / image.shape[0] 380 | if centre_point: 381 | centre_point = [c * f_scale_factor for c in centre_point] 382 | focal_length = [f * f_scale_factor for f in focal_length] 383 | else: 384 | focal_length *= f_scale_factor 385 | 386 | # Rescale image. 387 | image = self._resize_image(image, image_height) 388 | 389 | # Create mask of the same size as the resized image (it's a PIL image at this point). 390 | image_mask = torch.ones((1, image.size[1], image.size[0])) 391 | 392 | # Apply remaining transforms. 393 | image = self.image_transform(image) 394 | 395 | # Load pose. 396 | pose = self._load_pose(idx) 397 | 398 | # Load ground truth scene coordinates, if needed. 399 | if self.init: 400 | if self.sparse: 401 | coords = torch.load(self.coord_files[idx]) 402 | else: 403 | depth = io.imread(self.coord_files[idx]) 404 | depth = depth.astype(np.float64) 405 | depth /= 1000 # from millimeters to meters 406 | elif self.eye: 407 | coords = torch.load(self.coord_files[idx]) 408 | else: 409 | coords = 0 # Default for ACE, we don't need them. 410 | 411 | # Apply data augmentation if necessary. 412 | if self.augment: 413 | # Generate a random rotation angle. 414 | angle = random.uniform(-self.aug_rotation, self.aug_rotation) 415 | 416 | # Rotate input image and mask. 417 | image = self._rotate_image(image, angle, 1, 'reflect') 418 | image_mask = self._rotate_image(image_mask, angle, order=1, mode='constant') 419 | 420 | # If we loaded the GT scene coordinates. 421 | if self.init: 422 | if self.sparse: 423 | # rotate and scale initalization targets 424 | coords_w = math.ceil(image.size(2) / Regressor.OUTPUT_SUBSAMPLE) 425 | coords_h = math.ceil(image.size(1) / Regressor.OUTPUT_SUBSAMPLE) 426 | coords = F.interpolate(coords.unsqueeze(0), size=(coords_h, coords_w))[0] 427 | 428 | coords = self._rotate_image(coords, angle, 0) 429 | else: 430 | # rotate and scale depth maps 431 | depth = resize(depth, image.shape[1:], order=0) 432 | depth = rotate(depth, angle, order=0, mode='constant') 433 | 434 | # Rotate ground truth camera pose as well. 435 | angle = angle * math.pi / 180. 436 | # Create a rotation matrix. 437 | pose_rot = torch.eye(4) 438 | pose_rot[0, 0] = math.cos(angle) 439 | pose_rot[0, 1] = -math.sin(angle) 440 | pose_rot[1, 0] = math.sin(angle) 441 | pose_rot[1, 1] = math.cos(angle) 442 | 443 | # Apply rotation matrix to the ground truth camera pose. 444 | pose = torch.matmul(pose, pose_rot) 445 | 446 | # Not used for ACE. 
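# [Editor's sketch - not part of dataset.py] The augmentation above rotates the image
# in-plane by `angle` and compensates by right-multiplying the cam-to-world pose with a
# roll rotation about the optical axis. Tiny standalone check that the matrix built in the
# code is a proper rotation and that rotating by -angle undoes the update:
import math
import torch

def roll_rotation(angle_deg):
    a = math.radians(angle_deg)
    pose_rot = torch.eye(4)
    pose_rot[0, 0], pose_rot[0, 1] = math.cos(a), -math.sin(a)
    pose_rot[1, 0], pose_rot[1, 1] = math.sin(a), math.cos(a)
    return pose_rot

pose = torch.eye(4)
pose[:3, 3] = torch.tensor([1.0, 2.0, 3.0])          # toy cam-to-world pose
angle = 12.5
augmented = pose @ roll_rotation(angle)
restored = augmented @ roll_rotation(-angle)
assert torch.allclose(restored, pose, atol=1e-6)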
447 | if self.init and not self.sparse: 448 | # generate initialization targets from depth map 449 | offsetX = int(Regressor.OUTPUT_SUBSAMPLE / 2) 450 | offsetY = int(Regressor.OUTPUT_SUBSAMPLE / 2) 451 | 452 | coords = torch.zeros(( 453 | 3, 454 | math.ceil(image.shape[1] / Regressor.OUTPUT_SUBSAMPLE), 455 | math.ceil(image.shape[2] / Regressor.OUTPUT_SUBSAMPLE))) 456 | 457 | # subsample to network output size 458 | depth = depth[offsetY::Regressor.OUTPUT_SUBSAMPLE, offsetX::Regressor.OUTPUT_SUBSAMPLE] 459 | 460 | # construct x and y coordinates of camera coordinate 461 | xy = self.prediction_grid[:, :depth.shape[0], :depth.shape[1]].copy() 462 | # add random pixel shift 463 | xy[0] += offsetX 464 | xy[1] += offsetY 465 | # substract principal point (assume image center) 466 | xy[0] -= image.shape[2] / 2 467 | xy[1] -= image.shape[1] / 2 468 | # reproject 469 | xy /= focal_length 470 | xy[0] *= depth 471 | xy[1] *= depth 472 | 473 | # assemble camera coordinates tensor 474 | eye = np.ndarray((4, depth.shape[0], depth.shape[1])) 475 | eye[0:2] = xy 476 | eye[2] = depth 477 | eye[3] = 1 478 | 479 | # eye to scene coordinates 480 | sc = np.matmul(pose.numpy(), eye.reshape(4, -1)) 481 | sc = sc.reshape(4, depth.shape[0], depth.shape[1]) 482 | 483 | # mind pixels with invalid depth 484 | sc[:, depth == 0] = 0 485 | sc[:, depth > 1000] = 0 486 | sc = torch.from_numpy(sc[0:3]) 487 | 488 | coords[:, :sc.shape[1], :sc.shape[2]] = sc 489 | 490 | # Convert to half if needed. 491 | if self.use_half and torch.cuda.is_available(): 492 | image = image.half() 493 | 494 | # Binarize the mask. 495 | image_mask = image_mask > 0 496 | 497 | # Invert the pose. 498 | pose_inv = pose.inverse() 499 | 500 | # Create the intrinsics matrix. 501 | intrinsics = torch.eye(3) 502 | 503 | # Hardcode the principal point to the centre of the image unless otherwise specified. 504 | if centre_point: 505 | intrinsics[0, 0] = focal_length[0] 506 | intrinsics[1, 1] = focal_length[1] 507 | intrinsics[0, 2] = centre_point[0] 508 | intrinsics[1, 2] = centre_point[1] 509 | else: 510 | intrinsics[0, 0] = focal_length 511 | intrinsics[1, 1] = focal_length 512 | intrinsics[0, 2] = image.shape[2] / 2 513 | intrinsics[1, 2] = image.shape[1] / 2 514 | 515 | # Also need the inverse. 516 | intrinsics_inv = intrinsics.inverse() 517 | 518 | return image, image_mask, pose, pose_inv, intrinsics, intrinsics_inv, coords, str(self.rgb_files[idx]) 519 | 520 | def __len__(self): 521 | return len(self.valid_file_indices) 522 | 523 | def __getitem__(self, idx): 524 | if self.augment: 525 | scale_factor = random.uniform(self.aug_scale_min, self.aug_scale_max) 526 | # scale_factor = 1 / scale_factor #inverse scale sampling, not used for ACE mapping 527 | else: 528 | scale_factor = 1 529 | 530 | # Target image height. We compute it here in case we are asked for a full batch of tensors because we need 531 | # to apply the same scale factor to all of them. 532 | image_height = int(self.image_height * scale_factor) 533 | 534 | if type(idx) == list: 535 | # Whole batch. 536 | tensors = [self._get_single_item(i, image_height) for i in idx] 537 | return default_collate(tensors) 538 | else: 539 | # Single element. 
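# [Editor's sketch - not part of dataset.py] __getitem__ below accepts a *list* of indices
# and collates the batch itself, so that one random scale factor is shared by the whole
# batch (all tensors in a batch must end up the same size). One plausible way to drive
# that is to hand the DataLoader batched indices via a BatchSampler and disable its own
# batching; the actual wiring in ace_trainer.py may differ in detail.
from torch.utils.data import DataLoader
from torch.utils.data.sampler import BatchSampler, RandomSampler

# `mapping_data` is assumed to be a CamLocDataset instance (see the earlier sketch).
batch_sampler = BatchSampler(RandomSampler(mapping_data), batch_size=8, drop_last=False)
loader = DataLoader(mapping_data, sampler=batch_sampler, batch_size=None)
for image_B1HW, mask_B1HW, pose_B44, pose_inv_B44, K_B33, K_inv_B33, _, _ in loader:
    print(image_B1HW.shape)   # e.g. torch.Size([8, 1, H, W]) with one shared scale
    break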
540 | return self._get_single_item(idx, image_height) 541 | -------------------------------------------------------------------------------- /datasets/dataset_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.spatial.transform import Rotation as Rotation 4 | 5 | def mkdir(directory): 6 | """Checks whether the directory exists and creates it if necessacy.""" 7 | if not os.path.exists(directory): 8 | os.makedirs(directory) 9 | 10 | def dlheidata(doi, filename): 11 | """Download and unpack data from heiData host.""" 12 | os.system(f"wget https://heidata.uni-heidelberg.de/api/access/datafile/:persistentId?persistentId=doi:{doi} -O {filename}") 13 | os.system(f"tar -xvzf {filename}") 14 | os.system(f"rm {filename}") 15 | 16 | 17 | def get_base_file_name(file_name): 18 | ''' 19 | Extracts the base file name by removing file extension and modality string. 20 | eg. frame-000000.color.jpg -> frame-000000 21 | ''' 22 | 23 | base_file = os.path.splitext(file_name)[0] 24 | base_file = os.path.splitext(base_file)[0] 25 | 26 | return base_file 27 | 28 | def read_pose_data(file_name): 29 | ''' 30 | Expects path to file with one pose per line. 31 | Input pose is expected to map world to camera coordinates. 32 | Output pose maps camera to world coordinates. 33 | Pose format: file qw qx qy qz tx ty tz (f) 34 | Return dictionary that maps a file name to a tuple of (4x4 pose, focal_length) 35 | Sets focal_length to None if not contained in file. 36 | ''' 37 | 38 | with open(file_name, "r") as f: 39 | pose_data = f.readlines() 40 | 41 | # create a dict from the poses with file name as key 42 | pose_dict = {} 43 | for pose_string in pose_data: 44 | 45 | pose_string = pose_string.split() 46 | file_name = pose_string[0] 47 | 48 | pose_q = np.array(pose_string[1:5]) 49 | pose_q = np.array([pose_q[1], pose_q[2], pose_q[3], pose_q[0]]) 50 | pose_t = np.array(pose_string[5:8]) 51 | pose_R = Rotation.from_quat(pose_q).as_matrix() 52 | 53 | pose_4x4 = np.identity(4) 54 | pose_4x4[0:3, 0:3] = pose_R 55 | pose_4x4[0:3, 3] = pose_t 56 | 57 | # convert world->cam to cam->world for evaluation 58 | pose_4x4 = np.linalg.inv(pose_4x4) 59 | 60 | if len(pose_string) > 8: 61 | focal_length = float(pose_string[8]) 62 | else: 63 | focal_length = None 64 | 65 | pose_dict[get_base_file_name(file_name)] = (pose_4x4, focal_length) 66 | 67 | return pose_dict 68 | 69 | def write_cam_pose(file_path, cam_pose): 70 | ''' 71 | Writes 4x4 camera pose to a human readable text file. 72 | ''' 73 | with open(file_path, 'w') as f: 74 | f.write(str(float(cam_pose[0, 0])) + ' ' + str(float(cam_pose[0, 1])) + ' ' + str( 75 | float(cam_pose[0, 2])) + ' ' + str(float(cam_pose[0, 3])) + '\n') 76 | f.write(str(float(cam_pose[1, 0])) + ' ' + str(float(cam_pose[1, 1])) + ' ' + str( 77 | float(cam_pose[1, 2])) + ' ' + str(float(cam_pose[1, 3])) + '\n') 78 | f.write(str(float(cam_pose[2, 0])) + ' ' + str(float(cam_pose[2, 1])) + ' ' + str( 79 | float(cam_pose[2, 2])) + ' ' + str(float(cam_pose[2, 3])) + '\n') 80 | f.write(str(float(cam_pose[3, 0])) + ' ' + str(float(cam_pose[3, 1])) + ' ' + str( 81 | float(cam_pose[3, 2])) + ' ' + str(float(cam_pose[3, 3])) + '\n') 82 | 83 | def write_focal_length(file_path, focal_length): 84 | ''' 85 | Write the focal length to a human readable text file. 
86 | ''' 87 | with open(file_path, 'w') as f: 88 | f.write(str(focal_length)) 89 | 90 | def clone_external_pose_files(): 91 | ''' 92 | Clone repository containing SfM pose files for 7Scenes and 12Scenes. 93 | 94 | From paper: 95 | On the Limits of Pseudo Ground Truth in Visual Camera Re-localisation 96 | Eric Brachmann, Martin Humenberger, Carsten Rother, Torsten Sattler 97 | ICCV 2021 98 | 99 | @return folder name of the repo 100 | ''' 101 | repo_url = "https://github.com/tsattler/visloc_pseudo_gt_limitations.git" 102 | repo_folder = os.path.splitext(os.path.basename(repo_url))[0] 103 | 104 | if not os.path.exists(repo_folder): 105 | os.system('git clone https://github.com/tsattler/visloc_pseudo_gt_limitations.git') 106 | 107 | return os.path.join(repo_folder, 'pgt', 'sfm') 108 | -------------------------------------------------------------------------------- /datasets/setup_12scenes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import zipfile 6 | 7 | import dataset_util as dutil 8 | import numpy as np 9 | import torch 10 | from joblib import Parallel, delayed 11 | from skimage import io 12 | 13 | # name of the folder where we download the original 12scenes dataset to 14 | # we restructure the dataset by creating symbolic links to that folder 15 | src_folder = '12scenes_source' 16 | 17 | # sub sampling factor of eye coordinate tensor 18 | nn_subsampling = 8 19 | 20 | if __name__ == '__main__': 21 | parser = argparse.ArgumentParser( 22 | description='Download and setup the 12Scenes dataset.', 23 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 24 | 25 | parser.add_argument('--depth', type=str, choices=['none', 'rendered', 'original'], default='none', 26 | help='none: ignore depth maps; rendered: download depth rendered using 3D scene model (18GB), original: original depth maps') 27 | 28 | parser.add_argument('--eye', type=str, choices=['none', 'original'], default='none', 29 | help='none: ignore eye coordinates; original: precompute eye coordinates from original depth maps') 30 | 31 | parser.add_argument('--poses', type=str, choices=['original', 'pgt'], default='original', 32 | help='original: original pose files; ' 33 | 'pgt: get SfM poses from external repository (Brachmann et al., ICCV21)') 34 | 35 | opt = parser.parse_args() 36 | 37 | if opt.depth == 'rendered' and opt.poses == 'pgt': 38 | print("Sorry. Rendered depth files are not compatible with SfM pose files, " 39 | "since both have missing frames and we would have to figure out the intersection. " 40 | "It can be done, but is not supported atm.") 41 | exit() 42 | 43 | print("\n#####################################################################") 44 | print("# Please make sure to check this dataset's license before using it! #") 45 | print("# http://graphics.stanford.edu/projects/reloc/ #") 46 | print("#####################################################################\n\n") 47 | 48 | license_response = input('Please confirm with "yes" or abort. ') 49 | if not (license_response == "yes" or license_response == "y"): 50 | print(f"Your response: {license_response}. Aborting.") 51 | exit() 52 | 53 | if opt.poses == 'pgt': 54 | 55 | print("\n###################################################################") 56 | print("# You requested external pose files. 
Please check the license at: #") 57 | print("# https://github.com/tsattler/visloc_pseudo_gt_limitations #") 58 | print("###################################################################\n\n") 59 | 60 | license_response = input('Please confirm with "yes" or abort. ') 61 | if not (license_response == "yes" or license_response == "y"): 62 | print(f"Your response: {license_response}. Aborting.") 63 | exit() 64 | 65 | print("Getting external pose files...") 66 | external_pgt_folder = dutil.clone_external_pose_files() 67 | 68 | # download the original 12 scenes dataset for calibration, poses and images 69 | dutil.mkdir(src_folder) 70 | os.chdir(src_folder) 71 | 72 | for ds in ['apt1', 'apt2', 'office1', 'office2']: 73 | 74 | if not os.path.exists(ds): 75 | 76 | print("=== Downloading 12scenes Data:", ds, "===============================") 77 | 78 | os.system('wget http://graphics.stanford.edu/projects/reloc/data/' + ds + '.zip') 79 | 80 | # unpack and delete zip file 81 | f = zipfile.PyZipFile(ds + '.zip') 82 | f.extractall() 83 | 84 | os.system('rm ' + ds + '.zip') 85 | 86 | else: 87 | print(f"Found data of scene {ds} already. Assuming its complete and skipping download.") 88 | 89 | 90 | def process_dataset(ds): 91 | 92 | scenes = os.listdir(ds) 93 | 94 | for scene in scenes: 95 | 96 | data_folder = ds + '/' + scene + '/data/' 97 | 98 | if not os.path.isdir(data_folder): 99 | # skip README files 100 | continue 101 | 102 | print("Linking files for 12scenes_" + ds + "_" + scene + "...") 103 | 104 | if opt.poses == 'pgt': 105 | target_folder = '../pgt_12scenes_' + ds + '_' + scene + '/' 106 | else: 107 | target_folder = '../12scenes_' + ds + '_' + scene + '/' 108 | 109 | # create subfolders for training and test 110 | dutil.mkdir(target_folder + 'test/rgb/') 111 | dutil.mkdir(target_folder + 'test/poses/') 112 | dutil.mkdir(target_folder + 'test/calibration/') 113 | if opt.depth == 'original': 114 | dutil.mkdir(target_folder + 'test/depth/') 115 | if opt.eye == 'original': 116 | dutil.mkdir(target_folder + 'test/eye/') 117 | 118 | dutil.mkdir(target_folder + 'train/rgb/') 119 | dutil.mkdir(target_folder + 'train/poses/') 120 | dutil.mkdir(target_folder + 'train/calibration/') 121 | if opt.depth == 'original': 122 | dutil.mkdir(target_folder + 'train/depth/') 123 | if opt.eye == 'original': 124 | dutil.mkdir(target_folder + 'train/eye/') 125 | 126 | # read the train / test split - the first sequence is used for testing, everything else for training 127 | with open(ds + '/' + scene + '/split.txt', 'r') as f: 128 | split = f.readlines() 129 | split = int(split[0].split()[1][8:-1]) 130 | 131 | # read the calibration parameters 132 | with open(ds + '/' + scene + '/info.txt', 'r') as f: 133 | calibration_info = f.readlines() 134 | 135 | im_h = int(calibration_info[3].split()[2]) 136 | 137 | focallength = calibration_info[7].split() 138 | focallength = (float(focallength[2]) + float(focallength[7])) / 2 139 | 140 | files = os.listdir(data_folder) 141 | 142 | images = [f for f in files if f.endswith('color.jpg')] 143 | images.sort() 144 | 145 | poses = [f for f in files if f.endswith('pose.txt')] 146 | poses.sort() 147 | 148 | if opt.depth == 'original' or opt.eye == 'original': 149 | depth_maps = [f for f in files if f.endswith('depth.png')] 150 | depth_maps.sort() 151 | 152 | # read external poses and calibration if requested 153 | if opt.poses == 'pgt': 154 | pgt_file = os.path.join('..', external_pgt_folder, '12scenes', f'{ds}_{scene}_test.txt') 155 | pgt_test_poses = dutil.read_pose_data(pgt_file) 
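# [Editor's sketch - not part of setup_12scenes.py] The external pose files read here via
# dataset_util.read_pose_data store one "file qw qx qy qz tx ty tz (f)" line per frame as
# a world-to-camera pose; the helper converts each line to a 4x4 cam-to-world matrix.
# Standalone version of that conversion for a single made-up line:
import numpy as np
from scipy.spatial.transform import Rotation

line = "seq0/frame-000000.color.jpg 1 0 0 0 0.5 0.0 1.0 525.0"   # made-up example
tok = line.split()
qw, qx, qy, qz = map(float, tok[1:5])
t = np.array(list(map(float, tok[5:8])))

world_to_cam = np.identity(4)
world_to_cam[:3, :3] = Rotation.from_quat([qx, qy, qz, qw]).as_matrix()  # scipy wants xyzw
world_to_cam[:3, 3] = t
cam_to_world = np.linalg.inv(world_to_cam)
focal_length = float(tok[8]) if len(tok) > 8 else None
print(cam_to_world[:3, 3], focal_length)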
156 | pgt_file = os.path.join('..', external_pgt_folder, '12scenes', f'{ds}_{scene}_train.txt') 157 | pgt_train_poses = dutil.read_pose_data(pgt_file) 158 | else: 159 | pgt_test_poses = None 160 | pgt_train_poses = None 161 | 162 | def link_frame(i, variant, pgt_poses): 163 | """ Links calibration, pose and image of frame i in either test or training. """ 164 | 165 | # some image have invalid pose files, skip those 166 | if opt.poses == 'pgt': 167 | valid = os.path.join('data', dutil.get_base_file_name(poses[i])) in pgt_poses 168 | else: 169 | 170 | valid = True 171 | with open(ds + '/' + scene + '/data/' + poses[i], 'r') as f: 172 | pose = f.readlines() 173 | for line in pose: 174 | if 'INF' in line: 175 | valid = False 176 | 177 | if not valid: 178 | print("Skipping frame", i, "(" + variant + ") - Corrupt pose.") 179 | else: 180 | # link image 181 | os.system('ln -s ../../../' + src_folder + '/' + data_folder + '/' + images[ 182 | i] + ' ' + target_folder + variant + '/rgb/') 183 | 184 | if opt.poses == 'pgt': 185 | cam_pose, _ = pgt_poses[os.path.join('data', dutil.get_base_file_name(poses[i]))] 186 | dutil.write_cam_pose(target_folder + variant + '/poses/' + poses[i], cam_pose) 187 | else: 188 | # link pose 189 | os.system('ln -s ../../../' + src_folder + '/' + data_folder + '/' + poses[ 190 | i] + ' ' + target_folder + variant + '/poses/') 191 | 192 | # create a calibration file 193 | with open(target_folder + variant + '/calibration/frame-%s.calibration.txt' % str(i).zfill(6), 194 | 'w') as f: 195 | f.write(str(focallength)) 196 | 197 | if opt.depth == 'original': 198 | # link original depth files 199 | os.system('ln -s ../../../' + src_folder + '/' + data_folder + '/' + depth_maps[ 200 | i] + ' ' + target_folder + variant + '/depth/') 201 | 202 | if opt.eye == 'original': 203 | 204 | depth = io.imread(data_folder + '/' + depth_maps[i]) 205 | depth = depth.astype(np.float64) 206 | depth /= 1000 # from millimeters to meters 207 | 208 | d_h = depth.shape[0] 209 | d_w = depth.shape[1] 210 | 211 | # get RGB focal length and adjust to depth resolution 212 | if opt.poses == 'pgt': 213 | _, rgb_f = pgt_poses[os.path.join('data', dutil.get_base_file_name(poses[i]))] 214 | else: 215 | rgb_f = focallength 216 | 217 | d_f = rgb_f * d_h / im_h 218 | 219 | # generate sub-sampled eye coordinate tensor from calibrated depth 220 | out_h = int(d_h / nn_subsampling) 221 | out_w = int(d_w / nn_subsampling) 222 | nn_offset = int(nn_subsampling / 2) 223 | 224 | eye_tensor = np.zeros((3, out_h, out_w)) 225 | 226 | # generate pixel coordinates 227 | eye_tensor[0] = np.dstack([np.arange(0, out_w)] * out_h)[0].T * nn_subsampling + nn_offset 228 | eye_tensor[1] = np.dstack([np.arange(0, out_h)] * out_w)[0] * nn_subsampling + nn_offset 229 | 230 | # substract principal point 231 | eye_tensor[0] -= d_w / 2 232 | eye_tensor[1] -= d_h / 2 233 | 234 | # subsample depth 235 | depth = depth[nn_offset::nn_subsampling, nn_offset::nn_subsampling] 236 | 237 | # project 238 | eye_tensor[0:2] /= d_f 239 | eye_tensor[2, 0:depth.shape[0], 0:depth.shape[1]] = depth 240 | eye_tensor[0] *= eye_tensor[2] 241 | eye_tensor[1] *= eye_tensor[2] 242 | 243 | eye_tensor = torch.from_numpy(eye_tensor).float() 244 | 245 | torch.save(eye_tensor, target_folder + variant + '/eye/' + depth_maps[i][:-10] + '.eye.dat') 246 | 247 | # frames up to split are test images 248 | for i in range(split): 249 | link_frame(i, 'test', pgt_test_poses) 250 | 251 | # all remaining frames are training images 252 | for i in range(split, len(images)): 253 | 
link_frame(i, 'train', pgt_train_poses) 254 | 255 | 256 | Parallel(n_jobs=4, verbose=0)( 257 | map(delayed(process_dataset), ['apt1', 'apt2', 'office1', 'office2'])) 258 | 259 | if opt.depth == 'rendered': 260 | os.chdir('..') 261 | dutil.dlheidata("10.11588/data/N07HKC/OMLKR1", "12scenes_depth.tar.gz") 262 | -------------------------------------------------------------------------------- /datasets/setup_7scenes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import warnings 6 | 7 | import dataset_util as dutil 8 | import numpy as np 9 | import torch 10 | from joblib import Parallel, delayed 11 | from skimage import io 12 | 13 | # name of the folder where we download the original 7scenes dataset to 14 | # we restructure the dataset by creating symbolic links to that folder 15 | src_folder = '7scenes_source' 16 | focal_length = 525.0 17 | 18 | # focal length of the depth sensor (source: https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/) 19 | d_focal_length = 585.0 20 | 21 | # RGB image dimensions 22 | img_w = 640 23 | img_h = 480 24 | 25 | # sub sampling factor of eye coordinate tensor 26 | nn_subsampling = 8 27 | 28 | # transformation from depth sensor to RGB sensor 29 | # calibration according to https://projet.liris.cnrs.fr/voir/activities-dataset/kinect-calibration.html 30 | d_to_rgb = np.array([ 31 | [9.9996518012567637e-01, 2.6765126468950343e-03, -7.9041012313000904e-03, -2.5558943178152542e-02], 32 | [-2.7409311281316700e-03, 9.9996302803027592e-01, -8.1504520778013286e-03, 1.0109636268061706e-04], 33 | [7.8819942130445332e-03, 8.1718328771890631e-03, 9.9993554558014031e-01, 2.0318321729487039e-03], 34 | [0, 0, 0, 1] 35 | ]) 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser( 39 | description='Download and setup the 7Scenes dataset.', 40 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 41 | 42 | parser.add_argument('--depth', type=str, choices=['none', 'rendered', 'calibrated'], default='none', 43 | help='none: ignore depth maps; rendered: download depth rendered using 3D scene model (28GB); calibrated: register original depth maps to RGB') 44 | 45 | parser.add_argument('--eye', type=str, choices=['none', 'calibrated'], default='none', 46 | help='none: ignore eye coordinates; original: calibrate original depth maps and precompute eye coordinates') 47 | 48 | parser.add_argument('--poses', type=str, choices=['original', 'calibrated', 'pgt'], default='calibrated', 49 | help='original: use raw poses of depth sensor; ' 50 | 'calibrated: register poses to RGB sensor; ' 51 | 'pgt: get SfM poses from external repository (Brachmann et al., ICCV21)') 52 | 53 | opt = parser.parse_args() 54 | 55 | print("\n############################################################################") 56 | print("# Please make sure to check this dataset's license before using it! #") 57 | print("# https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/ #") 58 | print("############################################################################\n\n") 59 | 60 | license_response = input('Please confirm with "yes" or abort. ') 61 | if not (license_response == "yes" or license_response == "y"): 62 | print(f"Your response: {license_response}. Aborting.") 63 | exit() 64 | 65 | if opt.poses == 'pgt': 66 | 67 | print("\n###################################################################") 68 | print("# You requested external pose files. 
Please check the license at: #") 69 | print("# https://github.com/tsattler/visloc_pseudo_gt_limitations #") 70 | print("###################################################################\n\n") 71 | 72 | license_response = input('Please confirm with "yes" or abort. ') 73 | if not (license_response == "yes" or license_response == "y"): 74 | print(f"Your response: {license_response}. Aborting.") 75 | exit() 76 | 77 | print("Getting external pose files...") 78 | external_pgt_folder = dutil.clone_external_pose_files() 79 | 80 | # download the original 7 scenes dataset for poses and images 81 | dutil.mkdir(src_folder) 82 | os.chdir(src_folder) 83 | 84 | for ds in ['chess', 'fire', 'heads', 'office', 'pumpkin', 'redkitchen', 'stairs']: 85 | 86 | if not os.path.exists(ds): 87 | 88 | print("=== Downloading 7scenes Data:", ds, "===============================") 89 | 90 | os.system( 91 | 'wget http://download.microsoft.com/download/2/8/5/28564B23-0828-408F-8631-23B1EFF1DAC8/' + ds + '.zip') 92 | os.system('unzip ' + ds + '.zip') 93 | os.system('rm ' + ds + '.zip') 94 | 95 | sequences = os.listdir(ds) 96 | 97 | for file in sequences: 98 | if file.endswith('.zip'): 99 | print("Unpacking", file) 100 | os.system('unzip ' + ds + '/' + file + ' -d ' + ds) 101 | os.system('rm ' + ds + '/' + file) 102 | else: 103 | print(f"Found data of scene {ds} already. Assuming its complete and skipping download.") 104 | 105 | print("Processing frames...") 106 | 107 | 108 | def process_scene(ds): 109 | 110 | if opt.poses == 'pgt': 111 | target_folder = '../pgt_7scenes_' + ds + '/' 112 | else: 113 | target_folder = '../7scenes_' + ds + '/' 114 | 115 | def link_frames(split_file, variant): 116 | 117 | # create subfolders 118 | dutil.mkdir(target_folder + variant + '/rgb/') 119 | dutil.mkdir(target_folder + variant + '/poses/') 120 | dutil.mkdir(target_folder + variant + '/calibration/') 121 | if opt.depth == 'calibrated': 122 | dutil.mkdir(target_folder + variant + '/depth/') 123 | if opt.eye == 'calibrated': 124 | dutil.mkdir(target_folder + variant + '/eye/') 125 | 126 | # read the split file 127 | with open(ds + '/' + split_file, 'r') as f: 128 | split = f.readlines() 129 | # map sequences to folder names 130 | split = ['seq-' + s.strip()[8:].zfill(2) for s in split] 131 | 132 | # read external poses and calibration if requested 133 | if opt.poses == 'pgt': 134 | pgt_file = os.path.join('..', external_pgt_folder, '7scenes', f'{ds}_{variant}.txt') 135 | pgt_poses = dutil.read_pose_data(pgt_file) 136 | 137 | for seq in split: 138 | files = os.listdir(ds + '/' + seq) 139 | 140 | # link images 141 | images = [f for f in files if f.endswith('color.png')] 142 | for img in images: 143 | os.system( 144 | 'ln -s ../../../' + src_folder + '/' + ds + '/' + seq + '/' + img + ' ' + target_folder + variant + '/rgb/' + seq + '-' + img) 145 | 146 | pose_files = [f for f in files if f.endswith('pose.txt')] 147 | 148 | # create pose files 149 | if opt.poses == 'original': 150 | 151 | # link original poses 152 | for p_file in pose_files: 153 | os.system( 154 | 'ln -s ../../../' + src_folder + '/' + ds + '/' + seq + '/' + p_file + ' ' + target_folder + variant + '/poses/' + seq + '-' + p_file) 155 | 156 | elif opt.poses == 'pgt': 157 | 158 | # use poses as externally provided 159 | for p_file in pose_files: 160 | cam_pose, _ = pgt_poses[os.path.join(seq, dutil.get_base_file_name(p_file))] 161 | dutil.write_cam_pose(target_folder + variant + '/poses/' + seq + '-' + p_file, cam_pose) 162 | 163 | else: 164 | 165 | # adjust original camera 
pose files by mapping to RGB sensor 166 | for p_file in pose_files: 167 | # load original pose (aligned to the depth sensor) 168 | cam_pose = np.loadtxt(ds + '/' + seq + '/' + p_file) 169 | 170 | # apply relative transform from depth to RGB sensor 171 | cam_pose = np.matmul(cam_pose, np.linalg.inv(d_to_rgb)) 172 | 173 | # write adjusted pose (aligned to the RGB sensor) 174 | dutil.write_cam_pose(target_folder + variant + '/poses/' + seq + '-' + p_file, cam_pose) 175 | 176 | # create calibration files 177 | if opt.poses == 'pgt': 178 | 179 | for p_file in pose_files: 180 | base_file = dutil.get_base_file_name(p_file) 181 | calib_file = f'{base_file}.calibration.txt' 182 | 183 | _, rgb_f = pgt_poses[os.path.join(seq, base_file)] 184 | dutil.write_focal_length(target_folder + variant + '/calibration/' + seq + '-' + calib_file, 185 | rgb_f) 186 | else: 187 | for i in range(len(images)): 188 | dutil.write_focal_length( 189 | target_folder + variant + '/calibration/%s-frame-%s.calibration.txt' % ( 190 | seq, str(i).zfill(6)), 191 | focal_length) 192 | 193 | if opt.depth != 'calibrated' and opt.eye != 'calibrated': 194 | # no calibration requested, done 195 | continue 196 | 197 | # adjust depth files by mapping to RGB sensor 198 | depth_files = [f for f in files if f.endswith('depth.png')] 199 | 200 | for d_file in depth_files: 201 | 202 | if opt.poses == 'pgt': 203 | # use correct per-frame focal length if provided 204 | _, rgb_f = pgt_poses[os.path.join(seq, dutil.get_base_file_name(d_file))] 205 | else: 206 | # use default focal length as fall back 207 | rgb_f = focal_length 208 | 209 | depth = io.imread(ds + '/' + seq + '/' + d_file) 210 | depth = depth.astype(np.float64) 211 | depth /= 1000 # from millimeters to meters 212 | 213 | d_h = depth.shape[0] 214 | d_w = depth.shape[1] 215 | 216 | # reproject depth map to 3D eye coordinates 217 | eye_coords = np.zeros((4, d_h, d_w)) 218 | # set x and y coordinates 219 | eye_coords[0] = np.dstack([np.arange(0, d_w)] * d_h)[0].T 220 | eye_coords[1] = np.dstack([np.arange(0, d_h)] * d_w)[0] 221 | 222 | eye_coords = eye_coords.reshape(4, -1) 223 | depth = depth.reshape(-1) 224 | 225 | # filter pixels with invalid depth 226 | depth_mask = (depth > 0) & (depth < 100) 227 | eye_coords = eye_coords[:, depth_mask] 228 | depth = depth[depth_mask] 229 | 230 | # substract depth principal point (assume image center) 231 | eye_coords[0] -= d_w / 2 232 | eye_coords[1] -= d_h / 2 233 | # reproject 234 | eye_coords[0:2] /= d_focal_length 235 | eye_coords[0] *= depth 236 | eye_coords[1] *= depth 237 | eye_coords[2] = depth 238 | eye_coords[3] = 1 239 | 240 | # transform from depth sensor to RGB sensor 241 | eye_coords = np.matmul(d_to_rgb, eye_coords) 242 | 243 | # project 244 | depth = eye_coords[2] 245 | 246 | eye_coords[0] /= depth 247 | eye_coords[1] /= depth 248 | eye_coords[0:2] *= rgb_f 249 | 250 | # add RGB principal point (assume image center) 251 | eye_coords[0] += img_w / 2 252 | eye_coords[1] += img_h / 2 253 | 254 | registered_depth = np.zeros((img_h, img_w), dtype='uint16') 255 | 256 | for pt in range(eye_coords.shape[1]): 257 | x = round(eye_coords[0, pt]) 258 | y = round(eye_coords[1, pt]) 259 | d = eye_coords[2, pt] 260 | 261 | registered_depth[y, x] = d * 1000 262 | 263 | if opt.depth == 'calibrated': 264 | # store calibrated depth 265 | with warnings.catch_warnings(): 266 | warnings.simplefilter("ignore") 267 | io.imsave(target_folder + variant + '/depth/' + seq + '-' + d_file, registered_depth) 268 | 269 | if opt.eye != 'calibrated': 270 | # done 
271 | continue 272 | 273 | # generate sub-sampled eye coordinate tensor from calibrated depth 274 | out_h = int(img_h / nn_subsampling) 275 | out_w = int(img_w / nn_subsampling) 276 | nn_offset = int(nn_subsampling / 2) 277 | 278 | eye_tensor = np.zeros((3, out_h, out_w)) 279 | 280 | # generate pixel coordinates 281 | eye_tensor[0] = np.dstack([np.arange(0, out_w)] * out_h)[0].T * nn_subsampling + nn_offset 282 | eye_tensor[1] = np.dstack([np.arange(0, out_h)] * out_w)[0] * nn_subsampling + nn_offset 283 | 284 | # substract principal point 285 | eye_tensor[0] -= img_w / 2 286 | eye_tensor[1] -= img_h / 2 287 | 288 | # prepare depth (convert to meters and subsample) 289 | registered_depth = registered_depth.astype(float) 290 | registered_depth /= 1000 291 | registered_depth = registered_depth[nn_offset::nn_subsampling, nn_offset::nn_subsampling] 292 | 293 | # project 294 | eye_tensor[0:2] /= rgb_f 295 | eye_tensor[2, 0:registered_depth.shape[0], 0:registered_depth.shape[1]] = registered_depth 296 | eye_tensor[0] *= eye_tensor[2] 297 | eye_tensor[1] *= eye_tensor[2] 298 | 299 | eye_tensor = torch.from_numpy(eye_tensor).float() 300 | 301 | torch.save(eye_tensor, 302 | target_folder + variant + '/eye/' + seq + '-' + d_file[:-10] + '.eye.dat') 303 | 304 | link_frames('TrainSplit.txt', 'train') 305 | link_frames('TestSplit.txt', 'test') 306 | 307 | 308 | Parallel(n_jobs=7, verbose=0)( 309 | map(delayed(process_scene), ['chess', 'fire', 'heads', 'office', 'pumpkin', 'redkitchen', 'stairs'])) 310 | 311 | if opt.depth == 'rendered': 312 | os.chdir('..') 313 | dutil.dlheidata("10.11588/data/N07HKC/4PLEEJ", "7scenes_depth.tar.gz") 314 | -------------------------------------------------------------------------------- /datasets/setup_cambridge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import math 5 | import os 6 | 7 | import cv2 as cv 8 | import dataset_util as dutil 9 | import numpy as np 10 | import torch 11 | from skimage import io 12 | 13 | # setup individual scene IDs and their download location 14 | scenes = [ 15 | 'https://www.repository.cam.ac.uk/bitstream/handle/1810/251342/KingsCollege.zip', 16 | 'https://www.repository.cam.ac.uk/bitstream/handle/1810/251340/OldHospital.zip', 17 | 'https://www.repository.cam.ac.uk/bitstream/handle/1810/251336/ShopFacade.zip', 18 | 'https://www.repository.cam.ac.uk/bitstream/handle/1810/251294/StMarysChurch.zip', 19 | 'https://www.repository.cam.ac.uk/bitstream/handle/1810/251291/GreatCourt.zip', 20 | ] 21 | 22 | target_height = 480 # rescale images 23 | nn_subsampling = 8 # sub sampling of our CNN architecture, for size of the initalization targets 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser( 27 | description='Download and setup the Cambridge dataset.', 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 29 | 30 | parser.add_argument('--init', type=str, choices=['none', 'sfm'], default='none', 31 | help='none: no initialisation targets for scene coordinates; sfm: scene coordinate targets by rendering the SfM point cloud') 32 | 33 | opt = parser.parse_args() 34 | 35 | print("\n###############################################################################") 36 | print("# Please make sure to check this dataset's license before using it! 
#") 37 | print("# https://www.repository.cam.ac.uk/items/53788265-cb98-42ee-b85b-7a0cbc8eddb3 #") 38 | print("###############################################################################\n\n") 39 | 40 | license_response = input('Please confirm with "yes" or abort. ') 41 | if not (license_response == "yes" or license_response == "y"): 42 | print(f"Your response: {license_response}. Aborting.") 43 | exit() 44 | 45 | for scene in scenes: 46 | 47 | scene_file = scene.split('/')[-1] 48 | scene_name = scene_file[:-4] 49 | 50 | print("===== Processing " + scene_name + " ===================") 51 | 52 | print("Downloading and unzipping data...") 53 | os.system('wget ' + scene) 54 | os.system('unzip ' + scene_file) 55 | os.system('rm ' + scene_file) 56 | os.system('mv ' + scene_name + ' Cambridge_' + scene_name) 57 | os.chdir('Cambridge_' + scene_name) 58 | 59 | modes = ['train', 'test'] 60 | input_file = 'reconstruction.nvm' 61 | 62 | print("Loading SfM reconstruction...") 63 | 64 | f = open(input_file) 65 | reconstruction = f.readlines() 66 | f.close() 67 | 68 | num_cams = int(reconstruction[2]) 69 | num_pts = int(reconstruction[num_cams + 4]) 70 | 71 | if opt.init == 'sfm': 72 | 73 | # read points 74 | pts_dict = {} 75 | for cam_idx in range(0, num_cams): 76 | pts_dict[cam_idx] = [] 77 | 78 | pt = pts_start = num_cams + 5 79 | pts_end = pts_start + num_pts 80 | 81 | while pt < pts_end: 82 | 83 | pt_list = reconstruction[pt].split() 84 | pt_3D = [float(x) for x in pt_list[0:3]] 85 | pt_3D.append(1.0) 86 | 87 | for pt_view in range(0, int(pt_list[6])): 88 | cam_view = int(pt_list[7 + pt_view * 4]) 89 | pts_dict[cam_view].append(pt_3D) 90 | 91 | pt += 1 92 | 93 | print("Reconstruction contains %d cameras and %d 3D points." % (num_cams, num_pts)) 94 | 95 | for mode in modes: 96 | 97 | print("Converting " + mode + " data...") 98 | 99 | img_output_folder = mode + '/rgb/' 100 | cal_output_folder = mode + '/calibration/' 101 | pose_output_folder = mode + '/poses/' 102 | 103 | dutil.mkdir(img_output_folder) 104 | dutil.mkdir(cal_output_folder) 105 | dutil.mkdir(pose_output_folder) 106 | 107 | if opt.init != 'none': 108 | target_output_folder = mode + '/init/' 109 | dutil.mkdir(target_output_folder) 110 | 111 | # get list of images for current mode (train vs. test) 112 | image_list = 'dataset_' + mode + '.txt' 113 | 114 | f = open(image_list) 115 | camera_list = f.readlines() 116 | f.close() 117 | camera_list = camera_list[3:] 118 | 119 | image_list = [camera.split()[0] for camera in camera_list] 120 | 121 | for cam_idx in range(num_cams): 122 | 123 | print("Processing camera %d of %d." % (cam_idx, num_cams)) 124 | image_file = reconstruction[3 + cam_idx].split()[0] 125 | image_file = image_file[:-3] + 'png' 126 | 127 | if image_file not in image_list: 128 | print("Skipping image " + image_file + ". 
Not part of set: " + mode + ".") 129 | continue 130 | 131 | image_idx = image_list.index(image_file) 132 | 133 | # read camera 134 | camera = camera_list[image_idx].split() 135 | cam_rot = [float(r) for r in camera[4:]] 136 | 137 | # quaternion to axis-angle 138 | angle = 2 * math.acos(cam_rot[0]) 139 | x = cam_rot[1] / math.sqrt(1 - cam_rot[0] ** 2) 140 | y = cam_rot[2] / math.sqrt(1 - cam_rot[0] ** 2) 141 | z = cam_rot[3] / math.sqrt(1 - cam_rot[0] ** 2) 142 | 143 | cam_rot = [x * angle, y * angle, z * angle] 144 | 145 | cam_rot = np.asarray(cam_rot) 146 | cam_rot, _ = cv.Rodrigues(cam_rot) 147 | 148 | cam_trans = [float(r) for r in camera[1:4]] 149 | cam_trans = np.asarray([cam_trans]) 150 | cam_trans = np.transpose(cam_trans) 151 | cam_trans = - np.matmul(cam_rot, cam_trans) 152 | 153 | if np.absolute(cam_trans).max() > 10000: 154 | print("Skipping image " + image_file + ". Extremely large translation. Outlier?") 155 | print(cam_trans) 156 | continue 157 | 158 | cam_pose = np.concatenate((cam_rot, cam_trans), axis=1) 159 | cam_pose = np.concatenate((cam_pose, [[0, 0, 0, 1]]), axis=0) 160 | cam_pose = torch.tensor(cam_pose).float() 161 | 162 | focal_length = float(reconstruction[3 + cam_idx].split()[1]) 163 | 164 | # load image 165 | image = io.imread(image_file) 166 | image_file = image_file.replace('/', '_') 167 | 168 | img_aspect = image.shape[0] / image.shape[1] 169 | 170 | if img_aspect > 1: 171 | # portrait 172 | img_w = target_height 173 | img_h = int(math.ceil(target_height * img_aspect)) 174 | else: 175 | # landscape 176 | img_w = int(math.ceil(target_height / img_aspect)) 177 | img_h = target_height 178 | 179 | out_w = int(math.ceil(img_w / nn_subsampling)) 180 | out_h = int(math.ceil(img_h / nn_subsampling)) 181 | 182 | out_scale = out_w / image.shape[1] 183 | img_scale = img_w / image.shape[1] 184 | 185 | image = cv.resize(image, (img_w, img_h)) 186 | io.imsave(img_output_folder + image_file, image) 187 | 188 | with open(cal_output_folder + image_file[:-3] + 'txt', 'w') as f: 189 | f.write(str(focal_length * img_scale)) 190 | 191 | inv_cam_pose = cam_pose.inverse() 192 | 193 | with open(pose_output_folder + image_file[:-3] + 'txt', 'w') as f: 194 | f.write(str(float(inv_cam_pose[0, 0])) + ' ' + str(float(inv_cam_pose[0, 1])) + ' ' + str( 195 | float(inv_cam_pose[0, 2])) + ' ' + str(float(inv_cam_pose[0, 3])) + '\n') 196 | f.write(str(float(inv_cam_pose[1, 0])) + ' ' + str(float(inv_cam_pose[1, 1])) + ' ' + str( 197 | float(inv_cam_pose[1, 2])) + ' ' + str(float(inv_cam_pose[1, 3])) + '\n') 198 | f.write(str(float(inv_cam_pose[2, 0])) + ' ' + str(float(inv_cam_pose[2, 1])) + ' ' + str( 199 | float(inv_cam_pose[2, 2])) + ' ' + str(float(inv_cam_pose[2, 3])) + '\n') 200 | f.write(str(float(inv_cam_pose[3, 0])) + ' ' + str(float(inv_cam_pose[3, 1])) + ' ' + str( 201 | float(inv_cam_pose[3, 2])) + ' ' + str(float(inv_cam_pose[3, 3])) + '\n') 202 | 203 | if opt.init == 'sfm': 204 | 205 | # load 3D points from reconstruction 206 | pts_3D = torch.tensor(pts_dict[cam_idx]) 207 | 208 | out_tensor = torch.zeros((3, out_h, out_w)) 209 | out_zbuffer = torch.zeros((out_h, out_w)) 210 | 211 | fine = 0 212 | conflict = 0 213 | 214 | for pt_idx in range(0, pts_3D.size(0)): 215 | 216 | scene_pt = pts_3D[pt_idx] 217 | scene_pt = scene_pt.unsqueeze(0) 218 | scene_pt = scene_pt.transpose(0, 1) 219 | 220 | # scene to camera coordinates 221 | cam_pt = torch.mm(cam_pose, scene_pt) 222 | # projection to image 223 | img_pt = cam_pt[0:2, 0] * focal_length / cam_pt[2, 0] * out_scale 224 | 225 | y = 
img_pt[1] + out_h / 2 226 | x = img_pt[0] + out_w / 2 227 | 228 | x = int(torch.clamp(x, min=0, max=out_tensor.size(2) - 1)) 229 | y = int(torch.clamp(y, min=0, max=out_tensor.size(1) - 1)) 230 | 231 | if cam_pt[2, 0] > 1000: # filter some outlier points (large depth) 232 | continue 233 | 234 | if out_zbuffer[y, x] == 0 or out_zbuffer[y, x] > cam_pt[2, 0]: 235 | out_zbuffer[y, x] = cam_pt[2, 0] 236 | out_tensor[:, y, x] = pts_3D[pt_idx, 0:3] 237 | 238 | torch.save(out_tensor, target_output_folder + image_file[:-4] + '.dat') 239 | 240 | os.chdir('..') 241 | -------------------------------------------------------------------------------- /datasets/setup_wayspots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | 5 | if __name__ == '__main__': 6 | dataset_url = 'https://storage.googleapis.com/niantic-lon-static/research/ace/wayspots.tar.gz' 7 | dataset_file = dataset_url.split('/')[-1] 8 | 9 | print("\n#####################################################################") 10 | print("# Please make sure to check this dataset's license before using it! #") 11 | print("# https://nianticlabs.github.io/ace/wayspots.html #") 12 | print("#####################################################################\n\n") 13 | 14 | license_response = input('Please confirm with "yes" or abort. ') 15 | if not (license_response == "yes" or license_response == "y"): 16 | print(f"Your response: {license_response}. Aborting.") 17 | exit() 18 | 19 | print("Downloading and unzipping data...") 20 | os.system('wget ' + dataset_url) 21 | os.system('tar -xzf ' + dataset_file) 22 | os.system('rm ' + dataset_file) 23 | print("Done.") 24 | -------------------------------------------------------------------------------- /dsacstar/dsacstar_loss.h: -------------------------------------------------------------------------------- 1 | /* 2 | Based on the DSAC++ and ESAC code. 3 | https://github.com/vislearn/LessMore 4 | https://github.com/vislearn/esac 5 | 6 | Copyright (c) 2016, TU Dresden 7 | Copyright (c) 2020, Heidelberg University 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | * Redistributions of source code must retain the above copyright 13 | notice, this list of conditions and the following disclaimer. 14 | * Redistributions in binary form must reproduce the above copyright 15 | notice, this list of conditions and the following disclaimer in the 16 | documentation and/or other materials provided with the distribution. 17 | * Neither the name of the TU Dresden, Heidelberg University nor the 18 | names of its contributors may be used to endorse or promote products 19 | derived from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. 
IN NO EVENT SHALL TU DRESDEN OR HEIDELBERG UNIVERSITY BE LIABLE FOR ANY 25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | #pragma once 34 | 35 | #define MAXLOSS 10000000.0 // clamp for stability 36 | 37 | namespace dsacstar 38 | { 39 | /** 40 | * @brief Calculates the rotational distance in degree between two transformations. 41 | * Translation will be ignored. 42 | * 43 | * @param trans1 Transformation 1. 44 | * @param trans2 Transformation 2. 45 | * @return Angle in degree. 46 | */ 47 | double calcAngularDistance(const dsacstar::trans_t& trans1, const dsacstar::trans_t& trans2) 48 | { 49 | cv::Mat rot1 = trans1.colRange(0, 3).rowRange(0, 3); 50 | cv::Mat rot2 = trans2.colRange(0, 3).rowRange(0, 3); 51 | 52 | cv::Mat rotDiff= rot2 * rot1.t(); 53 | double trace = cv::trace(rotDiff)[0]; 54 | 55 | trace = std::min(3.0, std::max(-1.0, trace)); 56 | return 180*acos((trace-1.0)/2.0)/PI; 57 | } 58 | 59 | /** 60 | * @brief Weighted average of translational error and rotational error between two pose hypothesis. 61 | * @param h1 Pose 1. 62 | * @param h2 Pose 2. 63 | * @param wRot Weight of rotation error. 64 | * @param wTrans Weight of translation error. 65 | * @param cut Apply soft clamping after this value. 66 | * @return Loss. 67 | */ 68 | double loss( 69 | const dsacstar::trans_t& trans1, 70 | const dsacstar::trans_t& trans2, 71 | double wRot = 1.0, 72 | double wTrans = 1.0, 73 | double cut = 100) 74 | { 75 | double rotErr = dsacstar::calcAngularDistance(trans1, trans2); 76 | double tErr = cv::norm( 77 | trans1.col(3).rowRange(0, 3) - trans2.col(3).rowRange(0, 3)); 78 | 79 | double loss = wRot * rotErr + wTrans * tErr; 80 | 81 | if(loss > cut) 82 | loss = std::sqrt(cut * loss); 83 | 84 | return std::min(loss, MAXLOSS); 85 | } 86 | 87 | /** 88 | * @brief Calculate the derivative of the loss w.r.t. the estimated pose. 89 | * @param est Estimated pose (6 DoF). 90 | * @param gt Ground truth pose (6 DoF). 91 | * @param wRot Weight of rotation error. 92 | * @param wTrans Weight of translation error. 93 | * @param cut Apply soft clamping after this value. 94 | * @return 1x6 Jacobean. 
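 *
 * Note: as used in the implementation below, columns 0-2 of the returned Jacobian hold the
 * derivatives w.r.t. the axis-angle rotation parameters and columns 3-5 the derivatives
 * w.r.t. the translation parameters (cf. colRange(0, 3) and colRange(3, 6)).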
95 | */ 96 | cv::Mat_ dLoss( 97 | const dsacstar::pose_t& est, 98 | const dsacstar::pose_t& gt, 99 | double wRot = 1.0, 100 | double wTrans = 1.0, 101 | double cut = 100) 102 | { 103 | cv::Mat rot1, rot2, dRod; 104 | cv::Rodrigues(est.first, rot1, dRod); 105 | cv::Rodrigues(gt.first, rot2); 106 | 107 | // measure loss of inverted poses (camera pose instead of scene pose) 108 | cv::Mat_ invRot1 = rot1.t(); 109 | cv::Mat_ invRot2 = rot2.t(); 110 | 111 | // get the difference rotation 112 | cv::Mat diffRot = rot1 * invRot2; 113 | 114 | // calculate rotational and translational error 115 | double trace = cv::trace(diffRot)[0]; 116 | trace = std::min(3.0, std::max(-1.0, trace)); 117 | double rotErr = 180*acos((trace-1.0)/2.0)/CV_PI; 118 | 119 | cv::Mat_ invT1 = est.second.clone(); 120 | invT1 = invRot1 * invT1; 121 | 122 | cv::Mat_ invT2 = gt.second.clone(); 123 | invT2 = invRot2 * invT2; 124 | 125 | // zero error, abort 126 | double tErr = cv::norm(invT1 - invT2); 127 | 128 | cv::Mat_ jacobean = cv::Mat_::zeros(1, 6); 129 | 130 | // clamped loss, return zero gradient if loss is bigger than threshold 131 | double loss = wRot * rotErr + wTrans * tErr; 132 | bool cutLoss = false; 133 | 134 | 135 | if(loss > cut) 136 | { 137 | loss = std::sqrt(loss); 138 | cutLoss = true; 139 | } 140 | 141 | if(loss > MAXLOSS) 142 | return jacobean; 143 | 144 | if((tErr + rotErr) < EPS) 145 | return jacobean; 146 | 147 | 148 | // return gradient of translational error 149 | cv::Mat_ dDist_dInvT1(1, 3); 150 | for(unsigned i = 0; i < 3; i++) 151 | dDist_dInvT1(0, i) = (invT1(i, 0) - invT2(i, 0)) / tErr; 152 | 153 | cv::Mat_ dInvT1_dEstT(3, 3); 154 | dInvT1_dEstT = invRot1; 155 | 156 | cv::Mat_ dDist_dEstT = dDist_dInvT1 * dInvT1_dEstT; 157 | jacobean.colRange(3, 6) += dDist_dEstT * wTrans; 158 | 159 | cv::Mat_ dInvT1_dInvRot1 = cv::Mat_::zeros(3, 9); 160 | 161 | dInvT1_dInvRot1(0, 0) = est.second.at(0, 0); 162 | dInvT1_dInvRot1(0, 3) = est.second.at(1, 0); 163 | dInvT1_dInvRot1(0, 6) = est.second.at(2, 0); 164 | 165 | dInvT1_dInvRot1(1, 1) = est.second.at(0, 0); 166 | dInvT1_dInvRot1(1, 4) = est.second.at(1, 0); 167 | dInvT1_dInvRot1(1, 7) = est.second.at(2, 0); 168 | 169 | dInvT1_dInvRot1(2, 2) = est.second.at(0, 0); 170 | dInvT1_dInvRot1(2, 5) = est.second.at(1, 0); 171 | dInvT1_dInvRot1(2, 8) = est.second.at(2, 0); 172 | 173 | dRod = dRod.t(); 174 | 175 | cv::Mat_ dDist_dRod = dDist_dInvT1 * dInvT1_dInvRot1 * dRod; 176 | jacobean.colRange(0, 3) += dDist_dRod * wTrans; 177 | 178 | 179 | // return gradient of rotational error 180 | cv::Mat_ dRotDiff = cv::Mat_::zeros(9, 9); 181 | invRot2.row(0).copyTo(dRotDiff.row(0).colRange(0, 3)); 182 | invRot2.row(1).copyTo(dRotDiff.row(1).colRange(0, 3)); 183 | invRot2.row(2).copyTo(dRotDiff.row(2).colRange(0, 3)); 184 | 185 | invRot2.row(0).copyTo(dRotDiff.row(3).colRange(3, 6)); 186 | invRot2.row(1).copyTo(dRotDiff.row(4).colRange(3, 6)); 187 | invRot2.row(2).copyTo(dRotDiff.row(5).colRange(3, 6)); 188 | 189 | invRot2.row(0).copyTo(dRotDiff.row(6).colRange(6, 9)); 190 | invRot2.row(1).copyTo(dRotDiff.row(7).colRange(6, 9)); 191 | invRot2.row(2).copyTo(dRotDiff.row(8).colRange(6, 9)); 192 | 193 | dRotDiff = dRotDiff.t(); 194 | 195 | cv::Mat_ dTrace = cv::Mat_::zeros(1, 9); 196 | dTrace(0, 0) = 1; 197 | dTrace(0, 4) = 1; 198 | dTrace(0, 8) = 1; 199 | 200 | cv::Mat_ dAngle = (180 / CV_PI * -1 / sqrt(3 - trace * trace + 2 * trace)) * dTrace * dRotDiff * dRod; 201 | 202 | jacobean.colRange(0, 3) += dAngle * wRot; 203 | 204 | if(cutLoss) 205 | jacobean *= 0.5 / loss; 206 | 207 | 
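// The scaling by 0.5 / loss above is the chain rule through the soft clamp: once the loss
// exceeds the cut threshold it is replaced by its square root, and d sqrt(x)/dx = 1 / (2 sqrt(x)).
// The check below returns a zero Jacobian if any entry turned into NaN.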
208 | if(cv::sum(cv::Mat(jacobean != jacobean))[0] > 0) //check for NaNs 209 | return cv::Mat_<double>::zeros(1, 6); 210 | 211 | return jacobean; 212 | } 213 | 214 | 215 | } 216 | -------------------------------------------------------------------------------- /dsacstar/dsacstar_types.h: -------------------------------------------------------------------------------- 1 | /* 2 | Based on the DSAC++ and ESAC code. 3 | https://github.com/vislearn/LessMore 4 | https://github.com/vislearn/esac 5 | 6 | Copyright (c) 2016, TU Dresden 7 | Copyright (c) 2020, Heidelberg University 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | * Redistributions of source code must retain the above copyright 13 | notice, this list of conditions and the following disclaimer. 14 | * Redistributions in binary form must reproduce the above copyright 15 | notice, this list of conditions and the following disclaimer in the 16 | documentation and/or other materials provided with the distribution. 17 | * Neither the name of the TU Dresden, Heidelberg University nor the 18 | names of its contributors may be used to endorse or promote products 19 | derived from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL TU DRESDEN OR HEIDELBERG UNIVERSITY BE LIABLE FOR ANY 25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include "opencv2/opencv.hpp" 36 | 37 | /** Several important types used throughout all this code. If types have to be changed, it can be done here, conveniently. */ 38 | 39 | namespace dsacstar 40 | { 41 | // scene pose type (OpenCV convention: axis-angle + translation) 42 | typedef std::pair<cv::Mat, cv::Mat> pose_t; 43 | // camera transformation type (inverted scene pose as 4x4 matrix) 44 | typedef cv::Mat_<double> trans_t; 45 | // ATen accessor type 46 | typedef at::TensorAccessor<float, 3> coord_t; 47 | } 48 | -------------------------------------------------------------------------------- /dsacstar/dsacstar_util_rgbd.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018-2020, Heidelberg University 3 | 4 | This code fragment contains adapted sources from PyTorch. See license agreement of PyTorch below. 
5 | 6 | From PyTorch: 7 | 8 | Copyright (c) 2016- Facebook, Inc (Adam Paszke) 9 | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 10 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 11 | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 12 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 13 | Copyright (c) 2011-2013 NYU (Clement Farabet) 14 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 15 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 16 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 17 | 18 | From Caffe2: 19 | 20 | Copyright (c) 2016-present, Facebook Inc. All rights reserved. 21 | 22 | All contributions by Facebook: 23 | Copyright (c) 2016 Facebook Inc. 24 | 25 | All contributions by Google: 26 | Copyright (c) 2015 Google Inc. 27 | All rights reserved. 28 | 29 | All contributions by Yangqing Jia: 30 | Copyright (c) 2015 Yangqing Jia 31 | All rights reserved. 32 | 33 | All contributions from Caffe: 34 | Copyright(c) 2013, 2014, 2015, the respective contributors 35 | All rights reserved. 36 | 37 | All other contributions: 38 | Copyright(c) 2015, 2016 the respective contributors 39 | All rights reserved. 40 | 41 | Caffe2 uses a copyright model similar to Caffe: each contributor holds 42 | copyright over their contributions to Caffe2. The project versioning records 43 | all such contribution and copyright details. If a contributor wants to further 44 | mark their specific copyright on a particular contribution, they should 45 | indicate their copyright solely in the commit message of the change when it is 46 | committed. 47 | 48 | All rights reserved. 49 | 50 | Redistribution and use in source and binary forms, with or without 51 | modification, are permitted provided that the following conditions are met: 52 | 53 | 1. Redistributions of source code must retain the above copyright 54 | notice, this list of conditions and the following disclaimer. 55 | 56 | 2. Redistributions in binary form must reproduce the above copyright 57 | notice, this list of conditions and the following disclaimer in the 58 | documentation and/or other materials provided with the distribution. 59 | 60 | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America 61 | Heidelberg University and IDIAP Research Institute nor the names of its contributors may be 62 | used to endorse or promote products derived from this software without 63 | specific prior written permission. 64 | 65 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 69 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 75 | POSSIBILITY OF SUCH DAMAGE. 
76 | 77 | */ 78 | 79 | #pragma once 80 | 81 | 82 | //#include "opencv2/core/types_c.h" 83 | //#include "opencv2/calib3d/calib3d_c.h" 84 | 85 | /* 86 | * @brief reimplementation of PyTorch svd_backward in C++ 87 | * 88 | * ref: https://github.com/pytorch/pytorch/blob/1d427fd6f66b0822db62f30e7654cae95abfd207/tools/autograd/templates/Functions.cpp 89 | * ref: https://j-towns.github.io/papers/svd-derivative.pdf 90 | * 91 | * This makes no assumption on the signs of sigma. 92 | * 93 | * @param grad: gradients w.r.t U, W, V 94 | * @param self: matrix decomposed by SVD 95 | * @param raw_u: U from SVD output 96 | * @param sigma: W from SVD output 97 | * @param raw_v: V from SVD output 98 | * 99 | */ 100 | 101 | cv::Mat svd_backward( 102 | const std::vector &grads, 103 | const cv::Mat& self, 104 | const cv::Mat& raw_u, 105 | const cv::Mat& sigma, 106 | const cv::Mat& raw_v) 107 | { 108 | auto m = self.rows; 109 | auto n = self.cols; 110 | auto k = sigma.cols; 111 | auto gsigma = grads[1]; 112 | 113 | auto u = raw_u; 114 | auto v = raw_v; 115 | auto gu = grads[0]; 116 | auto gv = grads[2]; 117 | 118 | auto vt = v.t(); 119 | 120 | cv::Mat sigma_term; 121 | 122 | if (!gsigma.empty()) 123 | { 124 | sigma_term = (u * cv::Mat::diag(gsigma)) * vt; 125 | } 126 | else 127 | { 128 | sigma_term = cv::Mat::zeros(self.size(), self.type()); 129 | } 130 | // in case that there are no gu and gv, we can avoid the series of kernel 131 | // calls below 132 | if (gv.empty() && gu.empty()) 133 | { 134 | return sigma_term; 135 | } 136 | 137 | auto ut = u.t(); 138 | auto im = cv::Mat::eye((int)m, (int)m, self.type()); 139 | auto in = cv::Mat::eye((int)n, (int)n, self.type()); 140 | auto sigma_mat = cv::Mat::diag(sigma); 141 | 142 | cv::Mat sigma_mat_inv; 143 | cv::pow(sigma, -1, sigma_mat_inv); 144 | sigma_mat_inv = cv::Mat::diag(sigma_mat_inv); 145 | 146 | cv::Mat sigma_sq, sigma_expanded_sq; 147 | cv::pow(sigma, 2, sigma_sq); 148 | sigma_expanded_sq = cv::repeat(sigma_sq, sigma_mat.rows, 1); 149 | 150 | cv::Mat F = sigma_expanded_sq - sigma_expanded_sq.t(); 151 | // The following two lines invert values of F, and fills the diagonal with 0s. 152 | // Notice that F currently has 0s on diagonal. So we fill diagonal with +inf 153 | // first to prevent nan from appearing in backward of this function. 154 | F.diag().setTo(std::numeric_limits::max()); 155 | cv::pow(F, -1, F); 156 | 157 | cv::Mat u_term, v_term; 158 | 159 | if (!gu.empty()) 160 | { 161 | cv::multiply(F, ut*gu-gu.t()*u, u_term); 162 | u_term = (u * u_term) * sigma_mat; 163 | if (m > k) 164 | { 165 | u_term = u_term + ((im - u*ut)*gu)*sigma_mat_inv; 166 | } 167 | u_term = u_term * vt; 168 | } 169 | else 170 | { 171 | u_term = cv::Mat::zeros(self.size(), self.type()); 172 | } 173 | 174 | if (!gv.empty()) 175 | { 176 | auto gvt = gv.t(); 177 | cv::multiply(F, vt*gv - gvt*v, v_term); 178 | v_term = (sigma_mat*v_term) * vt; 179 | if (n > k) 180 | { 181 | v_term = v_term + sigma_mat_inv*(gvt*(in - v*vt)); 182 | } 183 | v_term = u * v_term; 184 | } 185 | else 186 | { 187 | v_term = cv::Mat::zeros(self.size(), self.type()); 188 | } 189 | 190 | return u_term + sigma_term + v_term; 191 | } 192 | 193 | /* 194 | * @brief Compute partial derivatives of the matrix product for each multiplied matrix. 195 | * This wrapper function avoids unnecessary computation 196 | * 197 | * @param _Amat: First multiplied matrix. 198 | * @param _Bmat: Second multiplied matrix. 199 | * @param _dABdA Output parameter: First output derivative matrix. Pass cv::noArray() if not needed. 
200 | * @param _dABdB Output parameter: Second output derivative matrix. Pass cv::noArray() if not needed. 201 | * 202 | */ 203 | void matMulDerivWrapper( 204 | cv::InputArray _Amat, 205 | cv::InputArray _Bmat, 206 | cv::OutputArray _dABdA, 207 | cv::OutputArray _dABdB) 208 | { 209 | cv::Mat A = _Amat.getMat(), B = _Bmat.getMat(); 210 | 211 | if (_dABdA.needed()) 212 | { 213 | _dABdA.create(A.rows*B.cols, A.rows*A.cols, A.type()); 214 | } 215 | 216 | if (_dABdB.needed()) 217 | { 218 | _dABdB.create(A.rows*B.cols, B.rows*B.cols, A.type()); 219 | } 220 | 221 | CvMat matA = A, matB = B, c_dABdA=_dABdA.getMat(), c_dABdB=_dABdB.getMat(); 222 | cvCalcMatMulDeriv(&matA, &matB, _dABdA.needed() ? &c_dABdA : 0, _dABdB.needed() ? &c_dABdB : 0); 223 | } 224 | 225 | /* 226 | * @brief Computes extrinsic camera parameters using Kabsch algorithm. 227 | * If jacobean matrix is passed as argument, it further computes the analytical gradients. 228 | * 229 | * @param imgdPts: measurements 230 | * @param objPts: scene points 231 | * @param extCam Output parameter: extrinsic camera matrix (i.e. rotation vector and translation vector) 232 | * @param _jacobean Output parameter: 6x3N jacobean matrix of rotation and translation vector 233 | * w.r.t scene point coordinates. 234 | * If gradient computation is not successful, jacobean matrix is set empty. 235 | * 236 | */ 237 | void kabsch( 238 | const std::vector& imgdPts, 239 | const std::vector& objPts, 240 | dsacstar::pose_t& extCam, 241 | cv::OutputArray _jacobean=cv::noArray()) 242 | { 243 | unsigned int N = objPts.size(); //number of scene points 244 | bool calc = _jacobean.needed(); //check if computation of gradient is required 245 | bool degenerate = false; //indicate if SVD gives degenerate case, i.e. non-distinct or zero singular values 246 | 247 | cv::Mat P, X, Pc, Xc; //Nx3 248 | cv::Mat A, U, W, Vt, V, D, R; //3x3 249 | cv::Mat cx, cp, r, t; //1x3 250 | cv::Mat invN; //1xN 251 | cv::Mat gRodr; //9x3 252 | 253 | // construct the datasets P and X from input vectors, set false to avoid data copying 254 | P = cv::Mat(imgdPts, false).reshape(1, N); 255 | X = cv::Mat(objPts, false).reshape(1, N); 256 | 257 | // compute centroid as average of each coordinate axis 258 | invN = cv::Mat(1, N, CV_32F, 1.f/N); //average filter 259 | cx = invN * X; 260 | cp = invN * P; 261 | 262 | // move centroid of datasets to origin 263 | Xc = X - cv::repeat(cx, N, 1); 264 | Pc = P - cv::repeat(cp, N, 1); 265 | 266 | // compute covariance matrix 267 | A = Pc.t() * Xc; 268 | 269 | // compute SVD of covariance matrix 270 | cv::SVD::compute(A, W, U, Vt); 271 | 272 | // degenerate if any singular value is zero 273 | if ((unsigned int)cv::countNonZero(W) != (unsigned int)W.total()) 274 | degenerate = true; 275 | 276 | // degenerate if singular values are not distinct 277 | if (std::abs(W.at(0,0)-W.at(1,0)) < 1e-6 278 | || std::abs(W.at(0,0)-W.at(2,0)) < 1e-6 279 | || std::abs(W.at(1,0)-W.at(2,0)) < 1e-6) 280 | degenerate = true; 281 | 282 | // for correcting rotation matrix to ensure a right-handed coordinate system 283 | float d = cv::determinant(U * Vt); 284 | 285 | D = (cv::Mat_(3,3) << 286 | 1., 0., 0., 287 | 0., 1., 0., 288 | 0., 0., d ); 289 | 290 | // calculates rotation matrix R 291 | R = U * (D * Vt); 292 | 293 | // convert rotation matrix to rotation vector, 294 | // if needed, also compute jacobean matrix of rotation matrix w.r.t rotation vector 295 | calc ? 
cv::Rodrigues(R, r, gRodr) : cv::Rodrigues(R, r); 296 | 297 | // calculates translation vector 298 | t = cp - cx * R.t(); //equiv: cp - (R*cx.t()).t(); 299 | 300 | // store results 301 | extCam.first = cv::Mat_(r.reshape(1, 3)); 302 | extCam.second = cv::Mat_(t.reshape(1, 3)); 303 | 304 | // end here no gradient is required 305 | if (!calc) 306 | return; 307 | 308 | // if SVD is degenerate, return empty jacobean matrix 309 | if (degenerate) 310 | { 311 | _jacobean.release(); 312 | return; 313 | } 314 | 315 | // allocate matrix data 316 | _jacobean.create(6, N*3, CV_64F); 317 | cv::Mat jacobean = _jacobean.getMat(); 318 | 319 | // cv::Mat dRidU, dRidVt, dRidV, dRidA, dRidXc, dRidX; 320 | cv::Mat dRdU, dRdVt; //9x9 321 | cv::Mat dAdXc; //9x3N 322 | cv::Mat dtdR; //3x9 323 | cv::Mat dtdcx; //3x3 324 | cv::Mat dcxdX, drdX, dtdX; //3x3N 325 | cv::Mat dRdX = cv::Mat_::zeros(9, N*3); //9x3N 326 | 327 | // jacobean matrices of each dot product operation in kabsch algorithm 328 | matMulDerivWrapper(U, Vt, dRdU, dRdVt); 329 | matMulDerivWrapper(Pc.t(), Xc, cv::noArray(), dAdXc); 330 | matMulDerivWrapper(R, cx.t(), dtdR, dtdcx); 331 | matMulDerivWrapper(invN, X, cv::noArray(), dcxdX); 332 | 333 | V = Vt.t(); 334 | W = W.reshape(1, 1); 335 | 336 | // #pragma omp parallel for 337 | for (int i = 0; i < 9; ++i) 338 | { 339 | cv::Mat dRidU, dRidVt, dRidV, dRidA; //3x3 340 | cv::Mat dRidXc, dRidX; //Nx3 341 | 342 | dRidU = dRdU.row(i).reshape(1, 3); 343 | dRidVt = dRdVt.row(i).reshape(1, 3); 344 | 345 | dRidV = dRidVt.t(); 346 | 347 | //W is not used in computation of R, no gradient of W is needed 348 | std::vector grads{dRidU, cv::Mat(), dRidV}; 349 | 350 | dRidA = svd_backward(grads, A, U, W, V); 351 | dRidA = dRidA.reshape(1, 1); 352 | dRidXc = dRidA * dAdXc; 353 | dRidXc = dRidXc.reshape(1, N); 354 | dRidX = cv::Mat::zeros(dRidXc.size(), dRidXc.type()); 355 | 356 | int bstep = dRidXc.step/CV_ELEM_SIZE(dRidXc.type()); 357 | 358 | // #pragma omp parallel for 359 | for (int j = 0; j < 3; ++j) 360 | { 361 | // compute dRidXj = dRidXcj * dXcjdXj 362 | float* pdRidXj = (float*)dRidX.data + j; 363 | const float* pdRidXcj = (const float*)dRidXc.data + j; 364 | 365 | float tmp = 0.f; 366 | for (unsigned int k = 0; k < N; ++k) 367 | { 368 | tmp += pdRidXcj[k*bstep]; 369 | } 370 | tmp /= N; 371 | 372 | for (unsigned int k = 0; k < N; ++k) 373 | { 374 | pdRidXj[k*bstep] = pdRidXcj[k*bstep] - tmp; 375 | } 376 | } 377 | 378 | dRidX = dRidX.reshape(1, 1); 379 | dRidX.copyTo(dRdX.rowRange(i, i+1)); 380 | } 381 | 382 | drdX = gRodr.t() * dRdX; 383 | drdX.copyTo(jacobean.rowRange(0, 3)); 384 | dtdX = - (dtdR * dRdX + dtdcx * dcxdX); 385 | dtdX.copyTo(jacobean.rowRange(3, 6)); 386 | 387 | } 388 | 389 | /** 390 | * @brief Checks whether the given matrix contains NaN entries. 391 | * @param m Input matrix. 392 | * @return True if m contrains NaN entries. 
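 *
 * Implementation note: this relies on the IEEE floating point property that NaN != NaN, so the
 * element-wise comparison m != m is non-zero exactly at the NaN entries.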
393 | */ 394 | inline bool containsNaNs(const cv::Mat& m) 395 | { 396 | return cv::sum(cv::Mat(m != m))[0] > 0; 397 | } 398 | 399 | /* 400 | * @brief Computes gradient of Kabsch algorithm and differentiation 401 | * using central finite differences 402 | * 403 | * @param imgdPts: measurement points 404 | * @param objPts: scene points 405 | * @param jacobean Output parameter: 6x3N jacobean matrix of rotation and translation vector 406 | * w.r.t scene point coordinates 407 | * @param eps: step size in finite difference approximation 408 | * 409 | */ 410 | void dKabschFD( 411 | std::vector& imgdPts, 412 | std::vector objPts, 413 | cv::OutputArray _jacobean, 414 | float eps = 0.001f) 415 | { 416 | _jacobean.create(6, objPts.size()*3, CV_64F); 417 | cv::Mat jacobean = _jacobean.getMat(); 418 | 419 | for (unsigned int i = 0; i < objPts.size(); ++i) 420 | { 421 | for (unsigned int j = 0; j < 3; ++j) 422 | { 423 | 424 | if(j == 0) objPts[i].x += eps; 425 | else if(j == 1) objPts[i].y += eps; 426 | else if(j == 2) objPts[i].z += eps; 427 | 428 | // forward step 429 | 430 | dsacstar::pose_t fStep; 431 | kabsch(imgdPts, objPts, fStep); 432 | 433 | if(j == 0) objPts[i].x -= 2 * eps; 434 | else if(j == 1) objPts[i].y -= 2 * eps; 435 | else if(j == 2) objPts[i].z -= 2 * eps; 436 | 437 | // backward step 438 | dsacstar::pose_t bStep; 439 | kabsch(imgdPts, objPts, bStep); 440 | 441 | if(j == 0) objPts[i].x += eps; 442 | else if(j == 1) objPts[i].y += eps; 443 | else if(j == 2) objPts[i].z += eps; 444 | 445 | // gradient calculation 446 | fStep.first = (fStep.first - bStep.first) / (2 * eps); 447 | fStep.second = (fStep.second - bStep.second) / (2 * eps); 448 | 449 | fStep.first.copyTo(jacobean.col(i * 3 + j).rowRange(0, 3)); 450 | fStep.second.copyTo(jacobean.col(i * 3 + j).rowRange(3, 6)); 451 | 452 | if(containsNaNs(jacobean.col(i * 3 + j))) 453 | jacobean.setTo(0); 454 | 455 | } 456 | } 457 | 458 | } 459 | 460 | void transform( 461 | const std::vector& objPts, 462 | const dsacstar::pose_t& transform, 463 | std::vector& transformedPts) 464 | { 465 | 466 | cv::Mat rot; 467 | cv::Rodrigues(transform.first, rot); 468 | 469 | // P = RX + T 470 | for (unsigned int i = 0; i < objPts.size(); ++i) 471 | { 472 | cv::Mat_ res= rot * cv::Mat_(objPts[i], false); 473 | 474 | cv::add(res, transform.second, res); 475 | 476 | transformedPts.push_back(cv::Point3f( 477 | res.at(0,0), 478 | res.at(1,0), 479 | res.at(2,0))); 480 | } 481 | } -------------------------------------------------------------------------------- /dsacstar/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import CppExtension, BuildExtension 3 | import os 4 | 5 | opencv_inc_dir = '' # directory containing OpenCV header files 6 | opencv_lib_dir = '' # directory containing OpenCV library files 7 | 8 | #if not explicitly provided, we try to locate OpenCV in the current Conda environment 9 | conda_env = os.environ['CONDA_PREFIX'] 10 | 11 | if len(conda_env) > 0 and len(opencv_inc_dir) == 0 and len(opencv_lib_dir) == 0: 12 | print("Detected active conda environment:", conda_env) 13 | 14 | opencv_inc_dir = conda_env + '/include/opencv4' 15 | opencv_lib_dir = conda_env + '/lib/opencv4' 16 | 17 | print("Assuming OpenCV dependencies in:") 18 | print(opencv_inc_dir) 19 | print(opencv_lib_dir) 20 | 21 | if len(opencv_inc_dir) == 0: 22 | print("Error: You have to provide an OpenCV include directory. 
Edit this file.") 23 | exit() 24 | if len(opencv_lib_dir) == 0: 25 | print("Error: You have to provide an OpenCV library directory. Edit this file.") 26 | exit() 27 | 28 | setup( 29 | name='dsacstar', 30 | ext_modules=[CppExtension( 31 | name='dsacstar', 32 | sources=['dsacstar.cpp','thread_rand.cpp'], 33 | include_dirs=[opencv_inc_dir], 34 | library_dirs=[opencv_lib_dir], 35 | libraries=['opencv_core','opencv_calib3d'], 36 | extra_compile_args=['-fopenmp'] 37 | )], 38 | cmdclass={'build_ext': BuildExtension}) 39 | -------------------------------------------------------------------------------- /dsacstar/stop_watch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, TU Dresden 3 | Copyright (c) 2017, Heidelberg University 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the TU Dresden, Heidelberg University nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL TU DRESDEN OR HEIDELBERG UNIVERSITY BE LIABLE FOR ANY 21 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | 30 | #pragma once 31 | 32 | #include 33 | 34 | /** 35 | * @brief Class for time measurements. 36 | */ 37 | class StopWatch 38 | { 39 | public: 40 | /** 41 | * @brief Construction. Initializes the stop watch. 42 | */ 43 | StopWatch(){ init(); } 44 | 45 | /** 46 | * @brief Initialization. Starts the time measurement. 47 | * 48 | * @return void 49 | */ 50 | void init() 51 | { 52 | start = std::chrono::high_resolution_clock::now(); 53 | } 54 | 55 | /** 56 | * @brief Stops and restarts the time measurement. 57 | * 58 | * @return float The time in ms since the last init or stop call. 59 | */ 60 | float stop() 61 | { 62 | std::chrono::high_resolution_clock::time_point now; 63 | now = std::chrono::high_resolution_clock::now(); 64 | 65 | std::chrono::high_resolution_clock::duration duration = now - start; 66 | 67 | start = now; 68 | 69 | return static_cast( 70 | 1000.0 * std::chrono::duration_cast>( 71 | duration).count()); 72 | } 73 | 74 | private: 75 | std::chrono::high_resolution_clock::time_point start; // start time of the current measurement. 
76 | }; 77 | -------------------------------------------------------------------------------- /dsacstar/thread_rand.cpp: -------------------------------------------------------------------------------- 1 | #include "thread_rand.h" 2 | #include <omp.h> 3 | 4 | std::vector<std::mt19937> ThreadRand::generators; 5 | bool ThreadRand::initialised = false; 6 | 7 | void ThreadRand::forceInit(unsigned seed) 8 | { 9 | initialised = false; 10 | init(seed); 11 | } 12 | 13 | void ThreadRand::init(unsigned seed) 14 | { 15 | #pragma omp critical 16 | { 17 | if(!initialised) 18 | { 19 | unsigned nThreads = omp_get_max_threads(); 20 | 21 | for(unsigned i = 0; i < nThreads; i++) 22 | { 23 | generators.push_back(std::mt19937()); 24 | generators[i].seed(i+seed); 25 | } 26 | 27 | initialised = true; 28 | } 29 | } 30 | } 31 | 32 | int ThreadRand::irand(int min, int max, int tid) 33 | { 34 | std::uniform_int_distribution<int> dist(min, max); 35 | 36 | unsigned threadID = omp_get_thread_num(); 37 | if(tid >= 0) threadID = tid; 38 | 39 | if(!initialised) init(); 40 | 41 | return dist(ThreadRand::generators[threadID]); 42 | } 43 | 44 | double ThreadRand::drand(double min, double max, int tid) 45 | { 46 | std::uniform_real_distribution<double> dist(min, max); 47 | 48 | unsigned threadID = omp_get_thread_num(); 49 | if(tid >= 0) threadID = tid; 50 | 51 | if(!initialised) init(); 52 | 53 | return dist(ThreadRand::generators[threadID]); 54 | } 55 | 56 | double ThreadRand::dgauss(double mean, double stdDev, int tid) 57 | { 58 | std::normal_distribution<double> dist(mean, stdDev); 59 | 60 | unsigned threadID = omp_get_thread_num(); 61 | if(tid >= 0) threadID = tid; 62 | 63 | if(!initialised) init(); 64 | 65 | return dist(ThreadRand::generators[threadID]); 66 | } 67 | 68 | int irand(int incMin, int excMax, int tid) 69 | { 70 | return ThreadRand::irand(incMin, excMax - 1, tid); 71 | } 72 | 73 | double drand(double incMin, double incMax, int tid) 74 | { 75 | return ThreadRand::drand(incMin, incMax, tid); 76 | } 77 | 78 | int igauss(int mean, int stdDev, int tid) 79 | { 80 | return (int) ThreadRand::dgauss(mean, stdDev, tid); 81 | } 82 | 83 | double dgauss(double mean, double stdDev, int tid) 84 | { 85 | return ThreadRand::dgauss(mean, stdDev, tid); 86 | } -------------------------------------------------------------------------------- /dsacstar/thread_rand.h: -------------------------------------------------------------------------------- 1 | /* 2 | Based on the DSAC++ and ESAC code. 3 | https://github.com/vislearn/LessMore 4 | https://github.com/vislearn/esac 5 | 6 | Copyright (c) 2016, TU Dresden 7 | Copyright (c) 2019, Heidelberg University 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | * Redistributions of source code must retain the above copyright 13 | notice, this list of conditions and the following disclaimer. 14 | * Redistributions in binary form must reproduce the above copyright 15 | notice, this list of conditions and the following disclaimer in the 16 | documentation and/or other materials provided with the distribution. 17 | * Neither the name of the TU Dresden, Heidelberg University nor the 18 | names of its contributors may be used to endorse or promote products 19 | derived from this software without specific prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL TU DRESDEN OR HEIDELBERG UNIVERSITY BE LIABLE FOR ANY 25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include 36 | 37 | /** Classes and methods for generating random numbers in multi-threaded programs. */ 38 | 39 | /** 40 | * @brief Provides random numbers for multiple threads. 41 | * 42 | * Singelton class. Holds a random number generator for each thread and gives random numbers for the current thread. 43 | */ 44 | class ThreadRand 45 | { 46 | public: 47 | /** 48 | * @brief Returns a random integer (uniform distribution). 49 | * 50 | * @param min Minimum value of the random integer (inclusive). 51 | * @param max Maximum value of the random integer (exclusive). 52 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 53 | * @return int Random integer value. 54 | */ 55 | static int irand(int min, int max, int tid = -1); 56 | 57 | /** 58 | * @brief Returns a random double value (uniform distribution). 59 | * 60 | * @param min Minimum value of the random double (inclusive). 61 | * @param max Maximum value of the random double (inclusive). 62 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 63 | * @return double Random double value. 64 | */ 65 | static double drand(double min, double max, int tid = -1); 66 | 67 | /** 68 | * @brief Returns a random double value (Gauss distribution). 69 | * 70 | * @param mean Mean of the Gauss distribution to sample from. 71 | * @param stdDev Standard deviation of the Gauss distribution to sample from. 72 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 73 | * @return double Random double value. 74 | */ 75 | static double dgauss(double mean, double stdDev, int tid = -1); 76 | 77 | /** 78 | * @brief Re-Initialize the object with the given seed. 79 | * 80 | * @param seed Seed to initialize the random number generators (seed is incremented by one for each generator). 81 | * @return void 82 | */ 83 | static void forceInit(unsigned seed); 84 | 85 | /** 86 | * @brief List of random number generators. One for each thread. 87 | * 88 | */ 89 | static std::vector generators; 90 | 91 | /** 92 | * @brief Initialize class with the given seed. 93 | * 94 | * Method will create a random number generator for each thread. The given seed 95 | * will be incremented by one for each generator. This methods is automatically 96 | * called when this calss is used the first time. 97 | * 98 | * @param seed Optional parameter. Seed to be used when initializing the generators. Will be incremented by one for each generator. 
99 | * @return void 100 | */ 101 | static void init(unsigned seed = 1305); 102 | 103 | private: 104 | /** 105 | * @brief True if the class has been initialized already 106 | */ 107 | static bool initialised; 108 | }; 109 | 110 | /** 111 | * @brief Returns a random integer (uniform distribution). 112 | * 113 | * This method used the ThreadRand class. 114 | * 115 | * @param min Minimum value of the random integer (inclusive). 116 | * @param max Maximum value of the random integer (exclusive). 117 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 118 | * @return int Random integer value. 119 | */ 120 | int irand(int incMin, int excMax, int tid = -1); 121 | /** 122 | * @brief Returns a random double value (uniform distribution). 123 | * 124 | * This method used the ThreadRand class. 125 | * 126 | * @param min Minimum value of the random double (inclusive). 127 | * @param max Maximum value of the random double (inclusive). 128 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 129 | * @return double Random double value. 130 | */ 131 | double drand(double incMin, double incMax, int tid = -1); 132 | 133 | /** 134 | * @brief Returns a random integer value (Gauss distribution). 135 | * 136 | * This method used the ThreadRand class. 137 | * 138 | * @param mean Mean of the Gauss distribution to sample from. 139 | * @param stdDev Standard deviation of the Gauss distribution to sample from. 140 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 141 | * @return double Random integer value. 142 | */ 143 | int igauss(int mean, int stdDev, int tid = -1); 144 | 145 | /** 146 | * @brief Returns a random double value (Gauss distribution). 147 | * 148 | * This method used the ThreadRand class. 149 | * 150 | * @param mean Mean of the Gauss distribution to sample from. 151 | * @param stdDev Standard deviation of the Gauss distribution to sample from. 152 | * @param tid Optional parameter. ID of the thread to use. If not given, the method will obtain the thread ID itself. 153 | * @return double Random double value. 
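 *
 * Example (illustrative): dgauss(0.0, 1.0) draws one sample from a standard normal distribution.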
154 | */ 155 | double dgauss(double mean, double stdDev, int tid = -1); 156 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ace 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - anaconda 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - _libgcc_mutex=0.1=main 10 | - _openmp_mutex=5.1=1_gnu 11 | - alsa-lib=1.2.3.2=h166bdaf_0 12 | - appdirs=1.4.4=pyhd3eb1b0_0 13 | - blas=1.0=mkl 14 | - blosc=1.21.3=h6a678d5_0 15 | - brotli=1.0.9=h5eee18b_7 16 | - brotli-bin=1.0.9=h5eee18b_7 17 | - brotlipy=0.7.0=py38h27cfd23_1003 18 | - brunsli=0.1=h2531618_0 19 | - bzip2=1.0.8=h7b6447c_0 20 | - c-ares=1.19.0=h5eee18b_0 21 | - ca-certificates=2023.01.10=h06a4308_0 22 | - cairo=1.16.0=h6cf1ce9_1008 23 | - certifi=2022.12.7=py38h06a4308_0 24 | - cffi=1.15.1=py38h5eee18b_3 25 | - cfitsio=3.470=h5893167_7 26 | - charls=2.2.0=h2531618_0 27 | - charset-normalizer=2.0.4=pyhd3eb1b0_0 28 | - cloudpickle=2.0.0=pyhd3eb1b0_0 29 | - contourpy=1.0.5=py38hdb19cb5_0 30 | - cryptography=39.0.1=py38h9ce1e76_0 31 | - cuda-cudart=11.8.89=0 32 | - cuda-cupti=11.8.87=0 33 | - cuda-libraries=11.8.0=0 34 | - cuda-nvrtc=11.8.89=0 35 | - cuda-nvtx=11.8.86=0 36 | - cuda-runtime=11.8.0=0 37 | - cycler=0.11.0=pyhd8ed1ab_0 38 | - cytoolz=0.12.0=py38h5eee18b_0 39 | - dask-core=2022.7.0=py38h06a4308_0 40 | - dataclasses=0.8=pyhc8e2a94_3 41 | - dbus=1.13.6=h48d8840_2 42 | - expat=2.4.8=h27087fc_0 43 | - ffmpeg=4.3=hf484d3e_0 44 | - filelock=3.9.0=py38h06a4308_0 45 | - flit-core=3.8.0=py38h06a4308_0 46 | - fontconfig=2.14.0=h8e229c2_0 47 | - fonttools=4.25.0=pyhd3eb1b0_0 48 | - freetype=2.12.1=h4a9f257_0 49 | - freetype-py=2.3.0=pyhd8ed1ab_0 50 | - fsspec=2023.3.0=py38h06a4308_0 51 | - future=0.18.3=pyhd8ed1ab_0 52 | - gettext=0.19.8.1=h73d1719_1008 53 | - giflib=5.2.1=h5eee18b_3 54 | - glib=2.68.4=h9c3ff4c_1 55 | - glib-tools=2.68.4=h9c3ff4c_1 56 | - gmp=6.2.1=h295c915_3 57 | - gmpy2=2.1.2=py38heeb90bb_0 58 | - gnutls=3.6.15=he1e5248_0 59 | - graphite2=1.3.13=h58526e2_1001 60 | - gst-plugins-base=1.18.5=hf529b03_0 61 | - gstreamer=1.18.5=h76c114f_0 62 | - harfbuzz=2.9.1=h83ec7ef_1 63 | - hdf5=1.10.6=nompi_h6a2412b_1114 64 | - icu=68.2=h9c3ff4c_0 65 | - idna=3.4=py38h06a4308_0 66 | - imagecodecs-lite=2019.12.3=py38h71d37f0_5 67 | - imageio=2.26.0=py38h06a4308_0 68 | - importlib_resources=5.12.0=pyhd8ed1ab_0 69 | - intel-openmp=2021.4.0=h06a4308_3561 70 | - jasper=1.900.1=h07fcdf6_1006 71 | - jinja2=3.1.2=py38h06a4308_0 72 | - joblib=1.1.1=py38h06a4308_0 73 | - jpeg=9e=h5eee18b_1 74 | - jxrlib=1.1=h7b6447c_2 75 | - kiwisolver=1.4.4=py38h6a678d5_0 76 | - krb5=1.19.4=h568e23c_0 77 | - lame=3.100=h7b6447c_0 78 | - lcms2=2.12=h3be6417_0 79 | - ld_impl_linux-64=2.38=h1181459_1 80 | - lerc=3.0=h295c915_0 81 | - libaec=1.0.4=he6710b0_1 82 | - libblas=3.9.0=12_linux64_mkl 83 | - libbrotlicommon=1.0.9=h5eee18b_7 84 | - libbrotlidec=1.0.9=h5eee18b_7 85 | - libbrotlienc=1.0.9=h5eee18b_7 86 | - libcblas=3.9.0=12_linux64_mkl 87 | - libclang=11.1.0=default_ha53f305_1 88 | - libcublas=11.11.3.6=0 89 | - libcufft=10.9.0.58=0 90 | - libcufile=1.6.0.25=0 91 | - libcurand=10.3.2.56=0 92 | - libcurl=7.88.1=h91b91d3_0 93 | - libcusolver=11.4.1.48=0 94 | - libcusparse=11.7.5.86=0 95 | - libdeflate=1.17=h5eee18b_0 96 | - libedit=3.1.20221030=h5eee18b_0 97 | - libev=4.33=h7f8727e_1 98 | - libevent=2.1.10=h9b69904_4 99 | - libffi=3.4.2=h6a678d5_6 100 | - libgcc-ng=11.2.0=h1234567_1 101 | - libgfortran-ng=11.2.0=h00389a5_1 
102 | - libgfortran5=11.2.0=h1234567_1 103 | - libglib=2.68.4=h174f98d_1 104 | - libglu=9.0.0=hf484d3e_1 105 | - libgomp=11.2.0=h1234567_1 106 | - libiconv=1.16=h7f8727e_2 107 | - libidn2=2.3.2=h7f8727e_0 108 | - liblapack=3.9.0=12_linux64_mkl 109 | - liblapacke=3.9.0=12_linux64_mkl 110 | - libllvm11=11.1.0=hf817b99_2 111 | - libnghttp2=1.46.0=hce63b2e_0 112 | - libnpp=11.8.0.86=0 113 | - libnvjpeg=11.9.0.86=0 114 | - libogg=1.3.4=h7f98852_1 115 | - libopencv=4.4.0=py38_2 116 | - libopus=1.3.1=h7f98852_1 117 | - libpng=1.6.39=h5eee18b_0 118 | - libpq=13.3=hd57d9b9_0 119 | - libssh2=1.10.0=h8f2d780_0 120 | - libstdcxx-ng=11.2.0=h1234567_1 121 | - libtasn1=4.16.0=h27cfd23_0 122 | - libtiff=4.2.0=hf544144_3 123 | - libunistring=0.9.10=h27cfd23_0 124 | - libuuid=2.32.1=h7f98852_1000 125 | - libvorbis=1.3.7=h9c3ff4c_0 126 | - libwebp=1.2.4=h11a3e52_1 127 | - libwebp-base=1.2.4=h5eee18b_1 128 | - libxcb=1.13=h7f98852_1004 129 | - libxkbcommon=1.0.3=he3ba5ed_0 130 | - libxml2=2.9.12=h72842e0_0 131 | - libzopfli=1.0.3=he6710b0_0 132 | - locket=1.0.0=py38h06a4308_0 133 | - lz4-c=1.9.4=h6a678d5_0 134 | - markupsafe=2.1.1=py38h7f8727e_0 135 | - matplotlib=3.7.1=py38h578d9bd_0 136 | - matplotlib-base=3.7.1=py38h417a72b_1 137 | - mkl=2021.4.0=h06a4308_640 138 | - mkl-service=2.4.0=py38h7f8727e_0 139 | - mkl_fft=1.3.1=py38hd3c417c_0 140 | - mkl_random=1.2.2=py38h51133e4_0 141 | - mpc=1.1.0=h10f8cd9_1 142 | - mpfr=4.0.2=hb69a4c5_1 143 | - mpmath=1.2.1=py38h06a4308_0 144 | - munkres=1.1.4=pyh9f0ad1d_0 145 | - mysql-common=8.0.25=ha770c72_2 146 | - mysql-libs=8.0.25=hfa10184_2 147 | - ncurses=6.4=h6a678d5_0 148 | - nettle=3.7.3=hbbd107a_1 149 | - networkx=2.8.4=py38h06a4308_1 150 | - nspr=4.32=h9c3ff4c_1 151 | - nss=3.69=hb5efdd6_1 152 | - numpy=1.23.5=py38h14f4228_0 153 | - numpy-base=1.23.5=py38h31eccc5_0 154 | - opencv=4.4.0=py38_2 155 | - openh264=2.1.1=h4ff587b_0 156 | - openjpeg=2.4.0=h3ad879b_0 157 | - openssl=1.1.1t=h7f8727e_0 158 | - packaging=23.0=py38h06a4308_0 159 | - partd=1.2.0=pyhd3eb1b0_1 160 | - pcre=8.45=h9c3ff4c_0 161 | - pillow=9.4.0=py38h6a678d5_0 162 | - pip=23.0.1=py38h06a4308_0 163 | - pixman=0.40.0=h36c2ea0_0 164 | - pooch=1.4.0=pyhd3eb1b0_0 165 | - pthread-stubs=0.4=h36c2ea0_1001 166 | - py-opencv=4.4.0=py38h23f93f0_2 167 | - pycparser=2.21=pyhd3eb1b0_0 168 | - pyglet=1.5.27=py38h578d9bd_3 169 | - pyopengl=3.1.6=pyhd8ed1ab_1 170 | - pyopenssl=23.0.0=py38h06a4308_0 171 | - pyparsing=3.0.9=pyhd8ed1ab_0 172 | - pyqt=5.12.3=py38ha8c2ead_4 173 | - pyrender=0.1.45=pyh8a188c0_3 174 | - pysocks=1.7.1=py38h06a4308_0 175 | - python=3.8.16=h7a1cb2a_3 176 | - python-dateutil=2.8.2=pyhd8ed1ab_0 177 | - python_abi=3.8=2_cp38 178 | - pytorch=2.0.0=py3.8_cuda11.8_cudnn8.7.0_0 179 | - pytorch-cuda=11.8=h7e8668a_3 180 | - pytorch-mutex=1.0=cuda 181 | - pywavelets=1.4.1=py38h5eee18b_0 182 | - pyyaml=6.0=py38h5eee18b_1 183 | - qt=5.12.9=hda022c4_4 184 | - readline=8.2=h5eee18b_0 185 | - requests=2.28.1=py38h06a4308_1 186 | - scikit-image=0.19.3=py38h6a678d5_1 187 | - scipy=1.10.0=py38h14f4228_1 188 | - setuptools=65.6.3=py38h06a4308_0 189 | - six=1.16.0=pyhd3eb1b0_1 190 | - snappy=1.1.9=h295c915_0 191 | - sqlite=3.41.1=h5eee18b_0 192 | - sympy=1.11.1=py38h06a4308_0 193 | - tifffile=2020.6.3=py_0 194 | - tk=8.6.12=h1ccaba5_0 195 | - toolz=0.12.0=py38h06a4308_0 196 | - torchaudio=2.0.0=py38_cu118 197 | - torchtriton=2.0.0=py38 198 | - torchvision=0.15.0=py38_cu118 199 | - tornado=6.1=py38h0a891b7_3 200 | - trimesh=3.21.2=pyhd8ed1ab_0 201 | - typing_extensions=4.4.0=py38h06a4308_0 202 | - 
urllib3=1.26.14=py38h06a4308_0 203 | - wheel=0.38.4=py38h06a4308_0 204 | - x264=1!157.20191217=h7b6447c_0 205 | - xorg-kbproto=1.0.7=h7f98852_1002 206 | - xorg-libice=1.0.10=h7f98852_0 207 | - xorg-libsm=1.2.3=hd9c2040_1000 208 | - xorg-libx11=1.7.2=h7f98852_0 209 | - xorg-libxau=1.0.9=h7f98852_0 210 | - xorg-libxdmcp=1.1.3=h7f98852_0 211 | - xorg-libxext=1.3.4=h7f98852_1 212 | - xorg-libxrender=0.9.10=h7f98852_1003 213 | - xorg-renderproto=0.11.1=h7f98852_1002 214 | - xorg-xextproto=7.3.0=h7f98852_1002 215 | - xorg-xproto=7.0.31=h7f98852_1007 216 | - xz=5.2.10=h5eee18b_1 217 | - yaml=0.2.5=h7b6447c_0 218 | - zfp=0.5.5=h295c915_6 219 | - zipp=3.15.0=pyhd8ed1ab_0 220 | - zlib=1.2.13=h5eee18b_0 221 | - zstd=1.5.2=ha4553b6_0 222 | - pip: 223 | - pyqt5-sip==4.19.18 224 | - pyqtchart==5.12 225 | - pyqtwebengine==5.12.1 226 | -------------------------------------------------------------------------------- /eval_poses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright © Niantic, Inc. 2022. 3 | 4 | import argparse 5 | import logging 6 | import math 7 | from pathlib import Path 8 | 9 | import cv2 10 | import numpy as np 11 | from scipy.spatial.transform import Rotation 12 | 13 | from dataset import CamLocDataset 14 | 15 | _logger = logging.getLogger(__name__) 16 | 17 | if __name__ == '__main__': 18 | logging.basicConfig(level=logging.INFO) 19 | parser = argparse.ArgumentParser( 20 | description='Compute metrics for a pre-existing poses file.', 21 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 22 | 23 | parser.add_argument('scene', type=Path, help='name of a scene in the dataset folder, e.g. Cambridge_GreatCourt') 24 | parser.add_argument('poses_file', type=Path, help='file containing poses estimated for the input scene') 25 | 26 | opt = parser.parse_args() 27 | 28 | # Load dataset. 29 | testset = CamLocDataset( 30 | opt.scene / "test", 31 | mode=0, # Default for ACE, we don't need scene coordinates/RGB-D. 32 | ) 33 | _logger.info(f"Loaded scene with {len(testset)} frames.") 34 | 35 | # load pre-existing poses 36 | with opt.poses_file.open('r') as f: 37 | frame_poses = f.readlines() 38 | _logger.info(f"Loaded {len(frame_poses)} poses.") 39 | 40 | # Check. 41 | assert len(testset) == len(frame_poses) 42 | 43 | # Keep track of rotation and translation errors for calculation of the median error. 44 | rErrs = [] 45 | tErrs = [] 46 | 47 | # Percentage of frames predicted within certain thresholds from their GT pose. 48 | pct10_5 = 0 49 | pct5 = 0 50 | pct2 = 0 51 | pct1 = 0 52 | 53 | # Iterate over the dataset. 54 | for image_idx, (_, _, gt_pose, _, _, _, _, image_file) in enumerate(testset): 55 | # Parse estimated pose from the input file. 56 | pose_file, qw, qx, qy, qz, tx, ty, tz, _, _, _ = frame_poses[image_idx].split() 57 | 58 | # Check that the files match. 59 | assert Path(image_file).name == pose_file 60 | 61 | # We do everything in np. 62 | gt_pose = gt_pose.numpy() 63 | 64 | # Convert quaternion to rotation matrix. 65 | r_mat = Rotation.from_quat([float(x) for x in [qx, qy, qz, qw]]).as_matrix() 66 | t_vec = np.array([float(x) for x in [tx, ty, tz]]) 67 | 68 | # We saved the inverse pose 69 | estimated_pose_inv = np.eye(4) 70 | estimated_pose_inv[:3, :3] = r_mat 71 | estimated_pose_inv[:3, 3] = t_vec 72 | 73 | # Pose we use for evaluation. 
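        # The poses file stores the inverse transform (estimated_pose_inv, built above from the
        # quaternion and translation), so we invert it here to obtain the camera pose that is
        # compared against gt_pose.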
74 | estimated_pose = np.linalg.inv(estimated_pose_inv) 75 | 76 | # calculate pose errors 77 | t_err = float(np.linalg.norm(gt_pose[0:3, 3] - estimated_pose[0:3, 3])) 78 | 79 | gt_R = gt_pose[0:3, 0:3] 80 | out_R = estimated_pose[0:3, 0:3] 81 | 82 | r_err = np.matmul(out_R, np.transpose(gt_R)) 83 | r_err = cv2.Rodrigues(r_err)[0] 84 | r_err = np.linalg.norm(r_err) * 180 / math.pi 85 | 86 | _logger.info("Rotation Error: %.2fdeg, Translation Error: %.1fcm" % (r_err, t_err * 100)) 87 | 88 | # Save the errors. 89 | rErrs.append(r_err) 90 | tErrs.append(t_err * 100) 91 | 92 | # Check various thresholds. 93 | if r_err < 5 and t_err < 0.1: # 10cm/5deg 94 | pct10_5 += 1 95 | if r_err < 5 and t_err < 0.05: # 5cm/5deg 96 | pct5 += 1 97 | if r_err < 2 and t_err < 0.02: # 2cm/2deg 98 | pct2 += 1 99 | if r_err < 1 and t_err < 0.01: # 1cm/1deg 100 | pct1 += 1 101 | 102 | total_frames = len(rErrs) 103 | assert total_frames == len(testset) 104 | 105 | # Compute median errors. 106 | tErrs.sort() 107 | rErrs.sort() 108 | median_idx = total_frames // 2 109 | median_rErr = rErrs[median_idx] 110 | median_tErr = tErrs[median_idx] 111 | 112 | # Compute final metrics. 113 | pct10_5 = pct10_5 / total_frames * 100 114 | pct5 = pct5 / total_frames * 100 115 | pct2 = pct2 / total_frames * 100 116 | pct1 = pct1 / total_frames * 100 117 | 118 | _logger.info("===================================================") 119 | _logger.info("Test complete.") 120 | 121 | _logger.info('Accuracy:') 122 | _logger.info(f'\t10cm/5deg: {pct10_5:.1f}%') 123 | _logger.info(f'\t5cm/5deg: {pct5:.1f}%') 124 | _logger.info(f'\t2cm/2deg: {pct2:.1f}%') 125 | _logger.info(f'\t1cm/1deg: {pct1:.1f}%') 126 | 127 | _logger.info(f"Median Error: {median_rErr:.1f}deg, {median_tErr:.1f}cm") 128 | -------------------------------------------------------------------------------- /merge_ensemble_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright © Niantic, Inc. 2022. 
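# Each line of an input poses file is expected to have the following format
# (cf. the parsing loop below):
#   image_name qw qx qy qz tx ty tz r_err t_err inlier_count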
3 | 4 | import logging 5 | from argparse import ArgumentParser 6 | from collections import defaultdict 7 | from dataclasses import dataclass, field 8 | from pathlib import Path 9 | from typing import List 10 | 11 | _logger = logging.getLogger(__name__) 12 | 13 | @dataclass 14 | class FrameResult: 15 | inlier_count: int = 0 16 | quaternion: List[float] = field(default_factory=lambda: [1, 0, 0, 0]) 17 | translation: List[float] = field(default_factory=lambda: [0, 0, 0]) 18 | r_err: float = 0 19 | t_err: float = 0 20 | 21 | 22 | if __name__ == '__main__': 23 | logging.basicConfig(level=logging.INFO) 24 | parser = ArgumentParser( 25 | description="Merge results created by multiple nets trained on clustered datasets, " 26 | "keeping the best pose for each image (in terms of inlier count).") 27 | parser.add_argument('poses_path', type=Path, 28 | help="Path to a folder containing the estimated poses for each network.") 29 | parser.add_argument('out_file', type=Path, 30 | help="Path to the output file containing the best pose for each image.") 31 | parser.add_argument('--poses_suffix', type=str, default='.txt', help='Suffix to select a subset of pose files.') 32 | 33 | args = parser.parse_args() 34 | 35 | poses_path: Path = args.poses_path 36 | out_file: Path = args.out_file 37 | 38 | pose_files = sorted(poses_path.glob(f"poses_*{args.poses_suffix}")) 39 | _logger.info(f"Found {len(pose_files)} pose files.") 40 | 41 | frame_poses = defaultdict(FrameResult) 42 | 43 | for in_file in pose_files: 44 | _logger.info(f"Parsing results from: {in_file}") 45 | with in_file.open('r') as f: 46 | for line in f.readlines(): 47 | current_result = FrameResult() 48 | img, current_result.quaternion[0], current_result.quaternion[1], current_result.quaternion[2], \ 49 | current_result.quaternion[3],\ 50 | current_result.translation[0], current_result.translation[1], current_result.translation[2],\ 51 | current_result.r_err, current_result.t_err, current_result.inlier_count = line.split() 52 | 53 | # Convert to the appropriate datatypes. 54 | current_result.inlier_count = int(current_result.inlier_count) 55 | current_result.quaternion = [float(x) for x in current_result.quaternion] 56 | current_result.translation = [float(x) for x in current_result.translation] 57 | current_result.r_err = float(current_result.r_err) 58 | current_result.t_err = float(current_result.t_err) 59 | 60 | # Update global dict if needed. 61 | if frame_poses[img].inlier_count < current_result.inlier_count: 62 | frame_poses[img] = current_result 63 | 64 | _logger.info(f"Found results for {len(frame_poses)} query frames.") 65 | 66 | # Save the output. 67 | with out_file.open('w') as f: 68 | for img_name in sorted(frame_poses.keys()): 69 | frame_result = frame_poses[img_name] 70 | f.write( 71 | f"{img_name} " 72 | f"{' '.join(str(x) for x in frame_result.quaternion)} " 73 | f"{' '.join(str(x) for x in frame_result.translation)} " 74 | f"{frame_result.r_err} {frame_result.t_err} {frame_result.inlier_count}\n") 75 | _logger.info(f"Saved merged poses to: {out_file}") 76 | -------------------------------------------------------------------------------- /scripts/train_12scenes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("12scenes_apt1_kitchen" "12scenes_apt1_living" "12scenes_apt2_bed" "12scenes_apt2_kitchen" "12scenes_apt2_living" "12scenes_apt2_luke" "12scenes_office1_gates362" "12scenes_office1_gates381" "12scenes_office1_lounge" "12scenes_office1_manolis" "12scenes_office2_5a" "12scenes_office2_5b") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/12Scenes" 14 | mkdir -p "$out_dir" 15 | 16 | for scene in ${scenes[*]}; do 17 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 18 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 19 | done 20 | 21 | for scene in ${scenes[*]}; do 22 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -5 | head -1)" 23 | done 24 | -------------------------------------------------------------------------------- /scripts/train_12scenes_pgt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("pgt_12scenes_apt1_kitchen" "pgt_12scenes_apt1_living" "pgt_12scenes_apt2_bed" "pgt_12scenes_apt2_kitchen" "pgt_12scenes_apt2_living" "pgt_12scenes_apt2_luke" "pgt_12scenes_office1_gates362" "pgt_12scenes_office1_gates381" "pgt_12scenes_office1_lounge" "pgt_12scenes_office1_manolis" "pgt_12scenes_office2_5a" "pgt_12scenes_office2_5b") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/pgt_12Scenes" 14 | mkdir -p "$out_dir" 15 | 16 | for scene in ${scenes[*]}; do 17 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 18 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 19 | done 20 | 21 | for scene in ${scenes[*]}; do 22 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -5 | head -1)" 23 | done 24 | -------------------------------------------------------------------------------- /scripts/train_7scenes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("7scenes_chess" "7scenes_fire" "7scenes_heads" "7scenes_office" "7scenes_pumpkin" "7scenes_redkitchen" "7scenes_stairs") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/7Scenes" 14 | mkdir -p "$out_dir" 15 | 16 | for scene in ${scenes[*]}; do 17 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 18 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 19 | done 20 | 21 | for scene in ${scenes[*]}; do 22 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -5 | head -1)" 23 | done 24 | -------------------------------------------------------------------------------- /scripts/train_7scenes_pgt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("pgt_7scenes_chess" "pgt_7scenes_fire" "pgt_7scenes_heads" "pgt_7scenes_office" "pgt_7scenes_pumpkin" "pgt_7scenes_redkitchen" "pgt_7scenes_stairs") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/pgt_7Scenes" 14 | mkdir -p "$out_dir" 15 | 16 | for scene in ${scenes[*]}; do 17 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 18 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 19 | done 20 | 21 | for scene in ${scenes[*]}; do 22 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -5 | head -1)" 23 | done 24 | -------------------------------------------------------------------------------- /scripts/train_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ./train_7scenes.sh 4 | ./train_7scenes_pgt.sh 5 | ./train_12scenes.sh 6 | ./train_12scenes_pgt.sh 7 | ./train_cambridge.sh 8 | ./train_wayspots.sh 9 | ./train_cambridge_ensemble.sh 2 10 | ./train_cambridge_ensemble.sh 3 11 | ./train_cambridge_ensemble.sh 4 12 | ./train_cambridge_ensemble.sh 5 13 | ./train_cambridge_ensemble.sh 6 14 | ./train_cambridge_ensemble.sh 7 15 | -------------------------------------------------------------------------------- /scripts/train_cambridge.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("Cambridge_GreatCourt" "Cambridge_KingsCollege" "Cambridge_OldHospital" "Cambridge_ShopFacade" "Cambridge_StMarysChurch") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/Cambridge" 14 | 15 | mkdir -p "$out_dir" 16 | 17 | for scene in ${scenes[*]}; do 18 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 19 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 20 | done 21 | 22 | for scene in ${scenes[*]}; do 23 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -2 | head -1)" 24 | done 25 | -------------------------------------------------------------------------------- /scripts/train_cambridge_ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("Cambridge_GreatCourt" "Cambridge_KingsCollege" "Cambridge_OldHospital" "Cambridge_ShopFacade" "Cambridge_StMarysChurch") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | merging_exe="${REPO_PATH}/merge_ensemble_results.py" 12 | eval_exe="${REPO_PATH}/eval_poses.py" 13 | 14 | datasets_folder="${REPO_PATH}/datasets" 15 | out_dir="${REPO_PATH}/output/Cambridge_Ensemble" 16 | mkdir -p "$out_dir" 17 | 18 | num_clusters=${1:-1} 19 | 20 | for scene in ${scenes[*]}; do 21 | current_out_dir="$out_dir/$scene" 22 | mkdir -p "$current_out_dir" 23 | 24 | for cluster_idx in $(seq 0 $((num_clusters-1))); do 25 | echo "Training network for scene: $scene and cluster: $cluster_idx/$num_clusters" 26 | 27 | net_file="${current_out_dir}/${cluster_idx}_${num_clusters}.pt" 28 | 29 | [[ -f "$net_file" ]] || python $training_exe "$datasets_folder/$scene" "$net_file" --num_clusters $num_clusters --cluster_idx $cluster_idx | tee "${current_out_dir}/train_${cluster_idx}_${num_clusters}.txt" 30 | python $testing_exe "$datasets_folder/$scene" "$net_file" --session "${cluster_idx}_${num_clusters}" 2>&1 | tee "${current_out_dir}/test_${cluster_idx}_${num_clusters}.txt" 31 | done 32 | 33 | python $merging_exe "$current_out_dir" "$current_out_dir/merged_poses_${num_clusters}.txt" --poses_suffix "_${num_clusters}.txt" 34 | python $eval_exe "$datasets_folder/$scene" "$current_out_dir/merged_poses_${num_clusters}.txt" 2>&1 | tee "$current_out_dir/eval_merged_${num_clusters}.txt" 35 | done 36 | 37 | for scene in ${scenes[*]}; do 38 | current_out_dir="$out_dir/$scene" 39 | echo "${scene}: $(cat "$current_out_dir/eval_merged_${num_clusters}.txt" | tail -1)" 40 | done 41 | -------------------------------------------------------------------------------- /scripts/train_wayspots.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("wayspots_squarebench" "wayspots_bears" "wayspots_cubes" "wayspots_inscription" "wayspots_lawn" "wayspots_map" "wayspots_statue" "wayspots_tendrils" "wayspots_therock" "wayspots_wintersign") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/wayspots" 14 | mkdir -p "$out_dir" 15 | 16 | for scene in ${scenes[*]}; do 17 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 18 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" 2>&1 | tee "$out_dir/log_${scene}.txt" 19 | done 20 | 21 | for scene in ${scenes[*]}; do 22 | echo "${scene}: $(cat "${out_dir}/log_${scene}.txt" | tail -6 | head -1)" 23 | done 24 | -------------------------------------------------------------------------------- /scripts/viz_12scenes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("12scenes_apt1_kitchen" "12scenes_apt1_living" "12scenes_apt2_bed" "12scenes_apt2_kitchen" "12scenes_apt2_living" "12scenes_apt2_luke" "12scenes_office1_gates362" "12scenes_office1_gates381" "12scenes_office1_lounge" "12scenes_office1_manolis" "12scenes_office2_5a" "12scenes_office2_5b") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/viz_maps/12Scenes" 14 | renderings_dir="${REPO_PATH}/output/renderings/12Scenes" 15 | 16 | mkdir -p "$out_dir" 17 | 18 | for scene in ${scenes[*]}; do 19 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_camera_z_offset 2 20 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_pose_error_threshold 10 21 | /usr/bin/ffmpeg -framerate 30 -pattern_type glob -i "$renderings_dir/$scene/*.png" -c:v libx264 -pix_fmt yuv420p "$renderings_dir/$scene.mp4" 22 | done -------------------------------------------------------------------------------- /scripts/viz_7scenes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("7scenes_chess" "7scenes_fire" "7scenes_heads" "7scenes_office" "7scenes_pumpkin" "7scenes_redkitchen" "7scenes_stairs") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/viz_maps/7Scenes" 14 | renderings_dir="${REPO_PATH}/output/renderings/7Scenes" 15 | 16 | mkdir -p "$out_dir" 17 | mkdir -p "$renderings_dir" 18 | 19 | for scene in ${scenes[*]}; do 20 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_camera_z_offset 2 21 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_pose_error_threshold 10 --render_frame_skip 3 22 | /usr/bin/ffmpeg -framerate 30 -pattern_type glob -i "$renderings_dir/$scene/*.png" -c:v libx264 -pix_fmt yuv420p "$renderings_dir/$scene.mp4" 23 | done -------------------------------------------------------------------------------- /scripts/viz_cambridge.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo. 4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=( "Cambridge_OldHospital" "Cambridge_ShopFacade" "Cambridge_KingsCollege" "Cambridge_GreatCourt" "Cambridge_StMarysChurch") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/viz_maps/Cambridge" 14 | renderings_dir="${REPO_PATH}/output/renderings/Cambridge" 15 | 16 | mkdir -p "$out_dir" 17 | 18 | for scene in ${scenes[*]}; do 19 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_map_error_threshold 50 --render_map_depth_filter 50 20 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_sparse_queries True --render_pose_error_threshold 100 --render_map_depth_filter 50 --render_camera_z_offset 8 21 | /usr/bin/ffmpeg -framerate 30 -pattern_type glob -i "$renderings_dir/$scene/*.png" -c:v libx264 -pix_fmt yuv420p "$renderings_dir/$scene.mp4" 22 | done -------------------------------------------------------------------------------- /scripts/viz_wayspots.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Find the path to the root of the repo.
4 | SCRIPT_PATH=$(dirname $(realpath -s "$0")) 5 | REPO_PATH=$(realpath -s "${SCRIPT_PATH}/..") 6 | 7 | scenes=("wayspots_squarebench" "wayspots_bears" "wayspots_cubes" "wayspots_inscription" "wayspots_lawn" "wayspots_map" "wayspots_statue" "wayspots_tendrils" "wayspots_therock" "wayspots_wintersign") 8 | 9 | training_exe="${REPO_PATH}/train_ace.py" 10 | testing_exe="${REPO_PATH}/test_ace.py" 11 | 12 | datasets_folder="${REPO_PATH}/datasets" 13 | out_dir="${REPO_PATH}/output/viz_maps/wayspots" 14 | renderings_dir="${REPO_PATH}/output/renderings/wayspots" 15 | 16 | mkdir -p "$out_dir" 17 | 18 | for scene in ${scenes[*]}; do 19 | python $training_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_flipped_portrait True 20 | python $testing_exe "$datasets_folder/$scene" "$out_dir/$scene.pt" --render_visualization True --render_target_path "$renderings_dir" --render_flipped_portrait True 21 | /usr/bin/ffmpeg -framerate 30 -pattern_type glob -i "$renderings_dir/$scene/*.png" -c:v libx264 -pix_fmt yuv420p "$renderings_dir/$scene.mp4" 22 | done 23 | 24 | -------------------------------------------------------------------------------- /test_ace.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright © Niantic, Inc. 2022. 3 | 4 | import argparse 5 | import logging 6 | import math 7 | import time 8 | from distutils.util import strtobool 9 | from pathlib import Path 10 | 11 | import cv2 12 | import numpy as np 13 | import torch 14 | from torch.cuda.amp import autocast 15 | from torch.utils.data import DataLoader 16 | 17 | import dsacstar 18 | from ace_network import Regressor 19 | from dataset import CamLocDataset 20 | 21 | import ace_vis_util as vutil 22 | from ace_visualizer import ACEVisualizer 23 | 24 | _logger = logging.getLogger(__name__) 25 | 26 | 27 | def _strtobool(x): 28 | return bool(strtobool(x)) 29 | 30 | 31 | if __name__ == '__main__': 32 | # Setup logging. 33 | logging.basicConfig(level=logging.INFO) 34 | 35 | parser = argparse.ArgumentParser( 36 | description='Test a trained network on a specific scene.', 37 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 38 | 39 | parser.add_argument('scene', type=Path, 40 | help='path to a scene in the dataset folder, e.g. "datasets/Cambridge_GreatCourt"') 41 | 42 | parser.add_argument('network', type=Path, help='path to a network trained for the scene (just the head weights)') 43 | 44 | parser.add_argument('--encoder_path', type=Path, default=Path(__file__).parent / "ace_encoder_pretrained.pt", 45 | help='file containing pre-trained encoder weights') 46 | 47 | parser.add_argument('--session', '-sid', default='', 48 | help='custom session name appended to output files, ' 49 | 'useful to separate different runs of a script') 50 | 51 | parser.add_argument('--image_resolution', type=int, default=480, help='base image resolution') 52 | 53 | # ACE is RGB-only, no need for this param. 54 | # parser.add_argument('--mode', '-m', type=int, default=1, choices=[1, 2], help='test mode: 1 = RGB, 2 = RGB-D') 55 | 56 | # DSACStar RANSAC parameters. ACE Keeps them at default. 57 | parser.add_argument('--hypotheses', '-hyps', type=int, default=64, 58 | help='number of hypotheses, i.e. 
number of RANSAC iterations') 59 | 60 | parser.add_argument('--threshold', '-t', type=float, default=10, 61 | help='inlier threshold in pixels (RGB) or centimeters (RGB-D)') 62 | 63 | parser.add_argument('--inlieralpha', '-ia', type=float, default=100, 64 | help='alpha parameter of the soft inlier count; controls the softness of the ' 65 | 'hypotheses score distribution; lower means softer') 66 | 67 | parser.add_argument('--maxpixelerror', '-maxerrr', type=float, default=100, 68 | help='maximum reprojection (RGB, in px) or 3D distance (RGB-D, in cm) error when checking ' 69 | 'pose consistency towards all measurements; error is clamped to this value for stability') 70 | 71 | # Params for the visualization. If enabled, it will slow down relocalisation considerably. But you get a nice video :) 72 | parser.add_argument('--render_visualization', type=_strtobool, default=False, 73 | help='create a video of the mapping process') 74 | 75 | parser.add_argument('--render_target_path', type=Path, default='renderings', 76 | help='target folder for renderings, visualizer will create a subfolder with the map name') 77 | 78 | parser.add_argument('--render_flipped_portrait', type=_strtobool, default=False, 79 | help='flag for wayspots dataset where images are sideways portrait') 80 | 81 | parser.add_argument('--render_sparse_queries', type=_strtobool, default=False, 82 | help='set to true if your queries are not a smooth video') 83 | 84 | parser.add_argument('--render_pose_error_threshold', type=int, default=20, 85 | help='pose error threshold for the visualisation in cm/deg') 86 | 87 | parser.add_argument('--render_map_depth_filter', type=int, default=10, 88 | help='to clean up the ACE point cloud remove points too far away') 89 | 90 | parser.add_argument('--render_camera_z_offset', type=int, default=4, 91 | help='zoom out of the scene by moving render camera backwards, in meters') 92 | 93 | parser.add_argument('--render_frame_skip', type=int, default=1, 94 | help='skip every xth frame for long and dense query sequences') 95 | 96 | opt = parser.parse_args() 97 | 98 | device = torch.device("cuda") 99 | num_workers = 6 100 | 101 | scene_path = Path(opt.scene) 102 | head_network_path = Path(opt.network) 103 | encoder_path = Path(opt.encoder_path) 104 | session = opt.session 105 | 106 | # Setup dataset. 107 | testset = CamLocDataset( 108 | scene_path / "test", 109 | mode=0, # Default for ACE, we don't need scene coordinates/RGB-D. 110 | image_height=opt.image_resolution, 111 | ) 112 | _logger.info(f'Test images found: {len(testset)}') 113 | 114 | # Setup dataloader. Batch size 1 by default. 115 | testset_loader = DataLoader(testset, shuffle=False, num_workers=6) 116 | 117 | # Load network weights. 118 | encoder_state_dict = torch.load(encoder_path, map_location="cpu") 119 | _logger.info(f"Loaded encoder from: {encoder_path}") 120 | head_state_dict = torch.load(head_network_path, map_location="cpu") 121 | _logger.info(f"Loaded head weights from: {head_network_path}") 122 | 123 | # Create regressor. 124 | network = Regressor.create_from_split_state_dict(encoder_state_dict, head_state_dict) 125 | 126 | # Setup for evaluation. 127 | network = network.to(device) 128 | network.eval() 129 | 130 | # Save the outputs in the same folder as the network being evaluated. 131 | output_dir = head_network_path.parent 132 | scene_name = scene_path.name 133 | # This will contain aggregate scene stats (median translation/rotation errors, and avg processing time per frame). 
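# Note on naming: the session suffix matters for the ensemble workflow, where train_cambridge_ensemble.sh
# passes a per-cluster --session and merge_ensemble_results.py later globs the resulting
# poses_*<session>.txt files to keep the best pose per frame.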
134 | test_log_file = output_dir / f'test_{scene_name}_{opt.session}.txt' 135 | _logger.info(f"Saving test aggregate statistics to: {test_log_file}") 136 | # This will contain each frame's pose (stored as quaternion + translation) and errors. 137 | pose_log_file = output_dir / f'poses_{scene_name}_{opt.session}.txt' 138 | _logger.info(f"Saving per-frame poses and errors to: {pose_log_file}") 139 | 140 | # Setup output files. 141 | test_log = open(test_log_file, 'w', 1) 142 | pose_log = open(pose_log_file, 'w', 1) 143 | 144 | # Metrics of interest. 145 | avg_batch_time = 0 146 | num_batches = 0 147 | 148 | # Keep track of rotation and translation errors for calculation of the median error. 149 | rErrs = [] 150 | tErrs = [] 151 | 152 | # Percentage of frames predicted within certain thresholds from their GT pose. 153 | pct10_5 = 0 154 | pct5 = 0 155 | pct2 = 0 156 | pct1 = 0 157 | 158 | # Generate video of training process 159 | if opt.render_visualization: 160 | # infer rendering folder from map file name 161 | target_path = vutil.get_rendering_target_path( 162 | opt.render_target_path, 163 | opt.network) 164 | ace_visualizer = ACEVisualizer(target_path, 165 | opt.render_flipped_portrait, 166 | opt.render_map_depth_filter, 167 | reloc_vis_error_threshold=opt.render_pose_error_threshold) 168 | 169 | # we need to pass the training set in case the visualiser has to regenerate the map point cloud 170 | trainset = CamLocDataset( 171 | scene_path / "train", 172 | mode=0, # Default for ACE, we don't need scene coordinates/RGB-D. 173 | image_height=opt.image_resolution, 174 | ) 175 | 176 | # Setup dataloader. Batch size 1 by default. 177 | trainset_loader = DataLoader(trainset, shuffle=False, num_workers=6) 178 | 179 | ace_visualizer.setup_reloc_visualisation( 180 | frame_count=len(testset), 181 | data_loader=trainset_loader, 182 | network=network, 183 | camera_z_offset=opt.render_camera_z_offset, 184 | reloc_frame_skip=opt.render_frame_skip) 185 | else: 186 | ace_visualizer = None 187 | 188 | # Testing loop. 189 | testing_start_time = time.time() 190 | with torch.no_grad(): 191 | for image_B1HW, _, gt_pose_B44, _, intrinsics_B33, _, _, filenames in testset_loader: 192 | batch_start_time = time.time() 193 | batch_size = image_B1HW.shape[0] 194 | 195 | image_B1HW = image_B1HW.to(device, non_blocking=True) 196 | 197 | # Predict scene coordinates. 198 | with autocast(enabled=True): 199 | scene_coordinates_B3HW = network(image_B1HW) 200 | 201 | # We need them on the CPU to run RANSAC. 202 | scene_coordinates_B3HW = scene_coordinates_B3HW.float().cpu() 203 | 204 | # Each frame is processed independently. 205 | for frame_idx, (scene_coordinates_3HW, gt_pose_44, intrinsics_33, frame_path) in enumerate( 206 | zip(scene_coordinates_B3HW, gt_pose_B44, intrinsics_B33, filenames)): 207 | 208 | # Extract focal length and principal point from the intrinsics matrix. 209 | focal_length = intrinsics_33[0, 0].item() 210 | ppX = intrinsics_33[0, 2].item() 211 | ppY = intrinsics_33[1, 2].item() 212 | # We support a single focal length. 213 | assert torch.allclose(intrinsics_33[0, 0], intrinsics_33[1, 1]) 214 | 215 | # Remove path from file name 216 | frame_name = Path(frame_path).name 217 | 218 | # Allocate output variable. 219 | out_pose = torch.zeros((4, 4)) 220 | 221 | # Compute the pose via RANSAC. 
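# dsacstar.forward_rgb samples and scores pose hypotheses from the predicted scene
# coordinates (given at the encoder's subsampled resolution, hence network.OUTPUT_SUBSAMPLE),
# fills out_pose in place with the winning pose, and returns its inlier count, which is
# logged per frame and later used by merge_ensemble_results.py to rank ensemble estimates.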
222 | inlier_count = dsacstar.forward_rgb( 223 | scene_coordinates_3HW.unsqueeze(0), 224 | out_pose, 225 | opt.hypotheses, 226 | opt.threshold, 227 | focal_length, 228 | ppX, 229 | ppY, 230 | opt.inlieralpha, 231 | opt.maxpixelerror, 232 | network.OUTPUT_SUBSAMPLE, 233 | ) 234 | 235 | # Calculate translation error. 236 | t_err = float(torch.norm(gt_pose_44[0:3, 3] - out_pose[0:3, 3])) 237 | 238 | # Rotation error. 239 | gt_R = gt_pose_44[0:3, 0:3].numpy() 240 | out_R = out_pose[0:3, 0:3].numpy() 241 | 242 | r_err = np.matmul(out_R, np.transpose(gt_R)) 243 | # Compute angle-axis representation. 244 | r_err = cv2.Rodrigues(r_err)[0] 245 | # Extract the angle. 246 | r_err = np.linalg.norm(r_err) * 180 / math.pi 247 | 248 | _logger.info(f"Rotation Error: {r_err:.2f}deg, Translation Error: {t_err * 100:.1f}cm") 249 | 250 | if ace_visualizer is not None: 251 | ace_visualizer.render_reloc_frame( 252 | query_pose=gt_pose_44.numpy(), 253 | query_file=frame_path, 254 | est_pose=out_pose.numpy(), 255 | est_error=max(r_err, t_err*100), 256 | sparse_query=opt.render_sparse_queries) 257 | 258 | # Save the errors. 259 | rErrs.append(r_err) 260 | tErrs.append(t_err * 100) 261 | 262 | # Check various thresholds. 263 | if r_err < 5 and t_err < 0.1: # 10cm/5deg 264 | pct10_5 += 1 265 | if r_err < 5 and t_err < 0.05: # 5cm/5deg 266 | pct5 += 1 267 | if r_err < 2 and t_err < 0.02: # 2cm/2deg 268 | pct2 += 1 269 | if r_err < 1 and t_err < 0.01: # 1cm/1deg 270 | pct1 += 1 271 | 272 | # Write estimated pose to pose file (inverse). 273 | out_pose = out_pose.inverse() 274 | 275 | # Translation. 276 | t = out_pose[0:3, 3] 277 | 278 | # Rotation to axis angle. 279 | rot, _ = cv2.Rodrigues(out_pose[0:3, 0:3].numpy()) 280 | angle = np.linalg.norm(rot) 281 | axis = rot / angle 282 | 283 | # Axis angle to quaternion. 284 | q_w = math.cos(angle * 0.5) 285 | q_xyz = math.sin(angle * 0.5) * axis 286 | 287 | # Write to output file. All in a single line. 288 | pose_log.write(f"{frame_name} " 289 | f"{q_w} {q_xyz[0].item()} {q_xyz[1].item()} {q_xyz[2].item()} " 290 | f"{t[0]} {t[1]} {t[2]} " 291 | f"{r_err} {t_err} {inlier_count}\n") 292 | 293 | avg_batch_time += time.time() - batch_start_time 294 | num_batches += 1 295 | 296 | total_frames = len(rErrs) 297 | assert total_frames == len(testset) 298 | 299 | # Compute median errors. 300 | tErrs.sort() 301 | rErrs.sort() 302 | median_idx = total_frames // 2 303 | median_rErr = rErrs[median_idx] 304 | median_tErr = tErrs[median_idx] 305 | 306 | # Compute average time. 307 | avg_time = avg_batch_time / num_batches 308 | 309 | # Compute final metrics. 310 | pct10_5 = pct10_5 / total_frames * 100 311 | pct5 = pct5 / total_frames * 100 312 | pct2 = pct2 / total_frames * 100 313 | pct1 = pct1 / total_frames * 100 314 | 315 | _logger.info("===================================================") 316 | _logger.info("Test complete.") 317 | 318 | _logger.info('Accuracy:') 319 | _logger.info(f'\t10cm/5deg: {pct10_5:.1f}%') 320 | _logger.info(f'\t5cm/5deg: {pct5:.1f}%') 321 | _logger.info(f'\t2cm/2deg: {pct2:.1f}%') 322 | _logger.info(f'\t1cm/1deg: {pct1:.1f}%') 323 | 324 | _logger.info(f"Median Error: {median_rErr:.1f}deg, {median_tErr:.1f}cm") 325 | _logger.info(f"Avg. processing time: {avg_time * 1000:4.1f}ms") 326 | 327 | # Write to the test log file as well. 
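# Single whitespace-separated line: median rotation error [deg], median translation error [cm], average processing time per frame [s].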
328 | test_log.write(f"{median_rErr} {median_tErr} {avg_time}\n") 329 | 330 | test_log.close() 331 | pose_log.close() 332 | -------------------------------------------------------------------------------- /train_ace.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright © Niantic, Inc. 2022. 3 | 4 | import argparse 5 | import logging 6 | from distutils.util import strtobool 7 | from pathlib import Path 8 | 9 | from ace_trainer import TrainerACE 10 | 11 | 12 | def _strtobool(x): 13 | return bool(strtobool(x)) 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | # Setup logging levels. 19 | logging.basicConfig(level=logging.INFO) 20 | 21 | parser = argparse.ArgumentParser( 22 | description='Fast training of a scene coordinate regression network.', 23 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 24 | 25 | parser.add_argument('scene', type=Path, 26 | help='path to a scene in the dataset folder, e.g. "datasets/Cambridge_GreatCourt"') 27 | 28 | parser.add_argument('output_map_file', type=Path, 29 | help='target file for the trained network') 30 | 31 | parser.add_argument('--encoder_path', type=Path, default=Path(__file__).parent / "ace_encoder_pretrained.pt", 32 | help='file containing pre-trained encoder weights') 33 | 34 | parser.add_argument('--num_head_blocks', type=int, default=1, 35 | help='depth of the regression head, defines the map size') 36 | 37 | parser.add_argument('--learning_rate_min', type=float, default=0.0005, 38 | help='lowest learning rate of 1 cycle scheduler') 39 | 40 | parser.add_argument('--learning_rate_max', type=float, default=0.005, 41 | help='highest learning rate of 1 cycle scheduler') 42 | 43 | parser.add_argument('--training_buffer_size', type=int, default=8000000, 44 | help='number of patches in the training buffer') 45 | 46 | parser.add_argument('--samples_per_image', type=int, default=1024, 47 | help='number of patches drawn from each image when creating the buffer') 48 | 49 | parser.add_argument('--batch_size', type=int, default=5120, 50 | help='number of patches for each parameter update (has to be a multiple of 512)') 51 | 52 | parser.add_argument('--epochs', type=int, default=16, 53 | help='number of runs through the training buffer') 54 | 55 | parser.add_argument('--repro_loss_hard_clamp', type=int, default=1000, 56 | help='hard clamping threshold for the reprojection losses') 57 | 58 | parser.add_argument('--repro_loss_soft_clamp', type=int, default=50, 59 | help='soft clamping threshold for the reprojection losses') 60 | 61 | parser.add_argument('--repro_loss_soft_clamp_min', type=int, default=1, 62 | help='minimum value of the soft clamping threshold when using a schedule') 63 | 64 | parser.add_argument('--use_half', type=_strtobool, default=True, 65 | help='train with half precision') 66 | 67 | parser.add_argument('--use_homogeneous', type=_strtobool, default=True, 68 | help='use a homogeneous representation for the predicted scene coordinates') 69 | 70 | parser.add_argument('--use_aug', type=_strtobool, default=True, 71 | help='Use any augmentation.') 72 | 73 | parser.add_argument('--aug_rotation', type=int, default=15, 74 | help='max inplane rotation angle') 75 | 76 | parser.add_argument('--aug_scale', type=float, default=1.5, 77 | help='max scale factor') 78 | 79 | parser.add_argument('--image_resolution', type=int, default=480, 80 | help='base image resolution') 81 | 82 | parser.add_argument('--repro_loss_type', type=str, default="dyntanh", 83 | choices=["l1", "l1+sqrt", "l1+log", "tanh", "dyntanh"], 84 |
help='Loss function on the reprojection error. Dyn varies the soft clamping threshold') 85 | 86 | parser.add_argument('--repro_loss_schedule', type=str, default="circle", choices=['circle', 'linear'], 87 | help='How to decrease the softclamp threshold during training, circle is slower first') 88 | 89 | parser.add_argument('--depth_min', type=float, default=0.1, 90 | help='enforce minimum depth of network predictions') 91 | 92 | parser.add_argument('--depth_target', type=float, default=10, 93 | help='default depth to regularize training') 94 | 95 | parser.add_argument('--depth_max', type=float, default=1000, 96 | help='enforce maximum depth of network predictions') 97 | 98 | # Clustering params, for the ensemble training used in the Cambridge experiments. Disabled by default. 99 | parser.add_argument('--num_clusters', type=int, default=None, 100 | help='split the training sequence in this number of clusters. disabled by default') 101 | 102 | parser.add_argument('--cluster_idx', type=int, default=None, 103 | help='train on images part of this cluster. required only if --num_clusters is set.') 104 | 105 | # Params for the visualization. If enabled, it will slow down training considerably. But you get a nice video :) 106 | parser.add_argument('--render_visualization', type=_strtobool, default=False, 107 | help='create a video of the mapping process') 108 | 109 | parser.add_argument('--render_target_path', type=Path, default='renderings', 110 | help='target folder for renderings, visualizer will create a subfolder with the map name') 111 | 112 | parser.add_argument('--render_flipped_portrait', type=_strtobool, default=False, 113 | help='flag for wayspots dataset where images are sideways portrait') 114 | 115 | parser.add_argument('--render_map_error_threshold', type=int, default=10, 116 | help='reprojection error threshold for the visualisation in px') 117 | 118 | parser.add_argument('--render_map_depth_filter', type=int, default=10, 119 | help='to clean up the ACE point cloud remove points too far away') 120 | 121 | parser.add_argument('--render_camera_z_offset', type=int, default=4, 122 | help='zoom out of the scene by moving render camera backwards, in meters') 123 | 124 | options = parser.parse_args() 125 | 126 | trainer = TrainerACE(options) 127 | trainer.train() 128 | --------------------------------------------------------------------------------
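Note: the per-frame pose logs written by test_ace.py are what merge_ensemble_results.py and eval_poses.py consume. As a minimal illustrative sketch (not part of the repository), and assuming the w-first quaternion and inverted-pose convention used when the log is written, one such line can be converted back into a 4x4 camera-to-world pose as follows:

import numpy as np

def pose_from_log_line(line):
    """Parse 'frame q_w q_x q_y q_z t_x t_y t_z r_err t_err inliers' into (frame, 4x4 pose)."""
    tokens = line.split()
    frame_name = tokens[0]
    qw, qx, qy, qz = (float(v) for v in tokens[1:5])
    tx, ty, tz = (float(v) for v in tokens[5:8])

    # Unit quaternion (w, x, y, z) to rotation matrix.
    R = np.array([
        [1 - 2 * (qy * qy + qz * qz), 2 * (qx * qy - qw * qz), 2 * (qx * qz + qw * qy)],
        [2 * (qx * qy + qw * qz), 1 - 2 * (qx * qx + qz * qz), 2 * (qy * qz - qw * qx)],
        [2 * (qx * qz - qw * qy), 2 * (qy * qz + qw * qx), 1 - 2 * (qx * qx + qy * qy)],
    ])

    # The log stores the inverted pose; invert again to get camera-to-world,
    # mirroring what eval_poses.py does with np.linalg.inv.
    pose_inv = np.eye(4)
    pose_inv[:3, :3] = R
    pose_inv[:3, 3] = [tx, ty, tz]
    return frame_name, np.linalg.inv(pose_inv)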