├── .github └── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE ├── LICENSE-CODE ├── README.md ├── SECURITY.md ├── media ├── New.png ├── indoor6_sfm.png ├── pretrained_models_results.png ├── run_inference_screenshot.png ├── scene6_landmark0_sample.jpg ├── table_new.png ├── table_newscenes.png ├── table_old.png ├── teaser_wide.png └── video_figure.png ├── paper ├── DoEtalCVPR2022.pdf └── DoSinha3DV2024.pdf └── src ├── dataloader └── indoor6.py ├── inference.py ├── local_inference.py ├── local_training.py ├── main.py ├── models ├── blocks.py ├── conv2d_layers.py └── efficientlitesld.py ├── pretrained_efficientnetlite0.net ├── requirements.txt ├── run_inference.py ├── run_training.py ├── train.py └── utils ├── generate_visibility_depth_normal.py ├── heatmap.py ├── landmark_selection.py ├── merge_landmark_files.py ├── pnp.py ├── read_write_models.py └── select_additional_landmarks.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These code reviewers should be added by default. 2 | * @snsinha @omiksik @tien-d 3 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 
30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. 
Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 
168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. 
If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. 
automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 
394 | 395 | Creative Commons may be contacted at creativecommons.org. -------------------------------------------------------------------------------- /LICENSE-CODE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scene Landmark Detection for Camera Localization 2 | 3 | ## Introduction 4 | 5 | ![teaser](media/teaser_wide.png) 6 | We have devised a new method to detect scene-specific _scene landmarks_ for localizing a camera within a pre-mapped scene. Our method is privacy-preserving, has low storage requirements and achieves high accuracy. **[Left]** Scene landmarks detected in a query image. **[Middle]** A CNN-based heatmap prediction architecture is trained. **[Right]** The 3D scene 7 | landmarks (_in red_) and the estimated camera pose (_in blue_) are shown overlaid over the 3D point cloud (_in gray_). The 3D point 8 | cloud is shown only for visualization. It is not actually used for camera localization. 9 | 10 | --- 11 | 12 | ## Papers 13 | **Improved Scene Landmark Detection for Camera Localization**![new](media/New.png) 14 | Tien Do and Sudipta N. Sinha 15 | International Conference on 3D Vision (**3DV**), 2024 16 | [pdf](paper/DoSinha3DV2024.pdf) 17 | 18 | **Learning to Detect Scene Landmarks for Camera Localization** 19 | Tien Do, Ondrej Miksik, Joseph DeGol, Hyun Soo Park, and Sudipta N. 
Sinha
20 | IEEE/CVF Conference on Computer Vision and Pattern Recognition (**CVPR**), 2022
21 | [pdf](paper/DoEtalCVPR2022.pdf)   [video](https://www.youtube.com/watch?v=HM2yLCLz5nY)
22 |
23 | **Indoor6 Dataset**
24 | [download](https://drive.google.com/drive/folders/1w7Adnd6MXmNOacT072JnQ6emHUeLrD71?usp=drive_link)
25 |
26 | ## Bibtex
27 | If you find our work to be useful in your research, please consider citing our papers:
28 | ```
29 | @InProceedings{Do_Sinha_2024_ImprovedSceneLandmarkLoc,
30 | author = {Do, Tien and Sinha, Sudipta N.},
31 | title = {Improved Scene Landmark Detection for Camera Localization},
32 | booktitle = {Proceedings of the International Conference on 3D Vision (3DV)},
33 | month = {March},
34 | year = {2024}
35 | }
36 |
37 | @InProceedings{Do_2022_SceneLandmarkLoc,
38 | author = {Do, Tien and Miksik, Ondrej and DeGol, Joseph and Park, Hyun Soo and Sinha, Sudipta N.},
39 | title = {Learning to Detect Scene Landmarks for Camera Localization},
40 | booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
41 | month = {June},
42 | year = {2022}
43 | }
44 | ```
45 |
46 | # Indoor-6 Dataset
47 |
48 | The Indoor-6 dataset was created from multiple sessions captured in six indoor scenes over multiple days. The pseudo
49 | ground truth (pGT) 3D point clouds and camera poses for each scene are computed using [COLMAP](https://colmap.github.io/). The training data uses only the COLMAP reconstruction computed from the training images. The figure below
50 | shows the camera poses (in red) and point clouds (in gray), and for each scene, the number of videos and images in the
51 | training and test splits, respectively. Compared to [7-scenes](https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/), the scenes in Indoor-6 are larger, have multiple rooms, and
52 | contain illumination variations, as the images span multiple days and different times of day.
53 |
54 | ![indoor6_sfm](media/indoor6_sfm.png)
55 | Indoor-6 dataset SfM reconstructions. Train/val/test splits and download URLs per scene are listed below:
56 | * [scene1](https://drive.google.com/file/d/1AJhPh9nnZO0HJyxuXXZdtKtA7kFRi3LQ/view?usp=drive_link) (6289/798/799 images)
57 | * scene2 (3021/283/284 images)
58 | * [scene2a](https://drive.google.com/file/d/1DgTQ7fflZJ7DdbHDRZF-6gXdB_vJF7fY/view?usp=drive_link) (4890/256/257 images)
59 | * [scene3](https://drive.google.com/file/d/12aER7rQkvGS_DPeugTHo_Ma_Fi7JuflS/view?usp=drive_link) (4181/313/315 images)
60 | * scene4 (1942/272/272 images)
61 | * [scene4a](https://drive.google.com/file/d/1gibneq5ixZ0lmeNAYTmY4Mh8a244T2nl/view?usp=drive_link) (2285/158/158 images)
62 | * [scene5](https://drive.google.com/file/d/18wHn_69-eV22N4I8R0rWQkcSQ3EtCYMX/view?usp=drive_link) (4946/512/424 images)
63 | * [scene6](https://drive.google.com/file/d/1mZYnoKo37KXRjREK5CKs5IzDox2G3Prt/view?usp=drive_link) (1761/322/323 images)
64 | * [colmap](https://drive.google.com/file/d/1oMo552DYo2U5Fvjm5MrTYPMqpMjXEf7m/view?usp=drive_link) (COLMAP reconstructions for all scenes)
65 |
66 | **Note**: We added two new scenes (`scene2a` and `scene4a`) to the Indoor-6 dataset after our CVPR 2022 paper was published, because we were unable to release `scene2` and `scene4` from the original dataset for privacy reasons.
67 | The two new scenes are included as replacements. Please refer to our 3DV 2024 paper for a quantitative evaluation of our method and several baselines on the latest version of the dataset.
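Once a scene has been downloaded, its splits can be loaded with the `Indoor6` dataset class included in this repository (`src/dataloader/indoor6.py`). The snippet below is only a minimal sketch: it assumes the dataset has been placed under `../../data/indoor6` as described in the Layout section below, that the `landmarks-1000v10` and `visibility-1000v10_depth_normal` files referenced in the Training Models section are present in the scene folder, and that it is run from the `src` directory.
```
from dataloader.indoor6 import Indoor6

# Load the test split of one scene (mode can be 'train', 'val', 'test', or 'all').
dataset = Indoor6(root_folder='../../data/indoor6',
                  scene_id='scene6',
                  mode='test',
                  landmark_config='landmarks/landmarks-1000v10',
                  visibility_config='landmarks/visibility-1000v10_depth_normal')

sample = dataset[0]
print(sample['image'].shape)       # resized RGB image tensor
print(sample['pose_gt'].shape)     # 4x4 camera-from-world pose (pGT)
print(sample['landmark2d'].shape)  # 2xL projected landmark coordinates
print(sample['visibility'].shape)  # per-landmark visibility weights
```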
68 |
69 | # Source code
70 | The repository contains all the source code for our project. The most recent version can be found in the `3dv24` git branch (which is now the default branch of the repository). The best-performing pretrained models for `SLD-star`, as proposed in our 3DV 2024 paper, are also available (see below). `SLD-star` significantly outperforms the `SLD+NBE` approach proposed in our CVPR 2022 paper. The source code for the `SLD+NBE` method is no longer maintained. The older version of the code (pre 3DV 2024) can be found in the `main` branch.
71 |
72 | ## Environment Setup
73 | ```
74 | pip install -r requirements.txt
75 | ```
76 |
77 | * Python 3.9.13 on Windows 11.
78 | * CUDA version: release 11.8 (V11.8.89)
79 | * PyTorch version: 2.1.0+cu118
80 |
81 | For development purposes, training has been tested on both CUDA and CPU, on Linux and Windows, as well as with the experimental PyTorch Metal Performance Shaders (MPS) backend on macOS (see below).
82 |
83 | By default, the code selects hardware acceleration for your device, if available.
84 |
85 | ### Experimental macOS Metal Performance Shaders (MPS)
86 |
87 | To enable the MPS backend, make sure you are running Apple Silicon compatible hardware and follow [these instructions](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) to install the latest nightly build of PyTorch instead.
88 |
89 | _NOTE_: MPS supports a maximum precision of FP32.
90 |
91 | ## Layout
92 |
93 | The source code expects the following directory structure (placed in your home directory).
94 |
95 | ```
96 | └── data
97 | | └── outputs
98 | | └── checkpoints
99 | | └── indoor6
100 | | └── scene1
101 | | └── scene2a
102 | | └── scene3
103 | | └── scene4a
104 | | └── scene5
105 | | └── scene6
106 | └── SceneLandmarkLocalization
107 | └── src
108 | └── README.md (this)
109 | ```
110 |
111 | * Download the Indoor-6 dataset and place the contents in the `/data/indoor6/` folder, as indicated above.
112 | * Download the pretrained models for `SLD-star` from our 3DV 2024 paper (see below) and place them in the `/data/checkpoints` folder, as indicated above.
113 | [pretrained models](https://drive.google.com/file/d/1s8bUgAuy2LX4QMcKE8yKz6JRyhL3JgxZ/view?usp=drive_link)
114 |
115 | * Clone this repo into `/SceneLandmarkLocalization`.
116 | * Finally, create the folder `/data/outputs` for storing trained models and other files created when training your own models.
117 |
118 | ## Running Inference using Pre-trained Models
119 |
120 | Instructions to test the `SLD-star` models from our 3DV 2024 paper are listed below.
121 |
122 | **Step 1.** First, verify the contents of the checkpoints folder. You should see the following files and directories.
123 | ```
124 | └── data
125 | └── checkpoints
126 | └── scene1_1000-125_v10
127 | └── scene1_1000-125_v10.txt
128 | └── scene2a_1000-125_v10
129 | └── scene2a_1000-125_v10.txt
130 | └── scene3_1000-125_v10
131 | └── scene3_1000-125_v10.txt
132 | └── scene4a_1000-125_v10
133 | └── scene4a_1000-125_v10.txt
134 | └── scene5_1000-125_v10
135 | └── scene5_1000-125_v10.txt
136 | └── scene6_1000-125_v10
137 | └── scene6_1000-125_v10.txt
138 | ```
139 |
140 | **Step 2.** For `1000-125_v10`, each scene has eight model checkpoints. For example, `scene6` has these files.
141 | ```
142 | └── scene6_1000-125_v10
143 | └── scene6-000-125
144 | └── model-best_median.ckpt
145 | └── scene6-125-250
146 | └── model-best_median.ckpt
147 | └── scene6-250-375
148 | └── model-best_median.ckpt
149 | └── scene6-375-500
150 | └── model-best_median.ckpt
151 | └── scene6-500-625
152 | └── model-best_median.ckpt
153 | └── scene6-625-750
154 | └── model-best_median.ckpt
155 | └── scene6-750-875
156 | └── model-best_median.ckpt
157 | └── scene6-875-1000
158 | └── model-best_median.ckpt
159 | ```
160 |
161 | **Step 3.** Each experiment file for the `1000-125_v10` experiment, e.g. `scene6_1000-125_v10.txt`, contains eight lines, one for each model checkpoint (or landmark subset). Each line contains various attributes for the associated model.
162 |
163 | **Step 4.** Check the Python script `/SceneLandmarkLocalization/src/run_inference.py`. The relative paths hardcoded in the variables `checkpoint_dir` and `dataset_dir` both assume the directory layout described earlier. The variable `experiment` is set to `1000-125_v10`, which corresponds to the `SLD-star` model trained for 1000 landmarks partitioned into eight subsets, each with 125 landmarks. The suffix `v10` is a tag used to keep track of the experiment and the generated model checkpoints.
164 |
165 | **Step 5.** Now, run the following script.
166 | ```
167 | cd SceneLandmarkLocalization/src
168 | python run_inference.py
169 | ```
170 |
171 | **Step 6.** When the script finishes running, the following text will be displayed on the console. The final accuracy (5cm/5deg recall) in percent is printed along with the mean inference speed.
172 | ![run_inference_screenshot](media/run_inference_screenshot.png)
173 |
174 | **Step 7.** The metrics are also written to the file `/data/checkpoints/RESULTS-1000-125_v10.txt`. Note that `1000-125_v10` is the experiment name specified in the `run_inference.py` script.
175 |
176 | ## Training Models
177 |
178 | We now discuss how to train an `SLD-star` model ensemble.
179 | As proposed in our 3DV 2024 paper, the model ensemble is a set of models that share the same architecture (derived from an EfficientNet backbone) but have independent sets of model parameters.
180 | Each model (or network) in the ensemble is trained on a different subset of scene landmarks.
181 | In our implementation, we define the subsets by considering the ordered list of all the scene landmarks and partitioning that list into blocks of fixed size. For convenience, we choose block sizes that exactly divide the total number of landmarks, so that all the subsets have the same size.
182 |
183 | For example, given 1000 scene landmarks and a block size of 125, we obtain eight subsets. The first subset consists of the landmarks with indices in the range `[0, 125)` of the ordered list (i.e., indices 0 through 124).
184 | The second subset contains the landmarks with indices in the range `[125, 250)`, and so on.
185 |
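The same partitioning can be written out in a few lines of Python. This is only an illustrative sketch; in practice the subset is selected inside the training code via the `num_landmarks`, `block_size`, and `subset_index` settings described in the next section.
```
num_landmarks = 1000
block_size = 125

# Index ranges covered by each network in the ensemble.
subsets = [range(start, start + block_size)
           for start in range(0, num_landmarks, block_size)]

# subsets[0] covers indices 0-124, subsets[1] covers 125-249, etc.
# A single network is trained on subsets[subset_index].
```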
186 | We will now discuss how to run the training code.
187 |
188 |
189 | **Step 1.** Run the training script.
190 |
191 | To train a single model in the ensemble (for a specific scene), you might need to edit certain variables and modify the default values hardcoded in the `SceneLandmarkLocalization/src/run_training.py` script.
192 | Then, just run it as follows.
193 | ```
194 | cd SceneLandmarkLocalization/src
195 | python run_training.py
196 | ```
197 |
198 | **Step 2.** Edit the script and modify the parameter values.
199 |
200 | The important hyperparameters and settings that might need to be modified are as follows.
201 |
202 | 1. ***Paths:*** The default values for the dataset path and output path are as follows (based on the assumed directory structure). However, these can be modified as needed.
203 | ```
204 | dataset_dir = '../../data/indoor6'
205 | output_dir = '../../data/outputs'
206 | ```
207 |
208 | 2. ***Scene ID and landmarks:*** The names of the landmark and visibility files.
209 | ```
210 | scene_name = 'scene6'
211 | landmark_config = 'landmarks/landmarks-1000v10'
212 | visibility_config = 'landmarks/visibility-1000v10_depth_normal'
213 | ```
214 |
215 | 3. ***Ensemble configuration:*** The number of landmarks and the block size of the ensemble. `subset_index` indicates which network within the ensemble will be trained. In the following example, the value `0` indicates that the model will be trained for the landmarks in the index range `[0, 125)`. For this `1000-125` ensemble, you will need to change `subset_index` to `1, 2, ..., 7` to train all eight networks.
216 |
217 | ```
218 | num_landmarks = 1000
219 | block_size = 125
220 | subset_index = 0
221 | ```
222 |
223 | 4. ***Version No.:*** A string tag which is appended to the generated model names and experiment files. This helps us avoid name clashes when training and testing multiple sets of models.
224 |
225 | **Step 3.** When training completes, check the output directory; you should see a directory that contains the model checkpoints for the specified scene. There will also be an experiment text file with the same name.
226 | Inside the scene directory are sub-directories, one for each network in the ensemble. For example, the subdirectories for the `1000-125` ensemble for `scene6` will be named `scene6-000-125`, `scene6-125-250`, and so on.
227 | Look inside these subdirectories for the model checkpoint file `model-best_median.ckpt`.
228 |
229 | # Contributing
230 |
231 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
232 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
233 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
234 |
235 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
236 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
237 | provided by the bot. You will only need to do this once across all repos using our CLA.
238 |
239 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
240 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
241 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
242 | 243 | # Legal Notices 244 | 245 | Microsoft and any contributors grant you a license to the Microsoft documentation and other content 246 | in this repository under the [Creative Commons Attribution 4.0 International Public License](https://creativecommons.org/licenses/by/4.0/legalcode), 247 | see the [LICENSE](LICENSE) file, and grant you a license to any code in the repository under the [MIT License](https://opensource.org/licenses/MIT), see the 248 | [LICENSE-CODE](LICENSE-CODE) file. 249 | 250 | Microsoft, Windows, Microsoft Azure and/or other Microsoft products and services referenced in the documentation 251 | may be either trademarks or registered trademarks of Microsoft in the United States and/or other countries. 252 | The licenses for this project do not grant you rights to use any Microsoft names, logos, or trademarks. 253 | Microsoft's general trademark guidelines can be found at http://go.microsoft.com/fwlink/?LinkID=254653. 254 | 255 | Privacy information can be found at https://privacy.microsoft.com/en-us/ 256 | 257 | Microsoft and any contributors reserve all other rights, whether under their respective copyrights, patents, 258 | or trademarks, whether by implication, estoppel or otherwise. 259 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /media/New.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/media/New.png -------------------------------------------------------------------------------- /media/indoor6_sfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/media/indoor6_sfm.png -------------------------------------------------------------------------------- /media/pretrained_models_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/media/pretrained_models_results.png -------------------------------------------------------------------------------- /media/run_inference_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/media/run_inference_screenshot.png -------------------------------------------------------------------------------- /media/scene6_landmark0_sample.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/media/scene6_landmark0_sample.jpg -------------------------------------------------------------------------------- /media/table_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/media/table_new.png -------------------------------------------------------------------------------- /media/table_newscenes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/media/table_newscenes.png -------------------------------------------------------------------------------- /media/table_old.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/media/table_old.png -------------------------------------------------------------------------------- /media/teaser_wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/media/teaser_wide.png -------------------------------------------------------------------------------- /media/video_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/media/video_figure.png -------------------------------------------------------------------------------- /paper/DoEtalCVPR2022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/paper/DoEtalCVPR2022.pdf -------------------------------------------------------------------------------- /paper/DoSinha3DV2024.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/paper/DoSinha3DV2024.pdf -------------------------------------------------------------------------------- /src/dataloader/indoor6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import fnmatch 4 | import numpy as np 5 | import os 6 | import pickle 7 | from PIL import Image 8 | 9 | import sys 10 | sys.path.append('../utils') 11 | 12 | import torch 13 | from torch.utils.data.dataset import Dataset 14 | from torch.utils.data import DataLoader 15 | from torchvision import transforms 16 | 17 | from utils.pnp import Quaternion2Rotation 18 | 19 | np.random.seed(0) 20 | 21 | class Indoor6(Dataset): 22 | def __init__(self, root_folder="", 23 | scene_id='', mode='all', 24 | landmark_idx=[None], skip_image_index=1, 25 | input_image_downsample=1, gray_image_output=False, 26 | landmark_config='landmarks/landmarks-50', 27 | visibility_config='landmarks/visibility-50', 28 | use_precomputed_focal_length=False): 29 | super(Indoor6, self).__init__() 30 | 31 | self.to_tensor = transforms.ToTensor() 32 | 33 | self.image_folder = os.path.join(root_folder, 34 | scene_id, 35 | 'images') 36 | image_files_all = fnmatch.filter(os.listdir(self.image_folder), '*.color.jpg') 37 | image_files_all = sorted(image_files_all)[::skip_image_index] 38 | 39 | self.image_files = [] 40 | if mode == 'train': 41 | self.image_files = \ 42 | pickle.load(open('%s/%s/train_test_val.pkl' % (root_folder, scene_id), 'rb'))[ 43 | 'train'][::skip_image_index] 44 | self.image_indices = \ 45 | pickle.load(open('%s/%s/train_test_val.pkl' % (root_folder, scene_id), 'rb'))[ 46 | 'train_idx'][::skip_image_index] 47 | elif mode == 'test': 48 | self.image_files = \ 49 | pickle.load(open('%s/%s/train_test_val.pkl' % (root_folder, scene_id), 'rb'))[ 50 | 'test'][::skip_image_index] 51 | self.image_indices = \ 52 | pickle.load(open('%s/%s/train_test_val.pkl' % (root_folder, scene_id), 'rb'))[ 53 | 'test_idx'][::skip_image_index] 54 | elif mode == 'val': 55 | self.image_files = \ 56 | pickle.load(open('%s/%s/train_test_val.pkl' % (root_folder, scene_id), 'rb'))[ 57 | 'val'][::skip_image_index] 58 | 
self.image_indices = \ 59 | pickle.load(open('%s/%s/train_test_val.pkl' % (root_folder, scene_id), 'rb'))[ 60 | 'val_idx'][::skip_image_index] 61 | else: 62 | self.image_files = image_files_all 63 | self.image_indices = np.arange(0, len(image_files_all)) 64 | 65 | self.image_indices = np.asarray(self.image_indices) 66 | self.num_images = len(self.image_files) 67 | self.gray_image_output = gray_image_output 68 | self.mode = mode 69 | 70 | landmark_file = open(root_folder + '/' + scene_id 71 | + '/%s.txt' % landmark_config, 'r') 72 | num_landmark = int(landmark_file.readline()) 73 | self.landmark = [] 74 | for l in range(num_landmark): 75 | pl = landmark_file.readline().split() 76 | pl = np.array([float(pl[i]) for i in range(len(pl))]) 77 | self.landmark.append(pl) 78 | self.landmark = np.asarray(self.landmark)[:, 1:] 79 | 80 | self.image_downsampled = input_image_downsample 81 | 82 | visibility_file = root_folder + '/' + scene_id + '/%s.txt' % visibility_config 83 | self.visibility = np.loadtxt(visibility_file).astype(bool) 84 | 85 | if landmark_idx[0] != None: 86 | self.landmark = self.landmark[landmark_idx] 87 | self.visibility = self.visibility[landmark_idx] 88 | 89 | self.landmark = self.landmark.transpose() 90 | 91 | ## Precomputed fixed focal length 92 | self.precomputed_focal_length = None 93 | if use_precomputed_focal_length: 94 | PRECOMPUTED_FOCAL_LENGTH = {'scene1': 900, 'scene2a': 1100, 'scene3': 900, 'scene4a': 900, 'scene5': 900, 'scene6': 900} 95 | self.precomputed_focal_length = PRECOMPUTED_FOCAL_LENGTH[scene_id] 96 | 97 | 98 | def original_image_name(self, index): 99 | 100 | intrinsics = open(os.path.join(self.image_folder, 101 | self.image_files[index].replace('color.jpg', 'intrinsics.txt'))) 102 | intrinsics = intrinsics.readline().split() 103 | 104 | return intrinsics[6] 105 | 106 | 107 | def _modify_intrinsic(self, index, use_precomputed_focal_length=False): 108 | W = None 109 | H = None 110 | K = None 111 | K_inv = None 112 | 113 | while K_inv is None: 114 | try: 115 | intrinsics = open(os.path.join(self.image_folder, 116 | self.image_files[index].replace('color.jpg', 'intrinsics.txt'))) 117 | intrinsics = intrinsics.readline().split() 118 | 119 | W = int(intrinsics[0]) // (self.image_downsampled * 32) * 32 120 | H = int(intrinsics[1]) // (self.image_downsampled * 32) * 32 121 | 122 | scale_factor_x = W / float(intrinsics[0]) 123 | scale_factor_y = H / float(intrinsics[1]) 124 | 125 | if use_precomputed_focal_length: 126 | fx = self.precomputed_focal_length * scale_factor_x 127 | fy = self.precomputed_focal_length * scale_factor_y 128 | else: 129 | fx = float(intrinsics[2]) * scale_factor_x 130 | fy = float(intrinsics[2]) * scale_factor_y 131 | 132 | cx = float(intrinsics[3]) * scale_factor_x 133 | cy = float(intrinsics[4]) * scale_factor_y 134 | 135 | K = np.array([[fx, 0., cx], 136 | [0., fy, cy], 137 | [0., 0., 1.]], dtype=float) 138 | 139 | K_inv = np.linalg.inv(K) 140 | 141 | except(RuntimeError, TypeError, NameError): 142 | pass 143 | return K, K_inv, W, H 144 | 145 | def _load_and_resize_image(self, index, W, H): 146 | color_img_rs = None 147 | while color_img_rs is None: 148 | try: 149 | # Load color image 150 | color_img = Image.open(os.path.join(self.image_folder, self.image_files[index])) 151 | color_img_rs = color_img.resize((W, H), resample=Image.BILINEAR) 152 | except(RuntimeError, TypeError, NameError): 153 | pass 154 | 155 | color_tensor = self.to_tensor(color_img_rs) 156 | 157 | return color_tensor 158 | 159 | def _load_pose(self, index): 160 | pose = 
None 161 | while pose is None: 162 | try: 163 | # Load 3x4 pose matrix and make it 4x4 by appending vector [0., 0., 0., 1.] 164 | pose = np.loadtxt(os.path.join(self.image_folder, self.image_files[index].replace('color.jpg', 'pose.txt'))) 165 | except (RuntimeError, TypeError, NameError): 166 | pass 167 | 168 | pose_s = np.vstack((pose, np.array([0., 0., 0., 1.]))) 169 | 170 | return pose_s 171 | 172 | def __getitem__(self, index): 173 | K, K_inv, W_modified, H_modified = self._modify_intrinsic(index, use_precomputed_focal_length=False if self.precomputed_focal_length is None else True) 174 | color_tensor = self._load_and_resize_image(index, W_modified, H_modified) 175 | C_T_G = self._load_pose(index) 176 | 177 | landmark3d = C_T_G @ np.vstack((self.landmark, np.ones((1, self.landmark.shape[1])))) 178 | 179 | output = {'pose_gt': torch.tensor(C_T_G), 180 | 'image': color_tensor, 181 | 'intrinsics': torch.tensor(K, dtype=torch.float32, requires_grad=False), 182 | 'inv_intrinsics': torch.tensor(K_inv, dtype=torch.float32, requires_grad=False), 183 | 'landmark3d': torch.tensor(landmark3d[:3], dtype=torch.float32, requires_grad=False), 184 | } 185 | 186 | proj = K @ (C_T_G[:3, :3] @ self.landmark + C_T_G[:3, 3:]) 187 | landmark2d = proj / proj[2:] 188 | output['landmark2d'] = landmark2d[:2] 189 | 190 | inside_patch = (landmark2d[0] < W_modified) * \ 191 | (landmark2d[0] >= 0) * \ 192 | (landmark2d[1] < H_modified) * \ 193 | (landmark2d[1] >= 0) # L vector 194 | 195 | # visible by propagated colmap visibility and inside image 196 | _mask1 = self.visibility[:, self.image_indices[index]] * inside_patch 197 | 198 | # outside patch 199 | # _mask2 = ~inside_patch 200 | 201 | # inside image but not visible by colmap 202 | _mask3 = (self.visibility[:, self.image_indices[index]] == 0) * inside_patch 203 | 204 | visibility_mask = 1.0 * _mask1 + 0.5 * _mask3 205 | output['visibility'] = visibility_mask 206 | 207 | return output 208 | 209 | def __len__(self): 210 | return self.num_images 211 | 212 | 213 | class Indoor6Patches(Indoor6): 214 | def __init__(self, root_folder="", 215 | scene_id='', mode='all', 216 | landmark_idx=[None], skip_image_index=1, 217 | input_image_downsample=1, gray_image_output=False, 218 | patch_size=96, 219 | positive_samples=4, random_samples=4, 220 | landmark_config='landmarks/landmarks-50', 221 | visibility_config='landmarks/visibility-50', 222 | augmentation=True): 223 | super().__init__(root_folder=root_folder, 224 | scene_id=scene_id, mode=mode, 225 | landmark_idx=landmark_idx, skip_image_index=skip_image_index, 226 | input_image_downsample=input_image_downsample, gray_image_output=gray_image_output, 227 | landmark_config=landmark_config, 228 | visibility_config=visibility_config) 229 | self.patch_size = patch_size 230 | self.positive_samples = positive_samples 231 | self.random_samples = random_samples 232 | self.landmark_idx = landmark_idx 233 | self.augmentation = augmentation 234 | 235 | self.num_landmarks = self.landmark.shape[1] 236 | 237 | def _extract_patch(self, C_T_G, lm_idx, K, W_modified, H_modified, center=False, adjust_boundary=True): 238 | 239 | proj = K @ (C_T_G[:3, :3] @ self.landmark[:, lm_idx:(lm_idx + 1)] + C_T_G[:3, 3:]) 240 | proj /= copy.copy(proj[2:]) 241 | 242 | # Extract patch 243 | y = int(proj[1, 0]) 244 | x = int(proj[0, 0]) 245 | 246 | if center: 247 | dy = -self.patch_size // 2 248 | dx = -self.patch_size // 2 249 | else: 250 | dy = -np.random.rand(1) * self.patch_size 251 | dx = -np.random.rand(1) * self.patch_size 252 | 253 | _top = int(y + 
dy) 254 | _bottom = _top + int(self.patch_size) 255 | _left = int(x + dx) 256 | _right = _left + int(self.patch_size) 257 | 258 | if adjust_boundary: 259 | # Adjust the boundary 260 | if _top < 0: 261 | _top = 0 262 | _bottom = int(self.patch_size) 263 | elif _bottom >= H_modified: 264 | _top = H_modified - int(self.patch_size) 265 | _bottom = H_modified 266 | 267 | if _left < 0: 268 | _left = 0 269 | _right = int(self.patch_size) 270 | elif _right >= W_modified: 271 | _left = W_modified - int(self.patch_size) 272 | _right = W_modified 273 | 274 | return _left, _right, _top, _bottom 275 | 276 | def _project_landmarks_into_patch(self, K, C_T_G, img_idx, _top, _bottom, _left, _right): 277 | proj = K @ (C_T_G[:3, :3] @ self.landmark + C_T_G[:3, 3:]) 278 | in_front_of_camera = proj[2] > 0.0 279 | proj /= copy.copy(proj[2:]) 280 | 281 | proj_patch = np.zeros_like(proj[:2]) 282 | proj_patch[0] = proj[0] - _left 283 | proj_patch[1] = proj[1] - _top 284 | 285 | # L vector 286 | inside_patch = (proj[0] < _right) * (proj[0] >= _left) * (proj[1] < _bottom) * ( 287 | proj[1] >= _top) * in_front_of_camera 288 | 289 | # visible by propagated colmap visibility and inside patch 290 | _mask1 = self.visibility[:, self.image_indices[img_idx]] * inside_patch 291 | 292 | # outside patch 293 | # _mask2 = ~inside_patch 294 | 295 | # inside patch but not visible by colmap 296 | _mask3 = (self.visibility[:, self.image_indices[img_idx]] == 0) * inside_patch 297 | 298 | visibility_mask = 1.0 * _mask1 + 0.5 * _mask3 299 | 300 | return proj_patch, visibility_mask 301 | 302 | def __getitem__(self, index): 303 | 304 | patches = [] 305 | keypoint_locations = [] 306 | landmark_visibility_on_patch = [] 307 | L = self.landmark.shape[1] # number of keypoints 308 | 309 | list_landmarks = np.random.permutation(L)[:self.positive_samples] 310 | 311 | ## Create positive examples 312 | for lm_idx in list_landmarks: 313 | ## Randomly draw image index from visibility mask 314 | training_img_ids_observe_lm_idx = self.visibility[lm_idx, self.image_indices].reshape(-1) 315 | total_images_observed_this_lm = np.sum(training_img_ids_observe_lm_idx) 316 | if total_images_observed_this_lm == 0: 317 | print('no positive example') 318 | img_idx_positive_sample_for_lm_idx = np.random.randint(self.num_images) 319 | else: 320 | # img_idx_observe_lm_idx = (index % int(np.sum(training_img_ids_observe_lm_idx))) 321 | random_indices_observe_this_lm = np.random.randint(0, total_images_observed_this_lm) 322 | img_idx_positive_sample_for_lm_idx = np.where(training_img_ids_observe_lm_idx==1)[0][random_indices_observe_this_lm] 323 | 324 | K, K_inv, W_modified, H_modified = self._modify_intrinsic(img_idx_positive_sample_for_lm_idx) 325 | C_T_G = self._load_pose(img_idx_positive_sample_for_lm_idx) 326 | color_tensor = self._load_and_resize_image(img_idx_positive_sample_for_lm_idx, W_modified, H_modified) 327 | 328 | if not self.augmentation: 329 | _left, _right, _top, _bottom = self._extract_patch(C_T_G, lm_idx, K, W_modified, H_modified, 330 | center=False, adjust_boundary=True) 331 | color_patch = color_tensor.reshape(1, 3, H_modified, W_modified)[:, :, _top:_bottom, _left:_right] 332 | Cg_T_G = C_T_G 333 | K_scale = K 334 | else: 335 | ## Random rotation, change K, T 336 | q = np.random.rand(4) - 0.5 337 | q[1] *= 0.1 # pitch 338 | q[2] *= 0.1 # yaw 339 | q[3] *= 0.1 # roll 340 | q[0] = 1.0 341 | q /= np.linalg.norm(q) 342 | Cg_R_C = Quaternion2Rotation(q) 343 | Cg_T_C = np.eye(4) 344 | Cg_T_C[:3, :3] = Cg_R_C 345 | 346 | Cg_T_G = Cg_T_C @ C_T_G 347 | 
K_scale = K.copy() 348 | K_scale[:2, :2] *= (0.9 + 0.2*np.random.rand()) 349 | K_scale_inv = np.linalg.inv(K_scale) 350 | 351 | _left, _right, _top, _bottom = self._extract_patch(Cg_T_G, lm_idx, K_scale, W_modified, H_modified, 352 | center=False, adjust_boundary=False) 353 | 354 | ## Extract patch 355 | YY_patch, XX_patch = torch.meshgrid(torch.arange(_top, _bottom, 1), 356 | torch.arange(_left, _right, 1)) 357 | XX_patch = XX_patch.reshape(1, self.patch_size, self.patch_size).float() 358 | YY_patch = YY_patch.reshape(1, self.patch_size, self.patch_size).float() 359 | 360 | in_H_out = K @ Cg_R_C.T @ K_scale_inv 361 | in_H_out = torch.tensor(in_H_out, dtype=torch.float) 362 | in_p_out = in_H_out @ torch.cat((XX_patch, 363 | YY_patch, 364 | torch.ones_like(XX_patch)), dim=1).reshape((3, self.patch_size**2)) 365 | in_p_out = in_p_out / in_p_out[2:].clone() 366 | 367 | scale = torch.tensor([[2. / W_modified, 0.], 368 | [0., 2. / H_modified]], dtype=torch.float).reshape(2, 2) 369 | center = torch.tensor([0.5 * (W_modified - 1), 370 | 0.5 * (H_modified - 1)], dtype=torch.float).reshape(2, 1) 371 | in_p_out_normalized = scale @ (in_p_out[:2] - center) 372 | 373 | invalid_pixel_mask = (in_p_out_normalized[0] < -1) + \ 374 | (in_p_out_normalized[0] > 1) + \ 375 | (in_p_out_normalized[1] < -1) + \ 376 | (in_p_out_normalized[1] > 1) 377 | 378 | if torch.sum(invalid_pixel_mask>0) > 0.25 * self.patch_size ** 2: 379 | _left, _right, _top, _bottom = self._extract_patch(C_T_G, lm_idx, K, W_modified, H_modified, 380 | center=False, adjust_boundary=True) 381 | color_patch = color_tensor.reshape(1, 3, H_modified, W_modified)[:, :, _top:_bottom, _left:_right] 382 | 383 | # Not using augmented transformation 384 | K_scale = K.copy() 385 | Cg_T_G = C_T_G 386 | else: 387 | grid_sampler = in_p_out_normalized.reshape(1, 2, self.patch_size, self.patch_size).permute(0, 2, 3, 1) 388 | color_tensor = color_tensor.reshape(1, 3, H_modified, W_modified) 389 | color_patch = torch.nn.functional.grid_sample(color_tensor, grid_sampler, 390 | padding_mode='zeros', mode='bilinear', align_corners=False) 391 | color_patch = torch.nn.functional.interpolate(color_patch, size=(self.patch_size, self.patch_size)) 392 | 393 | keypoints_2d, visibility_mask = self._project_landmarks_into_patch(K_scale, Cg_T_G, img_idx_positive_sample_for_lm_idx, _top, _bottom, _left, _right) 394 | patches.append(color_patch) 395 | keypoint_locations.append(keypoints_2d.reshape((1, 2, L))) 396 | landmark_visibility_on_patch.append(visibility_mask.reshape((1, L))) 397 | 398 | ## Create random examples 399 | patches_random = [] 400 | keypoint_locations_random = [] 401 | landmark_visibility_on_patch_random = [] 402 | 403 | C_T_G = self._load_pose(index) 404 | K, K_inv, W_modified, H_modified = self._modify_intrinsic(index) 405 | color_tensor = self._load_and_resize_image(index, W_modified, H_modified) 406 | 407 | for _ in range(self.random_samples): 408 | _top = int(np.random.rand(1) * (H_modified - self.patch_size)) 409 | _bottom = _top + self.patch_size 410 | _left = int(np.random.rand(1) * (W_modified - self.patch_size)) 411 | _right = _left + self.patch_size 412 | 413 | keypoints_2d, visibility_mask = self._project_landmarks_into_patch(K, C_T_G, index, _top, _bottom, _left, _right) 414 | 415 | patches_random.append(color_tensor[:, _top:_bottom, _left:_right].clone().reshape(1, 3, self.patch_size, self.patch_size)) 416 | keypoint_locations_random.append(keypoints_2d.reshape((1, 2, L))) 417 | 
landmark_visibility_on_patch_random.append(visibility_mask.reshape((1, L))) 418 | 419 | patches = torch.cat(patches+patches_random, dim=0) 420 | keypoint_locations = np.concatenate(keypoint_locations+keypoint_locations_random, axis=0) 421 | landmark_visibility_on_patch = np.concatenate(landmark_visibility_on_patch+landmark_visibility_on_patch_random, axis=0) 422 | 423 | ## COLOR AUGMENTATION 424 | if self.augmentation: 425 | if torch.rand(1) > 0.5: 426 | patches += 0.02 * ( 427 | torch.rand((patches.shape[0], patches.shape[1], 1, 1)) - 0.5) * torch.ones_like(patches) 428 | else: 429 | patches += 0.2 * ( 430 | torch.rand((patches.shape[0], 1, 1, 1)) - 0.5) * torch.ones_like(patches) 431 | clipped_patches = torch.clip(patches, 0, 1) 432 | 433 | 434 | output = {'patches': clipped_patches, 435 | 'landmark2d': torch.tensor(keypoint_locations, dtype=torch.float, requires_grad=False), 436 | 'visibility': torch.tensor(landmark_visibility_on_patch, requires_grad=False), 437 | } 438 | 439 | return output 440 | -------------------------------------------------------------------------------- /src/inference.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import os 4 | import torch 5 | from torch.utils.data import DataLoader 6 | from tqdm import tqdm 7 | import random 8 | from datetime import datetime 9 | 10 | from dataloader.indoor6 import Indoor6 11 | from models.efficientlitesld import EfficientNetSLD 12 | from utils.pnp import * 13 | 14 | 15 | def compute_error(C_R_G, C_t_G, C_R_G_hat, C_t_G_hat): 16 | 17 | rot_err = 180 / np.pi * np.arccos(np.clip(0.5 * (np.trace(C_R_G.T @ C_R_G_hat) - 1.0), a_min=-1., a_max=1.)) 18 | trans_err = np.linalg.norm(C_R_G_hat.T @ C_t_G_hat - C_R_G.T @ C_t_G) 19 | 20 | return rot_err, trans_err 21 | 22 | 23 | def compute_2d3d(opt, pred_heatmap, peak_threshold, landmark2d, landmark3d, C_b_f_gt, H_hm, W_hm, K_inv, 24 | METRICS_LOGGING=None): 25 | N = pred_heatmap.shape[0] 26 | G_p_f = np.zeros((3, N)) 27 | C_b_f_hm = np.zeros((3, N)) 28 | weights = np.zeros(N) 29 | validIdx = 0 30 | 31 | pixel_error = [] 32 | angular_error = [] 33 | for l in range(N): 34 | pred_heatmap_l = pred_heatmap[l] 35 | max_pred_heatmap_l = np.max(pred_heatmap_l) 36 | 37 | if max_pred_heatmap_l > peak_threshold: 38 | peak_yx = np.unravel_index(np.argmax(pred_heatmap_l), np.array(pred_heatmap_l).shape) 39 | peak_yx = np.array(peak_yx) 40 | 41 | # Patch size extraction 42 | P = int(min(1+2*np.min(np.array([peak_yx[0], H_hm-1.0-peak_yx[0], peak_yx[1], W_hm-1.0-peak_yx[1]])), 43 | 1+64//opt.output_downsample)) 44 | 45 | patch_peak_yx = pred_heatmap_l[peak_yx[0] - P // 2:peak_yx[0] + P // 2 + 1, 46 | peak_yx[1] - P // 2:peak_yx[1] + P // 2 + 1] 47 | xx_patch, yy_patch = np.meshgrid(np.arange(peak_yx[1] - P // 2, peak_yx[1] + P // 2 + 1, 1), 48 | np.arange(peak_yx[0] - P // 2, peak_yx[0] + P // 2 + 1, 1)) 49 | 50 | refine_y = np.sum(patch_peak_yx * yy_patch) / np.sum(patch_peak_yx) 51 | refine_x = np.sum(patch_peak_yx * xx_patch) / np.sum(patch_peak_yx) 52 | 53 | 54 | pixel_error.append(np.linalg.norm(landmark2d[:2, l] - 55 | opt.output_downsample * np.array([refine_x, refine_y]))) 56 | 57 | pred_bearing = K_inv @ np.array([refine_x, refine_y, 1]) 58 | pred_bearing = pred_bearing / np.linalg.norm(pred_bearing) 59 | gt_bearing = C_b_f_gt[:, l] 60 | gt_bearing = gt_bearing / np.linalg.norm(gt_bearing) 61 | angular_error_batch = np.arccos( 62 | np.clip(pred_bearing @ gt_bearing, a_min=-1, a_max=1)) * 180 / np.pi 63 | 64 | 
angular_error.append(angular_error_batch) 65 | 66 | weights[validIdx] = max_pred_heatmap_l 67 | C_b_f_hm[:, validIdx] = pred_bearing 68 | G_p_f[:, validIdx] = landmark3d[:, l] 69 | validIdx += 1 70 | 71 | return G_p_f[:, :validIdx], C_b_f_hm[:, :validIdx], weights[:validIdx], np.asarray(pixel_error), np.asarray(angular_error) 72 | 73 | 74 | def compute_pose(G_p_f, C_b_f_hm, weights, minimal_tight_thr, opt_tight_thr): 75 | 76 | Ndetected_landmarks = C_b_f_hm.shape[1] 77 | 78 | if Ndetected_landmarks >= 4: 79 | ## P3P ransac 80 | C_T_G_hat, PnP_inlier = P3PKe_Ransac(G_p_f, C_b_f_hm, weights, 81 | thres=minimal_tight_thr) 82 | 83 | if np.sum(PnP_inlier) >= 4: 84 | C_T_G_opt = RunPnPNL(C_T_G_hat, 85 | G_p_f[:, PnP_inlier], 86 | C_b_f_hm[:, PnP_inlier], 87 | weights[PnP_inlier], 88 | cutoff=opt_tight_thr) 89 | return np.sum(PnP_inlier), C_T_G_opt 90 | 91 | return 0, None 92 | 93 | 94 | def inference(opt, minimal_tight_thr=1e-2, opt_tight_thr=5e-3, mode='test'): 95 | 96 | # random.seed(datetime.now().timestamp()) 97 | 98 | PRETRAINED_MODEL = opt.pretrained_model 99 | 100 | device = opt.gpu_device 101 | 102 | test_dataset = Indoor6(landmark_idx=np.arange(opt.landmark_indices[0], opt.landmark_indices[-1]), 103 | scene_id=opt.scene_id, 104 | mode=mode, 105 | root_folder=opt.dataset_folder, 106 | input_image_downsample=2, 107 | landmark_config=opt.landmark_config, 108 | visibility_config=opt.visibility_config, 109 | skip_image_index=1, 110 | use_precomputed_focal_length=opt.use_precomputed_focal_length) 111 | 112 | test_dataloader = DataLoader(dataset=test_dataset, num_workers=1, batch_size=1, shuffle=False, pin_memory=True) 113 | 114 | num_landmarks = test_dataset.landmark.shape[1] 115 | landmark_data = test_dataset.landmark 116 | 117 | cnns = [] 118 | nLandmarks = opt.landmark_indices 119 | num_landmarks = opt.landmark_indices[-1] - opt.landmark_indices[0] 120 | 121 | for idx, pretrained_model in enumerate(PRETRAINED_MODEL): 122 | if opt.model == 'efficientnet': 123 | cnn = EfficientNetSLD(num_landmarks=nLandmarks[idx+1]-nLandmarks[idx], output_downsample=opt.output_downsample).to(device=device) 124 | 125 | cnn.load_state_dict(torch.load(pretrained_model)) 126 | cnn = cnn.to(device=device) 127 | cnn.eval() 128 | 129 | # Adding pretrained model 130 | cnns.append(cnn) 131 | 132 | peak_threshold = 3e-1 133 | img_id = 0 134 | 135 | METRICS_LOGGING = {'image_name': '', 136 | 'angular_error': 180., 137 | 'pixel_error': 1800., 138 | 'rot_err_all': 180., 139 | 'trans_err_all': 180., 140 | 'heatmap_peak': 0.0, 141 | 'ndetected': 0, 142 | } 143 | test_image_logging = [] 144 | 145 | with torch.no_grad(): 146 | 147 | ## Only works for indoor-6 148 | indoor6W = 640 // opt.output_downsample 149 | indoor6H = 352 // opt.output_downsample 150 | HH, WW = torch.meshgrid(torch.arange(indoor6H), torch.arange(indoor6W)) 151 | WW = WW.reshape(1, 1, indoor6H, indoor6W).to('cuda') 152 | HH = HH.reshape(1, 1, indoor6H, indoor6W).to('cuda') 153 | 154 | with tqdm(test_dataloader) as tq: 155 | for idx, batch in enumerate(tq): 156 | #for idx, batch in enumerate(tqdm(test_dataloader)): 157 | 158 | image = batch['image'].to(device=device) 159 | B, _, H, W = image.shape 160 | 161 | K_inv = batch['inv_intrinsics'].to(device=device) 162 | C_T_G_gt = batch['pose_gt'].cpu().numpy() 163 | 164 | landmark2d = batch['intrinsics'] @ batch['landmark3d'].reshape(B, 3, num_landmarks) 165 | landmark2d /= landmark2d[:, 2:].clone() 166 | landmark2d = landmark2d.numpy() 167 | 168 | pred_heatmap = [] 169 | for cnn in cnns: 170 | pred = cnn(image) 
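# Each entry of `cnns` was trained on a disjoint subset of the scene landmarks; its '1'
# output holds one heatmap channel per landmark in that subset, and the per-model
# heatmaps are concatenated along the channel axis immediately after this loop.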
171 | pred_heatmap.append(pred['1']) 172 | 173 | pred_heatmap = torch.cat(pred_heatmap, axis=1) 174 | pred_heatmap *= (pred_heatmap > peak_threshold).float() 175 | 176 | # tmp = torch.sqrt(pred_heatmap) 177 | # 178 | # w^{1.5} 179 | # pred_heatmap *= tmp 180 | # 181 | # w^{2.5} 182 | # pred_heatmap *= tmp 183 | # pred_heatmap *= pred_heatmap 184 | 185 | # w^2 186 | pred_heatmap *= pred_heatmap 187 | 188 | K_inv[:, :, :2] *= opt.output_downsample 189 | 190 | ## Compute 2D location of landmarks 191 | P = torch.max(torch.max(pred_heatmap, dim=3)[0], dim=2)[0] 192 | pred_normalized_heatmap = pred_heatmap / (torch.sum(pred_heatmap, axis=(2, 3), keepdim=True) + 1e-4) 193 | projx = torch.sum(WW * pred_normalized_heatmap, axis=(2, 3)).reshape(B, 1, num_landmarks) 194 | projy = torch.sum(HH * pred_normalized_heatmap, axis=(2, 3)).reshape(B, 1, num_landmarks) 195 | xy1 = torch.cat((projx, projy, torch.ones_like(projx)), axis=1) 196 | uv1 = K_inv @ xy1 197 | C_B_f = uv1 / torch.sqrt(torch.sum(uv1 ** 2, axis=1, keepdim=True)) 198 | C_B_f = C_B_f.cpu().numpy() 199 | P = P.cpu().numpy() 200 | xy1 = xy1.cpu().numpy() 201 | 202 | ## Compute error 203 | for b in range(B): 204 | Pb = P[b]>peak_threshold 205 | G_p_f = landmark_data[:, Pb] 206 | C_b_f = C_B_f[b][:, Pb] 207 | 208 | ## MAKING THIS CHANGE FOR ABLATION STUDY IN PAPER: PLEASE REMOVE LATER! 209 | ## weights = np.ones_like(P[b][Pb]) 210 | weights = P[b][Pb] 211 | 212 | xy1b = xy1[b][:2, Pb] 213 | 214 | pnp_inlier, C_T_G_hat = compute_pose(G_p_f, C_b_f, weights, 215 | minimal_tight_thr, opt_tight_thr) 216 | 217 | rot_err, trans_err = 180., 1800. 218 | if pnp_inlier >= 4: 219 | rot_err, trans_err = compute_error(C_T_G_gt[b][:3, :3], C_T_G_gt[b][:3, 3], 220 | C_T_G_hat[:3, :3], C_T_G_hat[:3, 3]) 221 | 222 | ## Logging information 223 | pixel_error = np.linalg.norm(landmark2d[b][:2, Pb] - opt.output_downsample * xy1b, axis=0) 224 | C_b_f_gt = batch['landmark3d'][b] 225 | C_b_f_gt = torch.nn.functional.normalize(C_b_f_gt, dim=0).cpu().numpy() 226 | angular_error = np.arccos(np.clip(np.sum(C_b_f * C_b_f_gt[:, Pb], axis=0), -1, 1)) * 180. 
/ np.pi 227 | 228 | m = copy.deepcopy(METRICS_LOGGING) 229 | m['image_name'] = test_dataset.image_files[img_id] 230 | m['pixel_error'] = pixel_error 231 | m['angular_error'] = angular_error 232 | m['heatmap_peak'] = weights 233 | m['rot_err_all'] = np.array([rot_err]) 234 | m['trans_err_all'] = np.array([trans_err]) 235 | test_image_logging.append(m) 236 | img_id += 1 237 | 238 | elapsedtime = tq.format_dict["elapsed"] 239 | processing_speed = len(test_dataset)/elapsedtime 240 | 241 | metrics_output = {'angular_error': [], 242 | 'pixel_error': [], 243 | 'heatmap_peak': [], 244 | 'rot_err_all': [], 245 | 'trans_err_all': []} 246 | 247 | for k in metrics_output: 248 | for imgdata in test_image_logging: 249 | metrics_output[k].append(imgdata[k]) 250 | metrics_output[k] = np.concatenate(metrics_output[k]) 251 | 252 | metrics_output['r5'] = np.sum(metrics_output['rot_err_all'] < 5) / len(test_dataset) 253 | metrics_output['r10'] = np.sum(metrics_output['rot_err_all'] < 10) / len(test_dataset) 254 | metrics_output['p5'] = np.sum(metrics_output['trans_err_all'] < 0.05) / len(test_dataset) 255 | metrics_output['p10'] = np.sum(metrics_output['trans_err_all'] < 0.1) / len(test_dataset) 256 | metrics_output['r1p1'] = np.sum((metrics_output['rot_err_all'] < 1) * (metrics_output['trans_err_all'] < 0.01))/len(test_dataset) 257 | metrics_output['r2p2'] = np.sum((metrics_output['rot_err_all'] < 2) * (metrics_output['trans_err_all'] < 0.02))/len(test_dataset) 258 | metrics_output['r5p5'] = np.sum((metrics_output['rot_err_all'] < 5) * (metrics_output['trans_err_all'] < 0.05))/len(test_dataset) 259 | metrics_output['r10p10'] = np.sum((metrics_output['rot_err_all'] < 10) * (metrics_output['trans_err_all'] < 0.1)) / len(test_dataset) 260 | metrics_output['median_rot_error'] = np.median(metrics_output['rot_err_all']) 261 | metrics_output['median_trans_error'] = np.median(metrics_output['trans_err_all']) 262 | metrics_output['speed'] = processing_speed 263 | return metrics_output 264 | 265 | 266 | def inference_landmark_stats(opt, mode='test'): 267 | import pickle 268 | 269 | PRETRAINED_MODEL = opt.pretrained_model 270 | 271 | device = opt.gpu_device 272 | 273 | test_dataset = Indoor6(landmark_idx=np.arange(opt.landmark_indices[0], opt.landmark_indices[-1]), 274 | scene_id=opt.scene_id, 275 | mode=mode, 276 | root_folder=opt.dataset_folder, 277 | input_image_downsample=2, 278 | landmark_config=opt.landmark_config, 279 | visibility_config=opt.visibility_config, 280 | skip_image_index=1) 281 | 282 | test_dataloader = DataLoader(dataset=test_dataset, num_workers=1, batch_size=1, shuffle=False, pin_memory=True) 283 | 284 | num_landmarks = test_dataset.landmark.shape[1] 285 | 286 | cnns = [] 287 | nLandmarks = opt.landmark_indices 288 | num_landmarks = opt.landmark_indices[-1] - opt.landmark_indices[0] 289 | 290 | for idx, pretrained_model in enumerate(PRETRAINED_MODEL): 291 | if opt.model == 'efficientnet': 292 | cnn = EfficientNetSLD(num_landmarks=nLandmarks[idx+1]-nLandmarks[idx], output_downsample=opt.output_downsample).to(device=device) 293 | 294 | cnn.load_state_dict(torch.load(pretrained_model)) 295 | cnn = cnn.to(device=device) 296 | cnn.eval() 297 | 298 | # Adding pretrained model 299 | cnns.append(cnn) 300 | 301 | peak_threshold = 2e-1 302 | 303 | SINGLE_LANDMARK_STATS = {'image_idx': [], 304 | 'pixel_error': [], 305 | } 306 | landmark_stats = [copy.deepcopy(SINGLE_LANDMARK_STATS) for _ in range(num_landmarks)] 307 | img_idx = 0 308 | 309 | with torch.no_grad(): 310 | 311 | ## Only works for indoor-6 312 | 
indoor6W = 640 // opt.output_downsample 313 | indoor6H = 352 // opt.output_downsample 314 | HH, WW = torch.meshgrid(torch.arange(indoor6H), torch.arange(indoor6W)) 315 | WW = WW.reshape(1, 1, indoor6H, indoor6W).to('cuda') 316 | HH = HH.reshape(1, 1, indoor6H, indoor6W).to('cuda') 317 | 318 | for idx, batch in enumerate(tqdm(test_dataloader)): 319 | 320 | image = batch['image'].to(device=device) 321 | B, _, H, W = image.shape 322 | landmark2d = batch['intrinsics'] @ batch['landmark3d'].reshape(B, 3, num_landmarks) 323 | landmark2d /= landmark2d[:, 2:].clone() 324 | landmark2d = landmark2d.numpy() 325 | 326 | pred_heatmap = [] 327 | for cnn in cnns: 328 | pred = cnn(image) 329 | pred_heatmap.append(pred['1']) 330 | 331 | pred_heatmap = torch.cat(pred_heatmap, axis=1) 332 | pred_heatmap *= (pred_heatmap > peak_threshold).float() 333 | 334 | ## Compute 2D location of landmarks 335 | P = torch.max(torch.max(pred_heatmap, dim=3)[0], dim=2)[0] 336 | pred_normalized_heatmap = pred_heatmap / (torch.sum(pred_heatmap, axis=(2, 3), keepdim=True) + 1e-4) 337 | projx = torch.sum(WW * pred_normalized_heatmap, axis=(2, 3)).reshape(B, 1, num_landmarks) 338 | projy = torch.sum(HH * pred_normalized_heatmap, axis=(2, 3)).reshape(B, 1, num_landmarks) 339 | xy1 = torch.cat((projx, projy, torch.ones_like(projx)), axis=1) 340 | P = P.cpu().numpy() 341 | xy1 = xy1.cpu().numpy() 342 | 343 | ## Compute error 344 | for b in range(B): 345 | for l in range(num_landmarks): 346 | if P[b,l] > peak_threshold: 347 | pixel_error = np.linalg.norm(landmark2d[b][:2, l] - 348 | opt.output_downsample * xy1[b][:2, l]) 349 | landmark_stats[l]['pixel_error'].append(pixel_error) 350 | landmark_stats[l]['image_idx'].append(test_dataset.image_indices[img_idx]) 351 | img_idx += 1 352 | 353 | landmark_stats_np = np.zeros((num_landmarks, 5)) 354 | for l in range(num_landmarks): 355 | landmark_stats_np[l, 0] = l 356 | landmark_stats_np[l, 1] = len(landmark_stats[l]['image_idx']) 357 | if landmark_stats_np[l, 1] > 0: 358 | pixel_error = np.array(landmark_stats[l]['pixel_error']) 359 | landmark_stats_np[l, 2] = np.mean(pixel_error) 360 | landmark_stats_np[l, 3] = np.median(pixel_error) 361 | landmark_stats_np[l, 4] = np.max(pixel_error) 362 | np.savetxt(os.path.join(opt.output_folder, 'landmark_stats.txt'), landmark_stats_np) 363 | pickle.dump(landmark_stats, open(os.path.join(opt.output_folder, 'landmark_stats.pkl'), 'wb')) 364 | 365 | return 366 | -------------------------------------------------------------------------------- /src/local_inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 
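#
# Each line of the experiment file parsed below is expected to contain, whitespace-separated,
# the scene id, a checkpoint name of the form <scene_id>-<start>-<stop>, the landmark config
# and the visibility config (this is the format written by local_training.py). An illustrative
# line, with example config names only:
#   scene6 scene6-000-125 landmarks/landmarks-1000v10 landmarks/visibility-1000v10_depth_normal local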
2 | #from __future__ import print_function 3 | import argparse 4 | import os 5 | import time 6 | 7 | Args = None 8 | 9 | def local_inference(): 10 | cmd = 'python main.py --action test --dataset_folder %s --scene_id %s --landmark_config %s --visibility_config %s' % (Args.dataset_dir, Args.scene_id, Args.landmark_config, Args.visibility_config) 11 | cmd += ' --output_downsample 8' 12 | cmd += ' --landmark_indices 0' 13 | for i in range(0, len(Args.landmark_indices)): 14 | cmd += ' --landmark_indices %d' % (Args.landmark_indices[i]) 15 | for ckpt in Args.checkpoint_names: 16 | cmd += ' --pretrained_model %s/%s/%s/model-best_median.ckpt' % (Args.checkpoint_dir, Args.experimentGroupName, ckpt) 17 | cmd += ' --output_folder %s/%s' % (Args.checkpoint_dir, Args.experimentGroupName) 18 | print("Running [" + cmd + "]") 19 | os.system(cmd) 20 | 21 | if __name__ == '__main__': 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument( 25 | '--experiment_file', default="", type=str, required=True, 26 | help="Experiment file path.") 27 | parser.add_argument( 28 | '--dataset_dir', default="", type=str, required=True, 29 | help="Dataset path.") 30 | parser.add_argument( 31 | '--checkpoint_dir', default="", type=str, required=True, 32 | help="Checkpoints folder path.") 33 | 34 | Args = parser.parse_args() 35 | 36 | tmp = os.path.basename(Args.experiment_file) 37 | Args.experimentGroupName = tmp[:tmp.rindex('.')] 38 | Args.landmark_indices = [] 39 | Args.checkpoint_names = [] 40 | exp_file = os.path.join(Args.checkpoint_dir, Args.experiment_file) 41 | fd = open(exp_file, 'r') 42 | while True: 43 | line = fd.readline() 44 | if line == '': 45 | break 46 | split_line = line.split() 47 | 48 | Args.scene_id = split_line[0] 49 | expName = split_line[1] 50 | 51 | Args.landmark_config = split_line[2] 52 | Args.visibility_config = split_line[3] 53 | 54 | Args.checkpoint_names.append(expName) 55 | fields = expName.split('-') 56 | Args.landmark_indices.append(int(fields[2])) 57 | 58 | local_inference() -------------------------------------------------------------------------------- /src/local_training.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 
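#
# This launcher writes the experiment file (one line per landmark block) and then starts
# training for the single block selected by --subset_index by invoking main.py. The generated
# command has roughly the following shape (values shown here are illustrative):
#   python main.py --action train_patches --training_batch_size 8 --output_downsample 8
#       --num_epochs 200 --dataset_folder <dataset_dir> --scene_id scene6
#       --landmark_config landmarks/landmarks-1000v10
#       --visibility_config landmarks/visibility-1000v10_depth_normal
#       --output_folder <model_dir> --landmark_indices 0 --landmark_indices 125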
2 | import argparse 3 | import os 4 | #import re 5 | 6 | Args = None 7 | 8 | def launch_training(): 9 | print("Experiment File: %s" % Args.experiment_file) 10 | print("Model Dir: %s" % Args.model_dir) 11 | cmd = 'python main.py --action train_patches' 12 | cmd += ' --training_batch_size %d' % (Args.training_batch_size) 13 | cmd += ' --output_downsample %d' % (Args.output_downsample) 14 | cmd += ' --num_epochs %d' % (Args.num_epochs) 15 | cmd += ' --dataset_folder %s' % (Args.dataset_dir) 16 | cmd += ' --scene_id %s' % (Args.scene_id) 17 | cmd += ' --landmark_config %s' % (Args.landmark_config) 18 | cmd += ' --visibility_config %s' % (Args.visibility_config) 19 | cmd += ' --output_folder %s' % (Args.model_dir) 20 | cmd += ' --landmark_indices %d' % (Args.landmark_index_start) 21 | cmd += ' --landmark_indices %d' % (Args.landmark_index_stop) 22 | os.system(cmd) 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument( 27 | '--dataset_dir', type=str, required=True, 28 | help="Dataset folder path.") 29 | parser.add_argument( 30 | '--experiment_file', type=str, required=True, 31 | help="Experiment file path.") 32 | parser.add_argument( 33 | '--scene_id', type=str, required=True, 34 | help="name of scene.") 35 | parser.add_argument( 36 | '--landmark_config', type=str, required=True, 37 | help='Landmark configuration.') 38 | parser.add_argument( 39 | '--visibility_config', type=str, required=True, 40 | help='Visibility configuration.') 41 | parser.add_argument( 42 | '--num_landmarks', type=int, required=True, 43 | help='number of landmarks.') 44 | parser.add_argument( 45 | '--block_size', type=int, required=True, 46 | help='number of landmarks in each block.') 47 | parser.add_argument( 48 | '--subset_index', type=int, required=True, 49 | help='index of landmark subset (starts from 0).') 50 | parser.add_argument( 51 | '--output_dir', type=str, required=True, 52 | help='folder to save experiment file in.') 53 | parser.add_argument( 54 | '--model_dir', type=str, required=True, 55 | help='folder to save model ckpt file in.') 56 | parser.add_argument( 57 | '--training_batch_size', type=int, required=True, 58 | help='batch size.') 59 | parser.add_argument( 60 | '--output_downsample', type=int, required=True, 61 | help='Downsample factor for heat map resolution.') 62 | parser.add_argument( 63 | '--num_epochs', type=int, required=True, 64 | help='the number of epochs used for training.') 65 | Args = parser.parse_args() 66 | 67 | # Write the experiment file 68 | exp_fn = os.path.join(Args.output_dir, Args.experiment_file) 69 | fd = open(exp_fn, "w") 70 | for lid in range(0, Args.num_landmarks, Args.block_size): 71 | Args.landmark_index_start = lid 72 | Args.landmark_index_stop = lid + Args.block_size 73 | str = '%s %s-%03d-%03d %s %s local' % (Args.scene_id, Args.scene_id, Args.landmark_index_start, Args.landmark_index_stop, Args.landmark_config, Args.visibility_config) 74 | print(str, file=fd) 75 | fd.close() 76 | 77 | # Launch the training job for the specified subset only. 
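# With block_size landmarks per block, subset_index selects the half-open index range
# [block_size * subset_index, block_size * (subset_index + 1)); e.g. with block_size = 125
# and subset_index = 2, the detector is trained for landmarks 250..374.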
78 | Args.landmark_index_start = Args.block_size * Args.subset_index 79 | Args.landmark_index_stop = Args.block_size * (Args.subset_index + 1) 80 | launch_training() -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from inference import * 3 | from train import * 4 | 5 | DEVICE = None 6 | # auto-detect default device 7 | if torch.backends.mps.is_available(): 8 | # Code to run on macOS 9 | torch.backends.mps.enabled = True 10 | DEVICE = "mps" 11 | print ("MPS enabled") 12 | elif torch.cuda.is_available(): 13 | # Windows or Linux GPU acceleration 14 | torch.backends.cudnn.enabled = True 15 | torch.backends.cudnn.benchmark = True 16 | DEVICE = "cuda" 17 | print ("CUDA enabled") 18 | else: 19 | # CPU 20 | torch.backends.cudnn.enabled = False 21 | DEVICE = "cpu" 22 | print ("CPU enabled") 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser( 26 | description='Scene Landmark Detection', 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 28 | parser.add_argument( 29 | '--dataset_folder', type=str, required=True, 30 | help='Root directory, where all data is stored') 31 | parser.add_argument( 32 | '--output_folder', type=str, required=True, 33 | help='Output folder') 34 | parser.add_argument( 35 | '--landmark_config', type=str, default='landmarks/landmarks-300', 36 | help='File containing scene-specific 3D landmarks.') 37 | parser.add_argument( 38 | '--landmark_indices', type=int, action='append', 39 | help = 'Landmark indices, specify twice', 40 | required=True) 41 | parser.add_argument( 42 | '--visibility_config', type=str, default='landmarks/visibility_aug-300', 43 | help='File containing information about visibility of landmarks in cameras associated with training set.') 44 | parser.add_argument( 45 | '--scene_id', type=str, default='scene6', 46 | help='Scene id') 47 | parser.add_argument( 48 | '--model', type=str, default='efficientnet', 49 | help='Network architecture backbone.') 50 | parser.add_argument( 51 | '--output_downsample', type=int, default=4, 52 | help='Down sampling factor for output resolution') 53 | parser.add_argument( 54 | '--gpu_device', type=str, default=DEVICE, 55 | help='GPU device') 56 | parser.add_argument( 57 | '--pretrained_model', type=str, action='append', default=[], 58 | help='Pretrained detector model') 59 | parser.add_argument( 60 | '--num_epochs', type=int, default=200, 61 | help='Number of training epochs.') 62 | parser.add_argument( 63 | '--action', type=str, default='test', 64 | help='train/train_patches/test') 65 | parser.add_argument( 66 | '--use_precomputed_focal_length', type=int, default=0) 67 | parser.add_argument( 68 | '--training_batch_size', type=int, default=8, 69 | help='Batch size used during training.') 70 | 71 | opt = parser.parse_args() 72 | 73 | #print('scene_id: ', opt.scene_id) 74 | #print('action: ', opt.action) 75 | #print('training_batch_size: ', opt.training_batch_size) 76 | #print('output downsample: ', opt.output_downsample) 77 | 78 | if opt.action == 'train': 79 | train(opt) 80 | opt.pretrained_model = [opt.output_folder + '/model-best_median.ckpt'] 81 | eval_stats = inference(opt, minimal_tight_thr=1e-3, opt_tight_thr=1e-3) 82 | print("{:>10} {:>30} {:>30} {:>20}".format('Scene ID', 83 | 'Median trans error (cm)', 84 | 'Median rotation error (deg)', 85 | 'Recall 5cm5deg (%)')) 86 | print("{:>10} {:>30.4} {:>30.4} {:>20.2%}".format(opt.scene_id, 87 | 100. 
* eval_stats['median_trans_error'], 88 | eval_stats['median_rot_error'], 89 | eval_stats['r5p5'])) 90 | elif opt.action == 'train_patches': 91 | train_patches(opt) 92 | opt.pretrained_model = [opt.output_folder + '/model-best_median.ckpt'] 93 | eval_stats = inference(opt, minimal_tight_thr=1e-3, opt_tight_thr=1e-3) 94 | print("{:>10} {:>30} {:>30} {:>20}".format('Scene ID', 95 | 'Median trans error (cm)', 96 | 'Median rotation error (deg)', 97 | 'Recall 5cm5deg (%)')) 98 | print("{:>10} {:>30.4} {:>30.4} {:>20.2%}".format(opt.scene_id, 99 | 100. * eval_stats['median_trans_error'], 100 | eval_stats['median_rot_error'], 101 | eval_stats['r5p5'])) 102 | elif opt.action == 'landmark_stats': 103 | inference_landmark_stats(opt, mode='train') 104 | elif opt.action == 'test': 105 | if opt.scene_id == 'all': 106 | eval_stats = {} 107 | pretrained_folder = opt.pretrained_model 108 | output_folder = opt.output_folder 109 | for scene_id in ['1', '2a', '3', '4a', '5', '6']: 110 | opt.scene_id = 'scene' + scene_id 111 | opt.pretrained_model = [pretrained_folder + 'scene%s.ckpt' % scene_id] 112 | opt.output_folder = os.path.join(output_folder, 'scene' + scene_id) 113 | eval_stats[opt.scene_id] = inference(opt, minimal_tight_thr=1e-3, opt_tight_thr=1e-3) 114 | 115 | print("{:>10} {:>30} {:>30} {:>20}".format('Scene ID', 116 | 'Median trans error (cm)', 117 | 'Median rotation error (deg)', 118 | 'Recall 5cm5deg (%)')) 119 | for x in eval_stats: 120 | print("{:>10} {:>30.4} {:>30.4} {:>20.2%}".format(x, 121 | 100. * eval_stats[x]['median_trans_error'], 122 | eval_stats[x]['median_rot_error'], 123 | eval_stats[x]['r5p5'])) 124 | else: 125 | 126 | eval_stats = inference(opt, minimal_tight_thr=1e-3, opt_tight_thr=1e-3) 127 | metricsFilename = opt.output_folder + '/metrics.txt' 128 | print(metricsFilename) 129 | fd = open(metricsFilename, "w") 130 | fd.write("%f\n" % (eval_stats['r5p5'])) 131 | fd.write("%f\n" % (eval_stats['speed'])) 132 | fd.close() 133 | 134 | print("{:>10} {:>30} {:>30} {:>20} {:>15} {:>15} {:>15} {:>15} {:>20} {:>20}".format('Scene ID', 135 | 'Median trans error (cm)', 136 | 'Median rotation error (deg)', 137 | 'Recall 1cm1deg (%)', 138 | '2cm2deg (%)', 139 | '5cm5deg (%)', 140 | '10cm10deg (%)', 141 | '5deg (%)', 142 | 'Median Pixel Error', 143 | 'Median Angular Error')) 144 | print("{:>10} {:>30.4} {:>30.4} {:>20.2%} {:>15.2%} {:>15.2%} {:>15.2%} {:>15.2%} {:>20.4} {:>20.4}".format(opt.scene_id, 145 | 100. 
* eval_stats['median_trans_error'], 146 | eval_stats['median_rot_error'], 147 | eval_stats['r1p1'], 148 | eval_stats['r2p2'], 149 | eval_stats['r5p5'], 150 | eval_stats['r10p10'], 151 | eval_stats['r5'], 152 | np.median(eval_stats['pixel_error']), 153 | np.median(eval_stats['angular_error']))) 154 | -------------------------------------------------------------------------------- /src/models/blocks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .conv2d_layers import Conv2dSameExport 4 | 5 | 6 | def _make_encoder(use_pretrained, exportable=True, output_downsample=4): 7 | 8 | # pretrained = _make_pretrained_efficientnet_lite0(use_pretrained, exportable=exportable) 9 | pretrained = torch.load('pretrained_efficientnetlite0.net') 10 | 11 | if output_downsample <= 16: 12 | pretrained.layer2[0][0].conv_dw.stride = (1, 1) 13 | if output_downsample <= 8: 14 | pretrained.layer3[0][0].conv_dw.stride = (1, 1) 15 | if output_downsample <= 4: 16 | pretrained.layer4[0][0].conv_dw.stride = (1, 1) 17 | 18 | return pretrained, None 19 | 20 | 21 | def _make_pretrained_efficientnet_lite0(use_pretrained, exportable=False): 22 | efficientnet = torch.hub.load( 23 | "rwightman/gen-efficientnet-pytorch", 24 | "tf_efficientnet_lite0", 25 | pretrained=use_pretrained, 26 | exportable=exportable 27 | ) 28 | return _make_efficientnet_backbone(efficientnet) 29 | 30 | 31 | def _make_efficientnet_backbone(effnet): 32 | pretrained = nn.Module() 33 | 34 | pretrained.layer1 = nn.Sequential( 35 | effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2] 36 | ) 37 | pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) 38 | pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) 39 | pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) 40 | 41 | return pretrained 42 | 43 | 44 | def _make_resnet_backbone(resnet): 45 | pretrained = nn.Module() 46 | pretrained.layer1 = nn.Sequential( 47 | resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1 48 | ) 49 | 50 | pretrained.layer2 = resnet.layer2 51 | pretrained.layer3 = resnet.layer3 52 | pretrained.layer4 = resnet.layer4 53 | 54 | return pretrained 55 | 56 | 57 | def _make_pretrained_resnext101_wsl(use_pretrained): 58 | resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl") 59 | return _make_resnet_backbone(resnet) 60 | 61 | 62 | class Interpolate(nn.Module): 63 | """Interpolation module. 64 | """ 65 | 66 | def __init__(self, scale_factor, mode, align_corners=False): 67 | """Init. 68 | 69 | Args: 70 | scale_factor (float): scaling 71 | mode (str): interpolation mode 72 | """ 73 | super(Interpolate, self).__init__() 74 | 75 | self.interp = nn.functional.interpolate 76 | self.scale_factor = scale_factor 77 | self.mode = mode 78 | self.align_corners = align_corners 79 | 80 | def forward(self, x): 81 | """Forward pass. 82 | 83 | Args: 84 | x (tensor): input 85 | 86 | Returns: 87 | tensor: interpolated data 88 | """ 89 | 90 | x = self.interp( 91 | x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners 92 | ) 93 | 94 | return x 95 | 96 | 97 | class ResidualConvUnit(nn.Module): 98 | """Residual convolution module. 99 | """ 100 | 101 | def __init__(self, features): 102 | """Init. 
103 | 104 | Args: 105 | features (int): number of features 106 | """ 107 | super().__init__() 108 | 109 | self.conv1 = nn.Conv2d( 110 | features, features, kernel_size=3, stride=1, padding=1, bias=True 111 | ) 112 | 113 | self.conv2 = nn.Conv2d( 114 | features, features, kernel_size=3, stride=1, padding=1, bias=True 115 | ) 116 | 117 | self.relu = nn.ReLU(inplace=True) 118 | 119 | def forward(self, x): 120 | """Forward pass. 121 | 122 | Args: 123 | x (tensor): input 124 | 125 | Returns: 126 | tensor: output 127 | """ 128 | out = self.relu(x) 129 | out = self.conv1(out) 130 | out = self.relu(out) 131 | out = self.conv2(out) 132 | 133 | return out + x 134 | 135 | 136 | class FeatureFusionBlock(nn.Module): 137 | """Feature fusion block. 138 | """ 139 | 140 | def __init__(self, features): 141 | """Init. 142 | 143 | Args: 144 | features (int): number of features 145 | """ 146 | super(FeatureFusionBlock, self).__init__() 147 | 148 | self.resConfUnit1 = ResidualConvUnit(features) 149 | self.resConfUnit2 = ResidualConvUnit(features) 150 | 151 | def forward(self, *xs): 152 | """Forward pass. 153 | 154 | Returns: 155 | tensor: output 156 | """ 157 | output = xs[0] 158 | 159 | if len(xs) == 2: 160 | output += self.resConfUnit1(xs[1]) 161 | 162 | output = self.resConfUnit2(output) 163 | 164 | output = nn.functional.interpolate( 165 | output, scale_factor=2, mode="bilinear", align_corners=True 166 | ) 167 | 168 | return output 169 | 170 | 171 | class ResidualConvUnit_custom(nn.Module): 172 | """Residual convolution module. 173 | """ 174 | 175 | def __init__(self, features, activation, bn): 176 | """Init. 177 | 178 | Args: 179 | features (int): number of features 180 | """ 181 | super().__init__() 182 | 183 | self.bn = bn 184 | 185 | self.groups = 1 186 | 187 | self.conv1 = nn.Conv2d( 188 | features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups 189 | ) 190 | 191 | self.conv2 = nn.Conv2d( 192 | features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups 193 | ) 194 | 195 | if self.bn == True: 196 | self.bn1 = nn.BatchNorm2d(features) 197 | self.bn2 = nn.BatchNorm2d(features) 198 | 199 | self.activation = activation 200 | 201 | self.skip_add = nn.quantized.FloatFunctional() 202 | 203 | def forward(self, x): 204 | """Forward pass. 205 | 206 | Args: 207 | x (tensor): input 208 | 209 | Returns: 210 | tensor: output 211 | """ 212 | 213 | out = self.activation(x) 214 | out = self.conv1(out) 215 | if self.bn == True: 216 | out = self.bn1(out) 217 | 218 | out = self.activation(out) 219 | out = self.conv2(out) 220 | if self.bn == True: 221 | out = self.bn2(out) 222 | 223 | if self.groups > 1: 224 | out = self.conv_merge(out) 225 | 226 | return self.skip_add.add(out, x) 227 | 228 | # return out + x 229 | 230 | 231 | class FeatureFusionBlock_custom(nn.Module): 232 | """Feature fusion block. 233 | """ 234 | 235 | def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True): 236 | """Init. 
237 | 238 | Args: 239 | features (int): number of features 240 | """ 241 | super(FeatureFusionBlock_custom, self).__init__() 242 | 243 | self.deconv = deconv 244 | self.align_corners = align_corners 245 | 246 | self.groups = 1 247 | 248 | self.expand = expand 249 | out_features = features 250 | if self.expand == True: 251 | out_features = features // 2 252 | 253 | self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) 254 | 255 | self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) 256 | self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) 257 | 258 | self.skip_add = nn.quantized.FloatFunctional() 259 | 260 | def forward(self, *xs): 261 | """Forward pass. 262 | 263 | Returns: 264 | tensor: output 265 | """ 266 | output = xs[0] 267 | 268 | if len(xs) == 2: 269 | res = self.resConfUnit1(xs[1]) 270 | output = self.skip_add.add(output, res) 271 | # output += res 272 | 273 | output = self.resConfUnit2(output) 274 | 275 | output = nn.functional.interpolate( 276 | output, scale_factor=2, mode="bilinear", align_corners=self.align_corners 277 | ) 278 | 279 | output = self.out_conv(output) 280 | 281 | return output 282 | -------------------------------------------------------------------------------- /src/models/conv2d_layers.py: -------------------------------------------------------------------------------- 1 | """ Conv2D w/ SAME padding, CondConv, MixedConv 2 | 3 | A collection of conv layers and padding helpers needed by EfficientNet, MixNet, and 4 | MobileNetV3 models that maintain weight compatibility with original Tensorflow models. 5 | 6 | Copyright 2020 Ross Wightman 7 | """ 8 | import collections.abc 9 | import math 10 | from functools import partial 11 | from itertools import repeat 12 | from typing import Tuple, Optional 13 | 14 | import numpy as np 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | 19 | # From PyTorch internals 20 | def _ntuple(n): 21 | def parse(x): 22 | if isinstance(x, collections.abc.Iterable): 23 | return x 24 | return tuple(repeat(x, n)) 25 | return parse 26 | 27 | 28 | _single = _ntuple(1) 29 | _pair = _ntuple(2) 30 | _triple = _ntuple(3) 31 | _quadruple = _ntuple(4) 32 | 33 | 34 | def _is_static_pad(kernel_size, stride=1, dilation=1, **_): 35 | return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 36 | 37 | 38 | def _get_padding(kernel_size, stride=1, dilation=1, **_): 39 | padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 40 | return padding 41 | 42 | 43 | def _calc_same_pad(i: int, k: int, s: int, d: int): 44 | return max((-(i // -s) - 1) * s + (k - 1) * d + 1 - i, 0) 45 | 46 | 47 | def _same_pad_arg(input_size, kernel_size, stride, dilation): 48 | ih, iw = input_size 49 | kh, kw = kernel_size 50 | pad_h = _calc_same_pad(ih, kh, stride[0], dilation[0]) 51 | pad_w = _calc_same_pad(iw, kw, stride[1], dilation[1]) 52 | return [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2] 53 | 54 | 55 | def _split_channels(num_chan, num_groups): 56 | split = [num_chan // num_groups for _ in range(num_groups)] 57 | split[0] += num_chan - sum(split) 58 | return split 59 | 60 | 61 | def conv2d_same( 62 | x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1), 63 | padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1): 64 | ih, iw = x.size()[-2:] 65 | kh, kw = weight.size()[-2:] 66 | pad_h = _calc_same_pad(ih, kh, stride[0], dilation[0]) 67 | pad_w = 
_calc_same_pad(iw, kw, stride[1], dilation[1]) 68 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]) 69 | return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups) 70 | 71 | 72 | class Conv2dSame(nn.Conv2d): 73 | """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions 74 | """ 75 | 76 | # pylint: disable=unused-argument 77 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, 78 | padding=0, dilation=1, groups=1, bias=True): 79 | super(Conv2dSame, self).__init__( 80 | in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) 81 | 82 | def forward(self, x): 83 | return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 84 | 85 | 86 | class Conv2dSameExport(nn.Conv2d): 87 | """ ONNX export friendly Tensorflow like 'SAME' convolution wrapper for 2D convolutions 88 | 89 | NOTE: This does not currently work with torch.jit.script 90 | """ 91 | 92 | # pylint: disable=unused-argument 93 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True): 94 | super(Conv2dSameExport, self).__init__( 95 | in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) 96 | self.pad = None 97 | self.pad_input_size = (0, 0) 98 | 99 | def forward(self, x): 100 | input_size = x.size()[-2:] 101 | if self.pad is None: 102 | pad_arg = _same_pad_arg(input_size, self.weight.size()[-2:], self.stride, self.dilation) 103 | self.pad = nn.ZeroPad2d(pad_arg) 104 | self.pad_input_size = input_size 105 | 106 | if self.pad is not None: 107 | x = self.pad(x) 108 | return F.conv2d( 109 | x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 110 | 111 | 112 | def get_padding_value(padding, kernel_size, **kwargs): 113 | dynamic = False 114 | if isinstance(padding, str): 115 | # for any string padding, the padding will be calculated for you, one of three ways 116 | padding = padding.lower() 117 | if padding == 'same': 118 | # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact 119 | if _is_static_pad(kernel_size, **kwargs): 120 | # static case, no extra overhead 121 | padding = _get_padding(kernel_size, **kwargs) 122 | else: 123 | # dynamic padding 124 | padding = 0 125 | dynamic = True 126 | elif padding == 'valid': 127 | # 'VALID' padding, same as padding=0 128 | padding = 0 129 | else: 130 | # Default to PyTorch style 'same'-ish symmetric padding 131 | padding = _get_padding(kernel_size, **kwargs) 132 | return padding, dynamic 133 | 134 | 135 | def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs): 136 | padding = kwargs.pop('padding', '') 137 | kwargs.setdefault('bias', False) 138 | padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs) 139 | if is_dynamic: 140 | if is_exportable(): 141 | assert not is_scriptable() 142 | return Conv2dSameExport(in_chs, out_chs, kernel_size, **kwargs) 143 | else: 144 | return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs) 145 | else: 146 | return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs) 147 | 148 | 149 | class MixedConv2d(nn.ModuleDict): 150 | """ Mixed Grouped Convolution 151 | Based on MDConv and GroupedConv in MixNet impl: 152 | https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py 153 | """ 154 | 155 | def __init__(self, in_channels, out_channels, kernel_size=3, 156 | stride=1, padding='', dilation=1, depthwise=False, **kwargs): 157 | super(MixedConv2d, 
self).__init__() 158 | 159 | kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size] 160 | num_groups = len(kernel_size) 161 | in_splits = _split_channels(in_channels, num_groups) 162 | out_splits = _split_channels(out_channels, num_groups) 163 | self.in_channels = sum(in_splits) 164 | self.out_channels = sum(out_splits) 165 | for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)): 166 | conv_groups = out_ch if depthwise else 1 167 | self.add_module( 168 | str(idx), 169 | create_conv2d_pad( 170 | in_ch, out_ch, k, stride=stride, 171 | padding=padding, dilation=dilation, groups=conv_groups, **kwargs) 172 | ) 173 | self.splits = in_splits 174 | 175 | def forward(self, x): 176 | x_split = torch.split(x, self.splits, 1) 177 | x_out = [conv(x_split[i]) for i, conv in enumerate(self.values())] 178 | x = torch.cat(x_out, 1) 179 | return x 180 | 181 | 182 | def get_condconv_initializer(initializer, num_experts, expert_shape): 183 | def condconv_initializer(weight): 184 | """CondConv initializer function.""" 185 | num_params = np.prod(expert_shape) 186 | if (len(weight.shape) != 2 or weight.shape[0] != num_experts or 187 | weight.shape[1] != num_params): 188 | raise (ValueError( 189 | 'CondConv variables must have shape [num_experts, num_params]')) 190 | for i in range(num_experts): 191 | initializer(weight[i].view(expert_shape)) 192 | return condconv_initializer 193 | 194 | 195 | class CondConv2d(nn.Module): 196 | """ Conditional Convolution 197 | Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py 198 | 199 | Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion: 200 | https://github.com/pytorch/pytorch/issues/17983 201 | """ 202 | __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding'] 203 | 204 | def __init__(self, in_channels, out_channels, kernel_size=3, 205 | stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4): 206 | super(CondConv2d, self).__init__() 207 | 208 | self.in_channels = in_channels 209 | self.out_channels = out_channels 210 | self.kernel_size = _pair(kernel_size) 211 | self.stride = _pair(stride) 212 | padding_val, is_padding_dynamic = get_padding_value( 213 | padding, kernel_size, stride=stride, dilation=dilation) 214 | self.dynamic_padding = is_padding_dynamic # if in forward to work with torchscript 215 | self.padding = _pair(padding_val) 216 | self.dilation = _pair(dilation) 217 | self.groups = groups 218 | self.num_experts = num_experts 219 | 220 | self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size 221 | weight_num_param = 1 222 | for wd in self.weight_shape: 223 | weight_num_param *= wd 224 | self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param)) 225 | 226 | if bias: 227 | self.bias_shape = (self.out_channels,) 228 | self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels)) 229 | else: 230 | self.register_parameter('bias', None) 231 | 232 | self.reset_parameters() 233 | 234 | def reset_parameters(self): 235 | init_weight = get_condconv_initializer( 236 | partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape) 237 | init_weight(self.weight) 238 | if self.bias is not None: 239 | fan_in = np.prod(self.weight_shape[1:]) 240 | bound = 1 / math.sqrt(fan_in) 241 | init_bias = get_condconv_initializer( 242 | partial(nn.init.uniform_, a=-bound, b=bound), 
self.num_experts, self.bias_shape) 243 | init_bias(self.bias) 244 | 245 | def forward(self, x, routing_weights): 246 | B, C, H, W = x.shape 247 | weight = torch.matmul(routing_weights, self.weight) 248 | new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size 249 | weight = weight.view(new_weight_shape) 250 | bias = None 251 | if self.bias is not None: 252 | bias = torch.matmul(routing_weights, self.bias) 253 | bias = bias.view(B * self.out_channels) 254 | # move batch elements with channels so each batch element can be efficiently convolved with separate kernel 255 | x = x.view(1, B * C, H, W) 256 | if self.dynamic_padding: 257 | out = conv2d_same( 258 | x, weight, bias, stride=self.stride, padding=self.padding, 259 | dilation=self.dilation, groups=self.groups * B) 260 | else: 261 | out = F.conv2d( 262 | x, weight, bias, stride=self.stride, padding=self.padding, 263 | dilation=self.dilation, groups=self.groups * B) 264 | out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1]) 265 | 266 | # Literal port (from TF definition) 267 | # x = torch.split(x, 1, 0) 268 | # weight = torch.split(weight, 1, 0) 269 | # if self.bias is not None: 270 | # bias = torch.matmul(routing_weights, self.bias) 271 | # bias = torch.split(bias, 1, 0) 272 | # else: 273 | # bias = [None] * B 274 | # out = [] 275 | # for xi, wi, bi in zip(x, weight, bias): 276 | # wi = wi.view(*self.weight_shape) 277 | # if bi is not None: 278 | # bi = bi.view(*self.bias_shape) 279 | # out.append(self.conv_fn( 280 | # xi, wi, bi, stride=self.stride, padding=self.padding, 281 | # dilation=self.dilation, groups=self.groups)) 282 | # out = torch.cat(out, 0) 283 | return out 284 | 285 | 286 | def select_conv2d(in_chs, out_chs, kernel_size, **kwargs): 287 | assert 'groups' not in kwargs # only use 'depthwise' bool arg 288 | if isinstance(kernel_size, list): 289 | assert 'num_experts' not in kwargs # MixNet + CondConv combo not supported currently 290 | # We're going to use only lists for defining the MixedConv2d kernel groups, 291 | # ints, tuples, other iterables will continue to pass to normal conv and specify h, w. 
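# Illustrative call (channel counts and kernel sizes are arbitrary):
#   select_conv2d(32, 64, kernel_size=[3, 5, 7], stride=1, padding='same')
# splits the 32 input / 64 output channels into one group per kernel size via MixedConv2d.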
292 | m = MixedConv2d(in_chs, out_chs, kernel_size, **kwargs) 293 | else: 294 | depthwise = kwargs.pop('depthwise', False) 295 | groups = out_chs if depthwise else 1 296 | if 'num_experts' in kwargs and kwargs['num_experts'] > 0: 297 | m = CondConv2d(in_chs, out_chs, kernel_size, groups=groups, **kwargs) 298 | else: 299 | m = create_conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs) 300 | return m -------------------------------------------------------------------------------- /src/models/efficientlitesld.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .blocks import _make_encoder 5 | 6 | 7 | class ASPP(nn.Module): 8 | def __init__(self, in_ch, d1, d2, d3, d4, reduction=4): 9 | super(ASPP, self).__init__() 10 | self.aspp_d1 = nn.Sequential( 11 | nn.Conv2d(in_ch, in_ch // reduction, 3, padding=d1, dilation=d1), 12 | nn.BatchNorm2d(in_ch // reduction), 13 | nn.ReLU(inplace=True) 14 | ) 15 | self.aspp_d2 = nn.Sequential( 16 | nn.Conv2d(in_ch, in_ch // reduction, 3, padding=d2, dilation=d2), 17 | nn.BatchNorm2d(in_ch // reduction), 18 | nn.ReLU(inplace=True) 19 | ) 20 | self.aspp_d3 = nn.Sequential( 21 | nn.Conv2d(in_ch, in_ch // reduction, 3, padding=d3, dilation=d3), 22 | nn.BatchNorm2d(in_ch // reduction), 23 | nn.ReLU(inplace=True) 24 | ) 25 | 26 | self.aspp_d4 = nn.Sequential( 27 | nn.Conv2d(in_ch, in_ch // reduction, 3, padding=d4, dilation=d4), 28 | nn.BatchNorm2d(in_ch // reduction), 29 | nn.ReLU(inplace=True) 30 | ) 31 | 32 | def forward(self, x): 33 | d1 = self.aspp_d1(x) 34 | d2 = self.aspp_d2(x) 35 | d3 = self.aspp_d3(x) 36 | d4 = self.aspp_d4(x) 37 | return torch.cat((d1, d2, d3, d4), dim=1) 38 | 39 | 40 | class EfficientNetSLD(torch.nn.Module): 41 | """Network for monocular depth estimation. 42 | """ 43 | 44 | def __init__(self, path=None, num_landmarks=200, output_downsample=4, features=320): 45 | """Init. 46 | 47 | Args: 48 | path (str, optional): Path to saved model. Defaults to None. 49 | features (int, optional): Number of features. Defaults to 256. 50 | backbone (str, optional): Backbone network for encoder. Defaults to efficientnetlite0 51 | """ 52 | super(EfficientNetSLD, self).__init__() 53 | 54 | self.pretrained, _ = _make_encoder(use_pretrained=True, output_downsample=output_downsample) 55 | 56 | self.aspp = nn.Sequential( 57 | ASPP(in_ch=features, d1=1, d2=2, d3=3, d4=4, reduction=4), 58 | ) 59 | 60 | self.heatmap_outputs_res1 = nn.Sequential( 61 | nn.Conv2d(features, num_landmarks, kernel_size=1, stride=1, padding=0) 62 | ) 63 | self.heatmap_outputs_res2 = None 64 | 65 | if output_downsample == 2: 66 | input_channels = features + num_landmarks 67 | output_channels = features 68 | 69 | self.heatmap_features_res2 = nn.Sequential(nn.ConvTranspose2d(in_channels=input_channels, 70 | out_channels=output_channels, 71 | kernel_size=4, stride=2, padding=1, 72 | bias=False), 73 | nn.BatchNorm2d(output_channels), 74 | nn.ReLU(inplace=True) 75 | ) 76 | self.heatmap_outputs_res2 = nn.Conv2d(output_channels, num_landmarks, kernel_size=1, stride=1, bias=False) 77 | 78 | if path: 79 | self.load(path) 80 | 81 | def forward(self, x): 82 | """Forward pass. 
83 | 84 | Args: 85 | x (tensor): input data (image) 86 | 87 | Returns: 88 | Heatmap prediction 89 | ['1']: quarter of input spatial dimension 90 | ['2']: half of input spatial dimension 91 | """ 92 | 93 | layer_1 = self.pretrained.layer1(x) 94 | layer_2 = self.pretrained.layer2(layer_1) 95 | layer_3 = self.pretrained.layer3(layer_2) 96 | layer_4 = self.pretrained.layer4(layer_3) 97 | y1 = self.aspp(layer_4) 98 | z1 = self.heatmap_outputs_res1(y1) 99 | 100 | z2 = None 101 | if self.heatmap_outputs_res2 is not None: 102 | y2 = self.heatmap_features_res2(torch.cat((y1, z1), dim=1)) 103 | z2 = self.heatmap_outputs_res2(y2) 104 | 105 | return {'1': z1, '2': z2} 106 | -------------------------------------------------------------------------------- /src/pretrained_efficientnetlite0.net: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SceneLandmarkLocalization/cb1d86f982d15c462e6a1d04c730a05d5efdd0a6/src/pretrained_efficientnetlite0.net -------------------------------------------------------------------------------- /src/requirements.txt: -------------------------------------------------------------------------------- 1 | # Scene Landmarks Detector Requirements 2 | # Usage: pip install -r requirements.txt 3 | 4 | argparse 5 | matplotlib>=3.2.2 6 | numpy>=1.22.3 7 | Pillow>=8.2.0 8 | scipy>=1.6.2 9 | open3d 10 | #torch==1.10.0+cu113 11 | #torchvision==0.11.1+cu113 12 | #torchaudio==0.10.0+cu113 13 | tqdm>=4.59.0 14 | geffnet 15 | -------------------------------------------------------------------------------- /src/run_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import statistics as st 3 | import sys 4 | import torch 5 | 6 | if __name__ == '__main__': 7 | 8 | home_dir = os.path.expanduser("~") 9 | # specify dataset path, location of checkpoints and the experiment name. 10 | checkpoint_dir = os.path.join(home_dir, 'data/checkpoints') 11 | dataset_dir = os.path.join(home_dir, 'data/indoor6') 12 | experiment = '1000-125_v10' 13 | 14 | # run inference for all six scenes of the indoor6 dataset 15 | for scene_name in ['scene1', 'scene2a', 'scene3', 'scene4a', 'scene5', 'scene6']: 16 | command = 'python ./local_inference.py --experiment_file %s_%s.txt --dataset_dir %s --checkpoint_dir %s' % (scene_name, experiment, dataset_dir, checkpoint_dir) 17 | os.system(command) 18 | 19 | # calculate metrics 20 | t1 = [] 21 | t2 = [] 22 | for scene_name in ['scene1', 'scene2a', 'scene3', 'scene4a', 'scene5', 'scene6']: 23 | subfolder = '%s_%s' % (scene_name, experiment) 24 | mfn = os.path.join(checkpoint_dir, subfolder, "metrics.txt") 25 | mfd = open(mfn, 'r') 26 | idx = 0 27 | for line in mfd.readlines(): 28 | if (idx % 2 == 0): 29 | t1.append(float(line)) 30 | else: 31 | t2.append(float(line)) 32 | idx+=1 33 | mfd.close(); 34 | 35 | print(t1) 36 | print(t2) 37 | metricPcnt = 100.0 * st.fmean(t1) 38 | print(' mean = %s pcnt' % str(metricPcnt)) 39 | print(' rate = %s imgs./sec.' 
% str(st.fmean(t2))) 40 | 41 | fname = 'RESULTS-%s.txt' % experiment 42 | ffn = os.path.join(checkpoint_dir, fname) 43 | ffd = open(ffn, 'w') 44 | ffd.write(f"{metricPcnt}\n{st.fmean(t2)}\n") 45 | ffd.close() -------------------------------------------------------------------------------- /src/run_training.py: -------------------------------------------------------------------------------- 1 | from math import exp 2 | import os 3 | import statistics as st 4 | from tabnanny import check 5 | 6 | if __name__ == '__main__': 7 | 8 | home_dir = os.path.expanduser("~") 9 | 10 | # Specify the paths to the dataset and the output folders. 11 | dataset_dir = os.path.join(home_dir, "data/indoor6") 12 | output_dir = os.path.join(home_dir, "data/outputs") 13 | 14 | # Specify a version number which can be incremented when training multiple variants on 15 | # the same scene. 16 | version_no = 10 17 | 18 | # Specify the scene name 19 | scene_name = 'scene6' 20 | 21 | # Specify the landmark file 22 | landmark_config = 'landmarks/landmarks-1000v10' 23 | 24 | # Specify the visibility file 25 | visibility_config = 'landmarks/visibility-1000v10_depth_normal' 26 | 27 | # Specify the batch size for the minibatches used for training. 28 | training_batch_size = 8 29 | 30 | # Specify the downsample factor for the output heatmap. 31 | output_downsample = 8 32 | 33 | # Specify the number of epochs to use during training. 34 | num_epochs = 200 35 | 36 | # Specify the number of landmarks and the block size. The number of landmarks should be 37 | # identical to the number of landmarks in the landmark file specified for the 38 | # landmark_config parameter. 39 | num_landmarks = 1000 40 | 41 | # Specify the number of landmarks that will be present in each subset when the set of 42 | # landmarks is partitioned into mutually exclusive subsets. The value specified here 43 | # should exactly divide the landmark count. For example, when num_landmarks = 1000 and 44 | # block_size = 125, we get 1000/125 = 8 subsets of landmarks. 45 | block_size = 125 46 | 47 | # Specify which subset you want to train the model for. For example, when 48 | # num_landmarks = 1000 and block_size = 125, then subset_index = 0 indicates that the 49 | # range of indices of landmarks in the subset would be [0, 125). If subset_index = 1, 50 | # then the range of indices would be [125, 250). 51 | subset_index = 0 52 | 53 | # Format the experiment name. 54 | experiment_name = '%s_%d-%d_v%d' % (scene_name, num_landmarks, block_size, version_no) 55 | 56 | # Format the model_dir string 57 | landmark_start_index = subset_index * block_size 58 | landmark_stop_index = (subset_index + 1) * block_size 59 | 60 | if landmark_start_index < 0 or landmark_stop_index > num_landmarks: 61 | raise Exception('landmark indices are outside valid range!') 62 | else: 63 | tmp = '%s-%03d-%03d' % (scene_name, landmark_start_index, landmark_stop_index) 64 | model_dir = os.path.join(output_dir, experiment_name, tmp) 65 | 66 | # Create the model_dir folder. 67 | os.makedirs(model_dir, exist_ok=True) 68 | 69 | # Create the command line string for the training job. 
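# With the settings above, the assembled command looks roughly like the following
# (a single line in practice; paths are illustrative and depend on the home directory):
#   python ./local_training.py --dataset_dir ~/data/indoor6 --scene_id scene6
#       --experiment_file scene6_1000-125_v10.txt --num_landmarks 1000 --block_size 125
#       --landmark_config landmarks/landmarks-1000v10
#       --visibility_config landmarks/visibility-1000v10_depth_normal --subset_index 0
#       --output_dir ~/data/outputs --model_dir ~/data/outputs/scene6_1000-125_v10/scene6-000-125
#       --training_batch_size 8 --output_downsample 8 --num_epochs 200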
70 | cmd = 'python ./local_training.py' 71 | cmd += ' --dataset_dir %s' % dataset_dir 72 | cmd += ' --scene_id %s' % scene_name 73 | cmd += ' --experiment_file %s.txt' % experiment_name 74 | cmd += ' --num_landmarks %d' % num_landmarks 75 | cmd += ' --block_size %d' % block_size 76 | cmd += ' --landmark_config %s' % landmark_config 77 | cmd += ' --visibility_config %s' % visibility_config 78 | cmd += ' --subset_index %d' % subset_index 79 | cmd += ' --output_dir %s' % output_dir 80 | cmd += ' --model_dir %s' % model_dir 81 | cmd += ' --training_batch_size %d' % training_batch_size 82 | cmd += ' --output_downsample %d' % output_downsample 83 | cmd += ' --num_epochs %d' % num_epochs 84 | 85 | # Launch training 86 | os.system(cmd) 87 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import logging 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import os 6 | import pickle 7 | import torch 8 | from torch.utils.data import DataLoader 9 | from tqdm import tqdm 10 | 11 | from inference import * 12 | from dataloader.indoor6 import * 13 | from models.efficientlitesld import EfficientNetSLD 14 | from utils.heatmap import generate_heat_maps_gpu 15 | 16 | 17 | def plotting(ROOT_FOLDER): 18 | data = pickle.load(open('%s/stats.pkl' % ROOT_FOLDER, 'rb')) 19 | fig, axs = plt.subplots(4, 1) 20 | 21 | t = 0 22 | s = [] 23 | epoch = 0 24 | for i in range(len(data['train'])-1): 25 | if data['train'][i+1]['ep'] == epoch + 1: 26 | epoch += 1 27 | else: 28 | t += 1 29 | s.append(data['train'][i]['loss']) 30 | 31 | t = np.arange(0, t) 32 | s = np.array(s) 33 | s = np.convolve(s, np.ones(10)/10., mode='same') 34 | 35 | axs[0].plot(t, np.log(s)) 36 | axs[0].set(xlabel='iterations', ylabel='loss', title='') 37 | axs[0].grid() 38 | 39 | max_grad = np.array([data['train'][i]['max_grad'] for i in range(len(data['train']))]) 40 | axs[1].plot(np.arange(0, len(max_grad)), np.log10(max_grad)) 41 | axs[1].set(xlabel='iterations', ylabel='max gradient', title='') 42 | axs[1].grid() 43 | 44 | t = np.array([data['eval'][i]['ep'] for i in range(len(data['eval']))]) 45 | s = np.array([np.median(data['eval'][i]['pixel_error']) for i in range(len(data['eval']))]) 46 | axs[2].plot(t, s) 47 | axs[2].set(xlabel='epoch', ylabel='Pixel error', title='') 48 | axs[2].grid() 49 | axs[2].set_yticks(np.arange(0, 20, 5), minor=False) 50 | axs[2].set_ylim(0, 20) 51 | 52 | r = np.array([data['eval'][i]['recall'] for i in range(len(data['eval']))]) 53 | axs[3].plot(t, r) 54 | axs[3].set(xlabel='epoch', ylabel='recall', title='') 55 | axs[3].grid() 56 | 57 | plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.8, hspace=1.0) 58 | plt.close() 59 | fig.savefig('%s/curve_train_test.png' % ROOT_FOLDER, format='png', dpi=120) 60 | 61 | 62 | def train(opt): 63 | 64 | if not os.path.exists(opt.output_folder): 65 | os.makedirs(opt.output_folder) 66 | 67 | logging.basicConfig(filename='%s/training.log' % opt.output_folder, filemode='a', level=logging.DEBUG, format='') 68 | logging.info("Scene Landmark Detector Training") 69 | print('Start training ...') 70 | 71 | stats_pkl_logging = {'train': [], 'eval': []} 72 | 73 | device = opt.gpu_device 74 | 75 | assert len(opt.landmark_indices) == 0 or len(opt.landmark_indices) == 2, "landmark indices must be empty or length 2" 76 | 77 | train_dataset = Indoor6(landmark_idx=np.arange(opt.landmark_indices[0], 78 | 
opt.landmark_indices[1]) if len(opt.landmark_indices) == 2 else [None], 79 | scene_id=opt.scene_id, 80 | mode='train', 81 | root_folder=opt.dataset_folder, 82 | input_image_downsample=2, 83 | landmark_config=opt.landmark_config, 84 | visibility_config=opt.visibility_config, 85 | skip_image_index=1) 86 | 87 | train_dataloader = DataLoader(dataset=train_dataset, num_workers=4, batch_size=opt.training_batch_size, shuffle=True, 88 | pin_memory=True) 89 | 90 | ## Save the trained landmark configurations 91 | np.savetxt(os.path.join(opt.output_folder, 'landmarks.txt'), train_dataset.landmark) 92 | np.savetxt(os.path.join(opt.output_folder, 'visibility.txt'), train_dataset.visibility, fmt='%d') 93 | 94 | num_landmarks = train_dataset.landmark.shape[1] 95 | 96 | if opt.model == 'efficientnet': 97 | cnn = EfficientNetSLD(num_landmarks=num_landmarks, output_downsample=opt.output_downsample).to(device=device) 98 | 99 | optimizer = torch.optim.AdamW(cnn.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-4, weight_decay=0.01) 100 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5) 101 | 102 | lowest_median_angular_error = 1e6 103 | 104 | for epoch in range(opt.num_epochs): 105 | # Training 106 | training_loss = 0 107 | for idx, batch in enumerate(tqdm(train_dataloader)): 108 | cnn.train() 109 | 110 | images = batch['image'].to(device=device) 111 | B, _, H, W = images.shape 112 | visibility = batch['visibility'].reshape(B, num_landmarks).to(device=device) 113 | landmark2d = batch['landmark2d'].reshape(B, 2, num_landmarks).to(device=device) 114 | 115 | # Configure the resolution of the output heatmap 116 | landmark2d /= opt.output_downsample 117 | heat_map_size = [H // opt.output_downsample, W // opt.output_downsample] 118 | 119 | gt = generate_heat_maps_gpu(landmark2d, 120 | visibility, 121 | heat_map_size, 122 | sigma=torch.tensor([5.], dtype=torch.float, device=device, requires_grad=False)) 123 | gt.requires_grad = False 124 | 125 | # Clear gradient 126 | optimizer.zero_grad() 127 | 128 | # CNN forward pass 129 | pred = cnn(images)['1'] 130 | 131 | # Compute loss and do backward pass 132 | losses = torch.sum((pred[visibility != 0.5] - gt[visibility != 0.5]) ** 2) 133 | 134 | training_loss += losses.detach().clone().item() 135 | losses.backward() 136 | optimizer.step() 137 | 138 | logging.info('epoch %d, iter %d, loss %4.4f' % (epoch, idx, losses.item())) 139 | stats_pkl_logging['train'].append({'ep': epoch, 'iter': idx, 'loss': losses.item()}) 140 | 141 | # Saving the ckpt 142 | path = '%s/model-latest.ckpt' % (opt.output_folder) 143 | torch.save(cnn.state_dict(), path) 144 | 145 | if scheduler.get_last_lr()[-1] > 5e-5: 146 | scheduler.step() 147 | 148 | opt.pretrained_model = path 149 | eval_stats = inference(opt, opt_tight_thr=1e-3, minimal_tight_thr=1e-3, mode='val') 150 | 151 | median_angular_error = np.median(eval_stats['angular_error']) 152 | 153 | if (median_angular_error < lowest_median_angular_error): 154 | lowest_median_angular_error = median_angular_error 155 | path = '%s/model-best_median.ckpt' % (opt.output_folder) 156 | torch.save(cnn.state_dict(), path) 157 | 158 | # date time 159 | ts = datetime.now().timestamp() 160 | dt = datetime.fromtimestamp(ts) 161 | datestring = dt.strftime("%Y-%m-%d_%H-%M-%S") 162 | 163 | # Print, log and update plot 164 | stats_pkl_logging['eval'].append( 165 | {'ep': epoch, 166 | 'angular_error': eval_stats['angular_error'], 167 | 'pixel_error': eval_stats['pixel_error'], 168 | 'recall': eval_stats['r5p5'] 169 | }) 170 | 171 | str_log =
'epoch %3d: [%s] ' \ 172 | 'tr_loss= %10.2f, ' \ 173 | 'lowest_median= %8.4f deg. ' \ 174 | 'recall= %2.4f ' \ 175 | 'angular-err(deg.)= [%7.4f %7.4f %7.4f] ' \ 176 | 'pixel-err= [%4.3f %4.3f %4.3f] [mean/med./min] ' % (epoch, datestring, training_loss, 177 | lowest_median_angular_error, 178 | eval_stats['r5p5'], 179 | np.mean(eval_stats['angular_error']), 180 | np.median(eval_stats['angular_error']), 181 | np.min(eval_stats['angular_error']), 182 | np.mean(eval_stats['pixel_error']), 183 | np.median(eval_stats['pixel_error']), 184 | np.min(eval_stats['pixel_error'])) 185 | print(str_log) 186 | logging.info(str_log) 187 | 188 | with open('%s/stats.pkl' % opt.output_folder, 'wb') as f: 189 | pickle.dump(stats_pkl_logging, f) 190 | plotting(opt.output_folder) 191 | 192 | 193 | def train_patches(opt): 194 | 195 | if not os.path.exists(opt.output_folder): 196 | os.makedirs(opt.output_folder) 197 | 198 | logging.basicConfig(filename='%s/training.log' % opt.output_folder, filemode='a', level=logging.DEBUG, format='') 199 | logging.info("Scene Landmark Detector Training Patches") 200 | stats_pkl_logging = {'train': [], 'eval': []} 201 | 202 | device = opt.gpu_device 203 | 204 | assert len(opt.landmark_indices) == 0 or len(opt.landmark_indices) == 2, "landmark indices must be empty or length 2" 205 | train_dataset = Indoor6Patches(landmark_idx=np.arange(opt.landmark_indices[0], 206 | opt.landmark_indices[1]) if len(opt.landmark_indices) == 2 else [None], 207 | scene_id=opt.scene_id, 208 | mode='train', 209 | root_folder=opt.dataset_folder, 210 | input_image_downsample=2, 211 | landmark_config=opt.landmark_config, 212 | visibility_config=opt.visibility_config, 213 | skip_image_index=1) 214 | 215 | train_dataloader = DataLoader(dataset=train_dataset, num_workers=4, batch_size=opt.training_batch_size, shuffle=True, 216 | pin_memory=True) 217 | 218 | ## Save the trained landmark configurations 219 | np.savetxt(os.path.join(opt.output_folder, 'landmarks.txt'), train_dataset.landmark) 220 | np.savetxt(os.path.join(opt.output_folder, 'visibility.txt'), train_dataset.visibility, fmt='%d') 221 | 222 | num_landmarks = train_dataset.landmark.shape[1] 223 | 224 | if opt.model == 'efficientnet': 225 | cnn = EfficientNetSLD(num_landmarks=num_landmarks, output_downsample=opt.output_downsample).to(device=device) 226 | 227 | optimizer = torch.optim.AdamW(cnn.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-4, weight_decay=0.01) 228 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.5) 229 | 230 | lowest_median_angular_error = 1e6 231 | 232 | for epoch in range(opt.num_epochs): 233 | # Training 234 | training_loss = 0 235 | for idx, batch in enumerate(tqdm(train_dataloader)): 236 | 237 | cnn.train() 238 | 239 | B1, B2, _, H, W = batch['patches'].shape 240 | B = B1 * B2 241 | patches = batch['patches'] 242 | visibility = batch['visibility'] 243 | landmark2d = batch['landmark2d'] 244 | 245 | # highest supported precision for MPS is FP32 246 | if device.lower() == 'mps': 247 | patches = patches.float() 248 | visibility = visibility.float() 249 | landmark2d = landmark2d.float() 250 | 251 | patches = patches.reshape(B, 3, H, W).to(device=device) 252 | visibility = visibility.reshape(B, num_landmarks).to(device=device) 253 | landmark2d = landmark2d.reshape(B, 2, num_landmarks).to(device=device) 254 | 255 | # Batch randomization 256 | 257 | input_batch_random = np.random.permutation(B) 258 | landmark2d_rand = [landmark2d[input_batch_random[b:b + 1]] for b in range(B)] 259 | patches_rand = 
[patches[input_batch_random[b:b + 1]] for b in range(B)] 260 | visibility_rand = [visibility[input_batch_random[b:b + 1]] for b in range(B)] 261 | 262 | landmark2d_rand = torch.cat(landmark2d_rand, dim=0) 263 | patches_rand = torch.cat(patches_rand, dim=0) 264 | visibility_rand = torch.cat(visibility_rand, dim=0) 265 | 266 | # Configure the resolution of the output heatmap 267 | landmark2d_rand /= opt.output_downsample 268 | heat_map_size = [H // opt.output_downsample, W // opt.output_downsample] 269 | 270 | gt = generate_heat_maps_gpu(landmark2d_rand, 271 | visibility_rand, 272 | heat_map_size, 273 | sigma=torch.tensor([20. / opt.output_downsample], dtype=torch.float, device=device, requires_grad=False)) 274 | gt.requires_grad = False 275 | 276 | # Clear gradient 277 | optimizer.zero_grad() 278 | 279 | # CNN forward pass 280 | pred = cnn(patches_rand)['1'] 281 | 282 | # Compute loss and do backward pass 283 | losses = torch.sum((pred[visibility_rand != 0.5] - gt[visibility_rand != 0.5]) ** 2) 284 | 285 | training_loss += losses.detach().clone().item() 286 | losses.backward() 287 | 288 | m = torch.tensor([0.0]).to(device) 289 | for p in cnn.parameters(): 290 | m = torch.max(torch.max(torch.abs(p.grad.data)), m) 291 | 292 | ## Skip the update for batches with abnormally large gradients and fall back to the best checkpoint 293 | if epoch == 0 or (epoch > 0 and m < 1e4): 294 | optimizer.step() 295 | else: 296 | cnn.load_state_dict(torch.load('%s/model-best_median.ckpt' % (opt.output_folder))) 297 | cnn.to(device=device) 298 | 299 | logging.info('epoch %d, iter %d, loss %4.4f' % (epoch, idx, losses.item())) 300 | stats_pkl_logging['train'].append({'ep': epoch, 'iter': idx, 'loss': losses.item(), 'max_grad': m.cpu().numpy()}) 301 | 302 | # Saving the ckpt 303 | path = '%s/model-latest.ckpt' % (opt.output_folder) 304 | torch.save(cnn.state_dict(), path) 305 | 306 | if scheduler.get_last_lr()[-1] > 5e-5: 307 | scheduler.step() 308 | 309 | opt.pretrained_model = [path] 310 | eval_stats = inference(opt, opt_tight_thr=1e-3, minimal_tight_thr=1e-3, mode='val') 311 | 312 | median_angular_error = np.median(eval_stats['angular_error']) 313 | path = '%s/model-best_median.ckpt' % (opt.output_folder) 314 | 315 | if (median_angular_error < lowest_median_angular_error): 316 | lowest_median_angular_error = median_angular_error 317 | torch.save(cnn.state_dict(), path) 318 | 319 | if (not os.path.exists(path) and len(eval_stats['angular_error']) == 0): 320 | torch.save(cnn.state_dict(), path) 321 | 322 | # date time 323 | ts = datetime.now().timestamp() 324 | dt = datetime.fromtimestamp(ts) 325 | datestring = dt.strftime("%Y-%m-%d_%H-%M-%S") 326 | 327 | # Print, log and update plot 328 | stats_pkl_logging['eval'].append( 329 | {'ep': epoch, 330 | 'angular_error': eval_stats['angular_error'], 331 | 'pixel_error': eval_stats['pixel_error'], 332 | 'recall': eval_stats['r5p5'] 333 | }) 334 | 335 | 336 | try: 337 | str_log = 'epoch %3d: [%s] ' \ 338 | 'tr_loss= %10.2f, ' \ 339 | 'lowest_median= %8.4f deg. ' \ 340 | 'recall= %2.4f ' \ 341 | 'angular-err(deg.)= [%7.4f %7.4f %7.4f] ' \ 342 | 'pixel-err= [%4.3f %4.3f %4.3f] [mean/med./min] ' % (epoch, datestring, training_loss, 343 | lowest_median_angular_error, 344 | eval_stats['r5p5'], 345 | np.mean(eval_stats['angular_error']), 346 | np.median(eval_stats['angular_error']), 347 | np.min(eval_stats['angular_error']), 348 | np.mean(eval_stats['pixel_error']), 349 | np.median(eval_stats['pixel_error']), 350 | np.min(eval_stats['pixel_error'])) 351 | print(str_log) 352 | logging.info(str_log) 353 | except ValueError: #raised if array is empty.
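# Note: np.min() in the block above raises ValueError on an empty array, which happens
# when validation found no usable 2D-3D correspondences for this epoch.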
354 | str_log = 'epoch %3d: [%s] ' \ 355 | 'tr_loss= %10.2f, ' \ 356 | 'No correspondences found' % (epoch, datestring, training_loss) 357 | print(str_log) 358 | logging.info(str_log) 359 | 360 | with open('%s/stats.pkl' % opt.output_folder, 'wb') as f: 361 | pickle.dump(stats_pkl_logging, f) 362 | plotting(opt.output_folder) 363 | -------------------------------------------------------------------------------- /src/utils/generate_visibility_depth_normal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import fnmatch 4 | import numpy as np 5 | import open3d as o3d 6 | import os 7 | import pickle 8 | from PIL import Image 9 | from torch.utils.data import DataLoader 10 | from tqdm import tqdm 11 | 12 | import sys 13 | sys.path.append(os.path.join(sys.path[0], '..')) 14 | from dataloader.indoor6 import Indoor6 15 | 16 | def extract(opt): 17 | 18 | DATASET_FOLDER = os.path.join(opt.dataset_folder) 19 | 20 | test_dataset = Indoor6(scene_id=opt.scene_id, 21 | mode='all', 22 | root_folder=DATASET_FOLDER, 23 | input_image_downsample=1, 24 | landmark_config=opt.landmark_config, 25 | visibility_config=opt.visibility_config, 26 | skip_image_index=1) 27 | 28 | test_dataloader = DataLoader(dataset=test_dataset, num_workers=1, batch_size=1, shuffle=False, pin_memory=True) 29 | 30 | return test_dataloader, test_dataset 31 | 32 | 33 | if __name__ == '__main__': 34 | parser = argparse.ArgumentParser( 35 | description='Scene Landmark Detection', 36 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 37 | parser.add_argument( 38 | '--dataset_folder', type=str, required=False, 39 | help='Root directory, where all data is stored') 40 | parser.add_argument( 41 | '--output_folder', type=str, required=False, 42 | help='Output folder') 43 | parser.add_argument( 44 | '--landmark_config', type=str, default='landmarks/landmarks-300', 45 | help='Landmark configuration.') 46 | parser.add_argument( 47 | '--visibility_config', type=str, default='landmarks/visibility-300', 48 | help='Visibility configuration.') 49 | parser.add_argument( 50 | '--scene_id', type=str, default='scene1', 51 | help='Scene id') 52 | 53 | opt = parser.parse_args() 54 | monodepth_folder = os.path.join(opt.dataset_folder, opt.scene_id, 'depth') 55 | 56 | from read_write_models import * 57 | cameras, images, points = read_model(os.path.join(opt.dataset_folder, 'indoor6-colmap/%s/sparse/0' % opt.scene_id), ext='.bin') 58 | indoor6_name_2to_colmap_index = {} 59 | for k in images: 60 | indoor6_name_2to_colmap_index[images[k].name] = k 61 | # print(images[k]) 62 | 63 | dataloader, data = extract(opt) 64 | 65 | augmented_visibility = copy.deepcopy(data.visibility) 66 | monodepth_folder = os.path.join(opt.dataset_folder, 67 | opt.scene_id, 68 | 'depth') 69 | 70 | count_invalid_images = 0 71 | 72 | ############################################################## 73 | ### Creating depth images and augment visibility based on #### 74 | ### the consistency between depth and 3D points from colmap ## 75 | ############################################################## 76 | 77 | for idx, batch in enumerate(tqdm(dataloader)): 78 | _, _, H, W = batch['image'].shape 79 | # batch['intrinsic'] 80 | 81 | original_image_name = data.original_image_name(idx) 82 | colmap_index = indoor6_name_2to_colmap_index[original_image_name] 83 | if images[colmap_index].name != original_image_name: 84 | print('indoor6 name: ', data.image_files[idx], ', original name ', original_image_name) 85 | 86 | 87 | 
point3D_ids = images[colmap_index].point3D_ids 88 | 89 | K = batch['intrinsics'][0].cpu().numpy() 90 | R = batch['pose_gt'][0, :3, :3].cpu().numpy() 91 | t = batch['pose_gt'][0, :3, 3].cpu().numpy() 92 | 93 | xys = images[colmap_index].xys 94 | 95 | monoscaled_depth_path = os.path.join(monodepth_folder, data.image_files[idx].replace('.jpg', '.scaled_depth.npy')) 96 | dmonodense_scaled = None 97 | if os.path.exists(monoscaled_depth_path): 98 | dmonodense_scaled = np.load(monoscaled_depth_path) 99 | # else: 100 | # dmonodense = np.load(os.path.join(monodepth_folder, data.image_files[idx].replace('jpg', 'npy'))) 101 | 102 | # ds = np.zeros(len(point3D_ids)) 103 | # dmono = np.zeros(len(point3D_ids)) 104 | # validIdx = 0 105 | 106 | # for i, k in enumerate(point3D_ids): 107 | # if k != -1: 108 | # Cp = R @ points[k].xyz + t 109 | # xyz = K @ Cp 110 | # proj_x = xyz[0] / xyz[2] 111 | # proj_y = xyz[1] / xyz[2] 112 | 113 | # px = xys[i][0] 114 | # py = xys[i][1] 115 | 116 | # if Cp[2] < 15.0 and proj_x >= 0 and proj_x < W and proj_y >= 0 and proj_y < H and np.abs(proj_x-px) < 5.0 and np.abs(proj_y-py) < 5.0: 117 | # ds[validIdx] = Cp[2] 118 | # dmono[validIdx] = dmonodense[int(proj_y), int(proj_x)] 119 | 120 | # ## Doing sth here to compute surface normal 121 | # validIdx += 1 122 | 123 | # if validIdx < 10: 124 | # dmonodense_scaled = None 125 | # count_invalid_images += 1 126 | # else: 127 | # ds = ds[:validIdx] 128 | # dmono = dmono[:validIdx] 129 | # A = np.array([[np.sum(dmono**2), np.sum(dmono)], [np.sum(dmono), validIdx]]) 130 | # b = np.array([np.sum(dmono*ds), np.sum(ds)]) 131 | # k = np.linalg.solve(A, b) 132 | 133 | # dmonodense_scaled = k[0] * dmonodense + k[1] 134 | # np.save(monoscaled_depth_path, dmonodense_scaled) 135 | 136 | if dmonodense_scaled is not None: 137 | Cplm = batch['landmark3d'][0].cpu().numpy() 138 | pixlm = K @ Cplm 139 | px = pixlm[0] / pixlm[2] 140 | py = pixlm[1] / pixlm[2] 141 | infront_infrustum = (Cplm[2] > 0.3) * (Cplm[2] < 15.0) * (px >= 0) * (px < W) * (py >=0) * (py < H) 142 | 143 | vis = copy.deepcopy(augmented_visibility[:, data.image_indices[idx]]) 144 | count_colmap_vs_depth_incompatibility = 0 145 | count_infront_infrustum = 0 146 | for l in range(data.landmark.shape[1]): 147 | if infront_infrustum[l]: 148 | count_infront_infrustum += 1 149 | 150 | depth_from_scaled_mono = dmonodense_scaled[int(py[l]), int(px[l])] 151 | depth_from_lm_proj = Cplm[2, l] 152 | rel_depth = np.abs(depth_from_lm_proj - depth_from_scaled_mono) / depth_from_lm_proj 153 | 154 | if vis[l]==0: 155 | if rel_depth < 0.3: ## 30% depth compatible 156 | vis[l] = True 157 | 158 | augmented_visibility[:, data.image_indices[idx]] = vis 159 | 160 | np.savetxt(os.path.join(opt.dataset_folder, opt.scene_id, opt.visibility_config + '_depth.txt'), augmented_visibility, fmt='%d') 161 | 162 | 163 | ######################################################### 164 | ### Adding visibility refinement using surface normal ### 165 | ######################################################### 166 | root_folder=opt.dataset_folder 167 | scene_id=opt.scene_id 168 | 169 | data = pickle.load(open('%s/%s/train_test_val.pkl' % (root_folder, scene_id), 'rb')) 170 | imgs = data['train'] + data['val'] + data['test'] 171 | idx = data['train_idx'] + data['val_idx'] + data['test_idx'] 172 | 173 | landmark_config = opt.landmark_config 174 | visibility_config = opt.visibility_config 175 | visibility_depth_config = visibility_config + '_depth' 176 | 177 | np.random.seed(100) 178 | landmark_colors = np.random.rand(10000, 
3) 179 | 180 | landmark_file = open(root_folder + '/' + scene_id + '/%s.txt' % landmark_config, 'r') 181 | num_landmark = int(landmark_file.readline()) 182 | 183 | lm = [] 184 | for l in range(num_landmark): 185 | pl = landmark_file.readline().split() 186 | pl = np.array([float(pl[i]) for i in range(len(pl))]) 187 | lm.append(pl) 188 | lm = np.asarray(lm)[:, 1:].T 189 | 190 | visibility_file = root_folder + '/' + scene_id + '/%s.txt' % visibility_config 191 | visibility = np.loadtxt(visibility_file).astype(bool) 192 | 193 | visibility_file = root_folder + '/' + scene_id + '/%s.txt' % visibility_depth_config 194 | visibility_depth = np.loadtxt(visibility_file).astype(bool) 195 | new_visibility = copy.deepcopy(visibility_depth) 196 | 197 | lm_spheres = [] 198 | mesh_arrows = [] 199 | mesh_arrows_ref = [] 200 | H = 720 201 | W = 1280 202 | 203 | WW, HH = np.meshgrid(np.arange(W), np.arange(H)) 204 | WW = WW.reshape(1, H, W) 205 | HH = HH.reshape(1, H, W) 206 | wh1 = np.concatenate((WW, HH, np.ones_like(HH)), axis=0) 207 | lm_sn = np.zeros((num_landmark, 6)) 208 | lm_sn[:, :3] = lm.T 209 | 210 | for lm_idx in tqdm(range(visibility.shape[0])): 211 | ## Observe from colmap 212 | 213 | visibility_matrix_ids = [i for i in np.where(visibility[lm_idx, idx])[0]] 214 | 215 | images_observe_lm = [imgs[i] for i in visibility_matrix_ids] 216 | pose_paths = [os.path.join(root_folder, scene_id, 'images', ifile.replace('color.jpg', 'pose.txt')) for ifile in images_observe_lm] 217 | depth_paths = [os.path.join(root_folder, scene_id, 'depth', ifile.replace('.jpg', '.scaled_depth.npy')) for ifile in images_observe_lm] 218 | intrinsic_paths = [os.path.join(root_folder, scene_id, 'images', ifile.replace('color.jpg', 'intrinsics.txt')) for ifile in images_observe_lm] 219 | 220 | depths = np.zeros((len(pose_paths), H, W)) 221 | Ts = np.zeros((len(pose_paths), 4, 4)) 222 | Ks = np.zeros((len(pose_paths), 3, 3)) 223 | for i, pp in enumerate(pose_paths): 224 | T = np.loadtxt(pp) 225 | T = np.concatenate( (T, np.array([[0, 0, 0, 1]])), axis=0) 226 | Ts[i] = T 227 | 228 | intrinsics = open(intrinsic_paths[i]) 229 | intrinsics = intrinsics.readline().split() 230 | fx = float(intrinsics[2]) 231 | fy = float(intrinsics[2]) 232 | 233 | cx = float(intrinsics[3]) 234 | cy = float(intrinsics[4]) 235 | 236 | K = np.array([[fx, 0., cx], 237 | [0., fy, cy], 238 | [0., 0., 1.]]) 239 | Ks[i] = K 240 | 241 | 242 | ## First estimate for surface normal using just visibility vector 243 | bsum = np.zeros(3) 244 | for i in range(Ts.shape[0]): 245 | Gpt = lm[:, lm_idx] + Ts[i, :3, :3].T @ Ts[i, :3, 3] 246 | bsum -= (Gpt / np.linalg.norm(Gpt)) 247 | bsum /= np.linalg.norm(bsum) 248 | 249 | ## Refine the surface normal based on depth image 250 | bref = np.zeros(3) 251 | patch_size = 50 252 | for i in range(Ts.shape[0]): 253 | if os.path.exists(depth_paths[i]): 254 | cp = Ts[i, :3, :3] @ lm[:, lm_idx] + Ts[i, :3, 3] 255 | cp = Ks[i] @ cp 256 | cp = cp.reshape(-1) 257 | proj_x = int(cp[0] / cp[2]) 258 | proj_y = int(cp[1] / cp[2]) 259 | 260 | if proj_x >= patch_size and proj_x < W-patch_size and proj_y >= patch_size and proj_y < H-patch_size: 261 | patch_x0, patch_x1 = proj_x-patch_size, proj_x+patch_size 262 | patch_y0, patch_y1 = proj_y-patch_size, proj_y+patch_size 263 | 264 | d = np.load(depth_paths[i])[patch_y0:patch_y1, patch_x0:patch_x1].reshape((1, patch_size * 2, patch_size * 2)) 265 | pcd = np.linalg.inv(Ks[i]) @ (wh1[:, patch_y0:patch_y1, patch_x0:patch_x1] * d).reshape(3, 4 * patch_size ** 2) 266 | 267 | A = np.concatenate((pcd, 
np.ones((1, 4 * patch_size ** 2))), axis=0) 268 | D, U = np.linalg.eig(A @ A.T) 269 | 270 | sn = Ts[i, :3, :3].T @ U[:3, np.argsort(D)[0]] 271 | sn /= np.linalg.norm(sn) 272 | 273 | if np.sum(bsum * sn) > 0.0: 274 | bref += sn 275 | elif np.sum(bsum * sn) < 0.0: 276 | bref -= sn 277 | 278 | if np.linalg.norm(bref) == 0: 279 | lm_sn[lm_idx, 3:] = bsum 280 | else: 281 | bref /= np.linalg.norm(bref) 282 | lm_sn[lm_idx, 3:] = bref 283 | 284 | visibility_matrix_ids = [i for i in np.where(visibility_depth[lm_idx, idx])[0]] 285 | images_observe_lm = [imgs[i] for i in np.where(visibility_depth[lm_idx, idx])[0]] 286 | pose_paths = [os.path.join(root_folder, scene_id, 'images', ifile.replace('color.jpg', 'pose.txt')) for ifile in images_observe_lm] 287 | for i, pp in enumerate(pose_paths): 288 | T = np.loadtxt(pp) 289 | if visibility_depth[lm_idx, idx[visibility_matrix_ids[i]]]: 290 | Gpt = lm[:, lm_idx] + T[:3, :3].T @ T[:3, 3] 291 | Gpt /= np.linalg.norm(Gpt) 292 | if np.sum(bref * Gpt) > -0.2: ## violate visibility direction 293 | new_visibility[lm_idx, idx[visibility_matrix_ids[i]]] = 0 294 | 295 | np.savetxt(os.path.join(root_folder, scene_id, '%s_normal.txt' % (landmark_config)), lm_sn) 296 | np.savetxt(os.path.join(root_folder, scene_id, '%s_depth_normal.txt' % (visibility_config)), new_visibility, fmt='%d') -------------------------------------------------------------------------------- /src/utils/heatmap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def generate_heat_maps(landmarks, visibility_mask, heatmap_size, K, sigma=3): 6 | ''' 7 | :param landmarks: [3, L] 8 | :param visibility_mask: [L] 9 | :return: hms, hms_weight(1: visible, 0: invisible) 10 | ''' 11 | 12 | 13 | hms = np.zeros((landmarks.shape[1], 14 | heatmap_size[0], 15 | heatmap_size[1]), 16 | dtype=np.float32) 17 | 18 | hms_weights = np.ones((landmarks.shape[1]), dtype=np.float32) 19 | 20 | tmp_size = sigma * 3 21 | 22 | for lm_id in range(landmarks.shape[1]): 23 | landmark_2d = K @ landmarks[:, lm_id] 24 | landmark_2d /= landmark_2d[2] 25 | 26 | mu_x = int(landmark_2d[0] + 0.5) 27 | mu_y = int(landmark_2d[1] + 0.5) 28 | # Check that any part of the gaussian is in-bounds 29 | ul = [int(mu_y - tmp_size), int(mu_x - tmp_size)] 30 | br = [int(mu_y + tmp_size + 1), int(mu_x + tmp_size + 1)] 31 | if ul[0] >= heatmap_size[0] or ul[1] >= heatmap_size[1] \ 32 | or br[0] < 0 or br[1] < 0 or landmarks[2, lm_id] < 0: 33 | continue 34 | 35 | if visibility_mask[lm_id]: 36 | ## Generate gaussian 37 | size = 2 * tmp_size + 1 38 | x = np.arange(0, size, 1, np.float32) 39 | y = x[:, np.newaxis] 40 | x0 = y0 = size // 2 41 | # The gaussian is not normalized, we want the center value to equal 1 42 | g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2)) 43 | 44 | # Usable gaussian range 45 | g_y = max(0, -ul[0]), min(br[0], heatmap_size[0]) - ul[0] 46 | g_x = max(0, -ul[1]), min(br[1], heatmap_size[1]) - ul[1] 47 | 48 | # Image range 49 | img_y = max(0, ul[0]), min(br[0], heatmap_size[0]) 50 | img_x = max(0, ul[1]), min(br[1], heatmap_size[1]) 51 | 52 | hms[lm_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ 53 | g[g_y[0]:g_y[1], g_x[0]:g_x[1]] 54 | else: 55 | hms_weights[lm_id] = 0.0 56 | return hms, hms_weights 57 | 58 | 59 | def generate_heat_maps_gpu(landmarks_2d, visibility_mask, heatmap_size, sigma=3): 60 | ''' 61 | gpu version of heat map generation 62 | :param landmarks: [3, L] 63 | :return: hms 64 | ''' 65 | 66 | B, _, L = landmarks_2d.shape 
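# Shape notes: landmarks_2d is [B, 2, L] in heatmap pixel coordinates (the training code
# divides by the output downsample factor before calling), visibility_mask is [B, L], and
# the returned tensor is [B, L, H, W] with one Gaussian per visible landmark; maps whose
# peak exceeds 0.5 are renormalized so the peak equals 1, and responses below 0.1 are zeroed.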
67 | H, W = heatmap_size[0], heatmap_size[1] 68 | 69 | yy_grid, xx_grid = torch.meshgrid(torch.arange(0, heatmap_size[0]), 70 | torch.arange(0, heatmap_size[1])) 71 | xx_grid, yy_grid = xx_grid.to(device=landmarks_2d.device), yy_grid.to(device=landmarks_2d.device) 72 | hms = torch.exp(-((xx_grid.reshape(1, 1, H, W)-landmarks_2d[:, 0].reshape(B, L, 1, 1))**2 + 73 | (yy_grid.reshape(1, 1, H, W)-landmarks_2d[:, 1].reshape(B, L, 1, 1))**2)/(2*sigma**2)) 74 | hms_vis = hms * visibility_mask.reshape(B, L, 1, 1).float() 75 | hms_vis[hms_vis < 0.1] = 0.0 76 | normalizing_factor, _ = torch.max(hms_vis.reshape(B, L, -1), dim=2) 77 | hms_vis[normalizing_factor > 0.5] = hms_vis[normalizing_factor > 0.5] / \ 78 | normalizing_factor.reshape(B, L, 1, 1)[normalizing_factor > 0.5] 79 | 80 | return hms_vis -------------------------------------------------------------------------------- /src/utils/landmark_selection.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | import pickle 5 | from read_write_models import qvec2rotmat, read_model 6 | from tqdm import tqdm 7 | 8 | 9 | def ComputePerPointTimeSpan(image_ids, images): 10 | timespan = {} 11 | 12 | for imageID in image_ids: 13 | session_id = int(images[imageID].name.split('-')[0]) 14 | if session_id in timespan: 15 | timespan[session_id] += 1 16 | else: 17 | timespan[session_id] = 1 18 | 19 | return len(timespan) 20 | 21 | 22 | def ComputePerPointDepth(pointInGlobal, image_ids, images): 23 | d = np.zeros(len(image_ids)) 24 | for i, imageID in enumerate(image_ids): 25 | R = qvec2rotmat(images[imageID].qvec) 26 | t = images[imageID].tvec 27 | pointInCamerai = R @ pointInGlobal + t 28 | d[i] = pointInCamerai[2] 29 | 30 | pointDepthMean, pointDepthStd = np.mean(d), np.std(d) 31 | 32 | return pointDepthMean, pointDepthStd 33 | 34 | 35 | def ComputePerPointAngularSpan(pointInGlobal, image_ids, images): 36 | N = len(image_ids) 37 | H = np.zeros((3, 3)) 38 | for i, imageID in enumerate(image_ids): 39 | Ri = qvec2rotmat(images[imageID].qvec) 40 | ti = images[imageID].tvec 41 | bi = Ri.T @ (pointInGlobal - ti) 42 | bi = bi / np.linalg.norm(bi) 43 | H += (np.eye(3) - np.outer(bi, bi)) 44 | 45 | H /= N 46 | eigH = np.linalg.eigvals(0.5*(H + H.T)) 47 | 48 | return np.arccos(np.clip(1 - 2.0 * np.min(eigH)/np.max(eigH), 0, 1)) 49 | 50 | 51 | def SaveLandmarksAndVisibilityMask(selected_landmarks, points3D, images, indoor6_imagename_to_index, num_images, root_path, 52 | landmark_config, visibility_config, outformat): 53 | 54 | num_landmarks = len(selected_landmarks['id']) 55 | 56 | visibility_mask = np.zeros((num_landmarks, num_images), dtype=np.uint8) 57 | 58 | for i, pid in enumerate(selected_landmarks['id']): 59 | for imgid in points3D[pid].image_ids: 60 | if images[imgid].name in indoor6_imagename_to_index: 61 | visibility_mask[i, indoor6_imagename_to_index[images[imgid].name]] = 1 62 | 63 | np.savetxt(os.path.join(root_path, '%s%s.txt' % (visibility_config, outformat)), visibility_mask, fmt='%d') 64 | 65 | f = open(os.path.join(root_path, '%s%s.txt' % (landmark_config, outformat)), 'w') 66 | f.write('%d\n' % num_landmarks) 67 | for i in range(selected_landmarks['xyz'].shape[1]): 68 | f.write('%d %4.4f %4.4f %4.4f\n' % (i, 69 | selected_landmarks['xyz'][0, i], 70 | selected_landmarks['xyz'][1, i], 71 | selected_landmarks['xyz'][2, i])) 72 | f.close() 73 | 74 | 75 | 76 | if __name__ == '__main__': 77 | parser = argparse.ArgumentParser( 78 | description='Scene Landmark 
Detection', 79 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 80 | parser.add_argument( 81 | '--dataset_folder', type=str, required=True, 82 | help='Root directory, where all data is stored') 83 | parser.add_argument( 84 | '--scene_id', type=str, default='scene6', 85 | help='Scene id') 86 | parser.add_argument( 87 | '--num_landmarks', type=int, default=300, 88 | help='Number of selected landmarks.') 89 | parser.add_argument( 90 | '--output_format', type=str, default='v2', 91 | help='Landmark file output.') 92 | 93 | opt = parser.parse_args() 94 | opt.landmark_config = "landmarks/landmarks-%d" % (opt.num_landmarks) 95 | opt.visibility_config = "landmarks/visibility-%d" % (opt.num_landmarks) 96 | 97 | scene = opt.scene_id 98 | path = os.path.join(opt.dataset_folder, 'indoor6-colmap/%s-tr/sparse/0/' % scene) 99 | cameras, images, points3D = read_model(path, ext='.bin') 100 | 101 | ## Max number of sessions 102 | sessions = {} 103 | for i in images: 104 | print(images[i].name) 105 | session_id = int(images[i].name.split('-')[0]) 106 | sessions[session_id] = 1 107 | maxSession = len(sessions) 108 | 109 | ## Initialization 110 | numPoints3D = len(points3D) 111 | points3D_ids = np.zeros(numPoints3D) 112 | points3D_scores = np.zeros(numPoints3D) 113 | validIdx = 0 114 | 115 | ## Compute score for each landmark 116 | for i, k in enumerate(tqdm(points3D)): 117 | pointInGlobal = points3D[k].xyz 118 | image_ids = points3D[k].image_ids 119 | trackLength = len(image_ids) 120 | 121 | if trackLength > 25: 122 | depthMean, depthStd = ComputePerPointDepth(pointInGlobal, image_ids, images) 123 | timespan = ComputePerPointTimeSpan(image_ids, images) 124 | anglespan = ComputePerPointAngularSpan(pointInGlobal, image_ids, images) 125 | 126 | depthScore = min(1.0, depthStd / depthMean) 127 | trackLengthScore = 0.25 * np.log2(trackLength) 128 | timeSpanScore = timespan / maxSession 129 | 130 | if timespan >= 1 and depthMean < 10.0 and anglespan > 0.3: 131 | points3D_ids[validIdx] = k 132 | points3D_scores[validIdx] = depthScore + trackLengthScore + timeSpanScore + anglespan 133 | validIdx += 1 134 | 135 | 136 | ## Sort scores 137 | points3D_ids = points3D_ids[:validIdx] 138 | points3D_scores = points3D_scores[:validIdx] 139 | sorted_indices = np.argsort(points3D_scores) 140 | 141 | 142 | ## Greedy selection 143 | selected_landmarks = {'id': np.zeros(opt.num_landmarks), 144 | 'xyz': np.zeros((3, opt.num_landmarks)), 145 | 'score': np.zeros(opt.num_landmarks)} 146 | 147 | ## Selecting first point 148 | selected_landmarks['id'][0] = points3D_ids[sorted_indices[-1]] 149 | selected_landmarks['xyz'][:, 0] = points3D[selected_landmarks['id'][0]].xyz 150 | selected_landmarks['score'][0] = points3D_scores[sorted_indices[-1]] 151 | 152 | nselected = 1 153 | radius = 5.0 154 | 155 | while nselected < opt.num_landmarks: 156 | for i in reversed(sorted_indices): 157 | id = points3D_ids[i] 158 | xyz = points3D[id].xyz 159 | 160 | if np.sum(np.linalg.norm(xyz.reshape(3, 1) - selected_landmarks['xyz'][:, :nselected], axis=0) < radius): 161 | continue 162 | else: 163 | selected_landmarks['id'][nselected] = id 164 | selected_landmarks['xyz'][:, nselected] = xyz 165 | selected_landmarks['score'][nselected] = points3D_scores[i] 166 | nselected += 1 167 | 168 | if nselected == opt.num_landmarks: 169 | break 170 | radius *= 0.5 171 | 172 | ## Saving 173 | indoor6_images = pickle.load(open(os.path.join(opt.dataset_folder, '%s/train_test_val.pkl' % opt.scene_id), 'rb')) 174 | indoor6_imagename_to_index = {} 175 | 176 | for i, f 
in enumerate(indoor6_images['train']): 177 | image_name = open(os.path.join(opt.dataset_folder, 178 | opt.scene_id, 'images', 179 | f.replace('color.jpg', 180 | 'intrinsics.txt'))).readline().split(' ')[-1][:-1] 181 | indoor6_imagename_to_index[image_name] = indoor6_images['train_idx'][i] 182 | 183 | num_images = len(indoor6_images['train']) + len(indoor6_images['val']) + len(indoor6_images['test']) 184 | SaveLandmarksAndVisibilityMask(selected_landmarks, points3D, images, indoor6_imagename_to_index, num_images, 185 | os.path.join(opt.dataset_folder, opt.scene_id), 186 | opt.landmark_config, opt.visibility_config, opt.output_format) -------------------------------------------------------------------------------- /src/utils/merge_landmark_files.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import numpy as np 4 | import os 5 | 6 | import sys 7 | sys.path.append(os.path.join(sys.path[0], '..')) 8 | 9 | from utils.select_additional_landmarks import load_landmark_visibility_files 10 | 11 | def save_landmark_visibility_mask(landmarks, visibility_mask, 12 | landmark_path, visibility_path): 13 | 14 | num_landmarks = landmarks.shape[1] 15 | 16 | np.savetxt(visibility_path, visibility_mask, fmt='%d') 17 | 18 | f = open(landmark_path, 'w') 19 | f.write('%d\n' % num_landmarks) 20 | for i in range(num_landmarks): 21 | f.write('%d %4.4f %4.4f %4.4f\n' % (i, 22 | landmarks[0, i], 23 | landmarks[1, i], 24 | landmarks[2, i])) 25 | f.close() 26 | 27 | 28 | if __name__ == '__main__': 29 | parser = argparse.ArgumentParser( 30 | description='Scene Landmark Detection', 31 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 32 | parser.add_argument( 33 | '--dataset_folder', type=str, required=True, 34 | help='Root directory, where all data is stored') 35 | parser.add_argument( 36 | '--scene_id', type=str, default='scene6', 37 | help='Scene id') 38 | parser.add_argument( 39 | '--landmark_config', type=str, action='append', 40 | help='File containing scene-specific 3D landmarks.') 41 | parser.add_argument( 42 | '--visibility_config', type=str, action='append', 43 | help='File containing information about visibility of landmarks in cameras associated with training set.') 44 | parser.add_argument( 45 | '--output_format', type=str, required=True, 46 | help='Output file format.') 47 | 48 | opt = parser.parse_args() 49 | 50 | assert len(opt.landmark_config) > 1 51 | assert len(opt.landmark_config) == len(opt.visibility_config) 52 | 53 | num_landmarks = 0 54 | num_files = len(opt.landmark_config) 55 | ls = [] 56 | vs = [] 57 | for (lp, vp) in zip(opt.landmark_config, opt.visibility_config): 58 | landmark_path = os.path.join(opt.dataset_folder, opt.scene_id, lp + '.txt') 59 | vis_path = os.path.join(opt.dataset_folder, opt.scene_id, vp + '.txt') 60 | 61 | l, v = load_landmark_visibility_files(landmark_path=landmark_path, 62 | visibility_path=vis_path) 63 | 64 | num_landmarks += l.shape[1] 65 | 66 | ls.append(l) 67 | vs.append(v) 68 | 69 | ls = np.concatenate(ls, axis=1) 70 | vs = np.concatenate(vs, axis=0) 71 | 72 | output_landmark_path = os.path.join(opt.dataset_folder, opt.scene_id, 'landmarks/landmarks-%d%s.txt' % (num_landmarks, opt.output_format)) 73 | 74 | if 'depth_normal' in opt.visibility_config[0]: 75 | output_visibility_path = os.path.join(opt.dataset_folder, opt.scene_id, 'landmarks/visibility-%d%s_depth_normal.txt' % (num_landmarks, opt.output_format)) 76 | else: 77 | output_visibility_path = os.path.join(opt.dataset_folder, 
opt.scene_id, 'landmarks/visibility-%d%s.txt' % (num_landmarks, opt.output_format)) 78 | save_landmark_visibility_mask(ls, vs, output_landmark_path, output_visibility_path) -------------------------------------------------------------------------------- /src/utils/pnp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.optimize import least_squares 3 | 4 | 5 | def Rotation2Quaternion(R): 6 | """ 7 | Convert a rotation matrix to quaternion 8 | 9 | Parameters 10 | ---------- 11 | R : ndarray of shape (3, 3) 12 | Rotation matrix 13 | Returns 14 | ------- 15 | q : ndarray of shape (4,) 16 | The unit quaternion (w, x, y, z) 17 | """ 18 | q = np.empty([4, ]) 19 | 20 | tr = np.trace(R) 21 | if tr < 0: 22 | i = R.diagonal().argmax() 23 | j = (i + 1) % 3 24 | k = (j + 1) % 3 25 | 26 | q[i] = np.sqrt(1 - tr + 2 * R[i, i]) / 2 27 | q[j] = (R[j, i] + R[i, j]) / (4 * q[i]) 28 | q[k] = (R[k, i] + R[i, k]) / (4 * q[i]) 29 | q[3] = (R[k, j] - R[j, k]) / (4 * q[i]) 30 | else: 31 | q[3] = np.sqrt(1 + tr) / 2 32 | q[0] = (R[2, 1] - R[1, 2]) / (4 * q[3]) 33 | q[1] = (R[0, 2] - R[2, 0]) / (4 * q[3]) 34 | q[2] = (R[1, 0] - R[0, 1]) / (4 * q[3]) 35 | 36 | q /= np.linalg.norm(q) 37 | # Rearrange (x, y, z, w) to (w, x, y, z) 38 | q = q[[3, 0, 1, 2]] 39 | 40 | return q 41 | 42 | 43 | def Quaternion2Rotation(q): 44 | """ 45 | Convert a quaternion to rotation matrix 46 | 47 | Parameters 48 | ---------- 49 | q : ndarray of shape (4,) 50 | Unit quaternion (w, x, y, z) 51 | Returns 52 | ------- 53 | R : ndarray of shape (3, 3) 54 | The rotation matrix 55 | """ 56 | q /= np.linalg.norm(q) 57 | 58 | w = q[0] 59 | x = q[1] 60 | y = q[2] 61 | z = q[3] 62 | 63 | R = np.empty([3, 3]) 64 | R[0, 0] = 1 - 2 * y ** 2 - 2 * z ** 2 65 | R[0, 1] = 2 * (x * y - z * w) 66 | R[0, 2] = 2 * (x * z + y * w) 67 | 68 | R[1, 0] = 2 * (x * y + z * w) 69 | R[1, 1] = 1 - 2 * x ** 2 - 2 * z ** 2 70 | R[1, 2] = 2 * (y * z - x * w) 71 | 72 | R[2, 0] = 2 * (x * z - y * w) 73 | R[2, 1] = 2 * (y * z + x * w) 74 | R[2, 2] = 1 - 2 * x ** 2 - 2 * y ** 2 75 | 76 | return R 77 | 78 | def skewsymm(x): 79 | 80 | Sx = np.zeros((3, 3)) 81 | Sx[0, 1] = -x[2] 82 | Sx[0, 2] = x[1] 83 | Sx[1, 0] = x[2] 84 | Sx[2, 0] = -x[1] 85 | Sx[1, 2] = -x[0] 86 | Sx[2, 1] = x[0] 87 | 88 | return Sx 89 | 90 | 91 | def VectorizeInitialPose(C_T_G): 92 | 93 | R = C_T_G[:3, :3] 94 | t = C_T_G[:3, 3] 95 | q = Rotation2Quaternion(R) 96 | z = np.concatenate([t, q]) 97 | 98 | return z 99 | 100 | 101 | def MeasureReprojectionSinglePose(z, p, b, w): 102 | 103 | n_points = b.shape[1] 104 | 105 | q = z[3:7] 106 | q_norm = np.sqrt(np.sum(q ** 2)) 107 | q = q / q_norm 108 | R = Quaternion2Rotation(q) 109 | t = z[:3] 110 | 111 | b_hat = R @ p + t.reshape(3, 1) 112 | b_hat_normalized = b_hat / np.sqrt(np.sum(b_hat ** 2, axis=0)) 113 | err = np.repeat(w, 3).reshape(n_points, 3).T * (b_hat_normalized - b) 114 | 115 | return err.reshape(-1) 116 | 117 | 118 | def UpdatePose(z): 119 | 120 | p = z[0:7] 121 | q = p[3:] 122 | 123 | q = q / np.linalg.norm(q) 124 | R = Quaternion2Rotation(q) 125 | t = p[:3] 126 | P_new = np.hstack([R, t[:, np.newaxis]]) 127 | 128 | return P_new 129 | 130 | 131 | def P3PKe(m, X, inlier_thres=1e-5): 132 | """ 133 | Perspective-3-point algorithm from 134 | Ke, T., & Roumeliotis, S. I. (CVPR'17). An efficient algebraic solution to the perspective-three-point problem. 
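Note: returns (None, None) when the point/bearing configuration is degenerate or when
no candidate pose is consistent with all four bearing vectors within inlier_thres.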
135 | 136 | 137 | Parameters 138 | ---------- 139 | m : ndarray of shape (3, 4) 140 | unit bearing vectors to each landmarks w.r.t camera 141 | X : ndarray of shape (3, 4) 142 | 3D points position w.r.t global 143 | Returns 144 | ------- 145 | R : ndarray of shape (3, 3) 146 | t : ndarray of shape (3, 1) 147 | (R, t) represents transformation from global to camera frame of reference 148 | """ 149 | w1 = X[:, 0] 150 | w2 = X[:, 1] 151 | w3 = X[:, 2] 152 | 153 | u0 = w1 - w2 154 | nu0 = np.linalg.norm(u0) 155 | if nu0 < 1e-4: 156 | return None, None 157 | k1 = u0 / nu0 158 | 159 | b1 = m[:, 0] 160 | b2 = m[:, 1] 161 | b3 = m[:, 2] 162 | 163 | k3 = np.cross(b1, b2) 164 | nk3 = np.linalg.norm(k3) 165 | if nk3 < 1e-4: 166 | return None, None 167 | k3 = k3 / nk3 168 | 169 | tz = np.cross(b1, k3) 170 | v1 = np.cross(b1, b3) 171 | v2 = np.cross(b2, b3) 172 | 173 | u1 = w1 - w3 174 | u1k1 = np.sum(u1 * k1) 175 | k3b3 = np.sum(k3 * b3) 176 | if np.abs(k3b3) < 1e-4: 177 | return None, None 178 | 179 | 180 | f11 = k3.T @ b3 181 | f13 = k3.T @ v1 182 | f15 = -u1k1 * f11 183 | nl = np.cross(u1, k1) 184 | delta = np.linalg.norm(nl) 185 | if delta < 1e-4: 186 | return None, None 187 | nl = nl / delta 188 | f11 = delta * f11 189 | f13 = delta * f13 190 | 191 | u2k1 = u1k1 - nu0 192 | f21 = np.sum(tz * v2) 193 | f22 = nk3 * k3b3 194 | f23 = np.sum(k3 * v2) 195 | f24 = u2k1 * f22 196 | f25 = -u2k1 * f21 197 | f21 = delta * f21 198 | f22 = delta * f22 199 | f23 = delta * f23 200 | 201 | g1 = f13 * f22 202 | g2 = f13 * f25 - f15 * f23 203 | g3 = f11 * f23 - f13 * f21 204 | g4 = -f13 * f24 205 | g5 = f11 * f22 206 | g6 = f11 * f25 - f15 * f21 207 | g7 = -f15 * f24 208 | alpha = np.array([g5 * g5 + g1 * g1 + g3 * g3, 209 | 2 * (g5 * g6 + g1 * g2 + g3 * g4), 210 | g6 * g6 + 2 * g5 * g7 + g2 * g2 + g4 * g4 - g1 * g1 - g3 * g3, 211 | 2 * (g6 * g7 - g1 * g2 - g3 * g4), 212 | g7 * g7 - g2 * g2 - g4 * g4]) 213 | 214 | if any(np.isnan(alpha)): 215 | return None, None 216 | 217 | sols = np.roots(alpha) 218 | 219 | Ck1nl = np.vstack((k1, nl, np.cross(k1, nl))).T 220 | Cb1k3tzT = np.vstack((b1, k3, tz)) 221 | b3p = (delta / k3b3) * b3 222 | 223 | R = np.zeros((3, 3, 4)) 224 | t = np.zeros((3, 4)) 225 | for i in range(sols.shape[0]): 226 | if np.imag(sols[i]) != 0: 227 | continue 228 | 229 | ctheta1p = np.real(sols[i]) 230 | if abs(ctheta1p) > 1: 231 | continue 232 | stheta1p = np.sqrt(1 - ctheta1p * ctheta1p) 233 | if k3b3 < 0: 234 | stheta1p = -stheta1p 235 | 236 | ctheta3 = g1 * ctheta1p + g2 237 | stheta3 = g3 * ctheta1p + g4 238 | ntheta3 = stheta1p / ((g5 * ctheta1p + g6) * ctheta1p + g7) 239 | ctheta3 = ntheta3 * ctheta3 240 | stheta3 = ntheta3 * stheta3 241 | 242 | C13 = np.array([[ctheta3, 0, -stheta3], 243 | [stheta1p * stheta3, ctheta1p, stheta1p * ctheta3], 244 | [ctheta1p * stheta3, -stheta1p, ctheta1p * ctheta3]]) 245 | 246 | Ri = (Ck1nl @ C13 @ Cb1k3tzT).T 247 | pxstheta1p = stheta1p * b3p 248 | ti = pxstheta1p - Ri @ w3 249 | ti = ti.reshape(3, 1) 250 | 251 | m_hat = Ri @ X + ti 252 | m_hat = m_hat / np.linalg.norm(m_hat, axis=0) 253 | if np.sum(np.sum(m_hat * m, axis=0) > 1.0 - inlier_thres) == 4: 254 | return Ri, ti 255 | 256 | return None, None 257 | 258 | 259 | def P3PKe_Ransac(G_p_f, C_b_f_hm, w, thres=0.01): 260 | inlier_thres = thres 261 | C_T_G_best = None 262 | inlier_best = np.zeros(G_p_f.shape[1], dtype=bool) 263 | Nsample=4 264 | inlier_score_best=0 265 | 266 | for iter in range(125): #old value was 10 267 | ## Weighted sampling based on weight factor 268 | min_set = np.argpartition(np.exp(w) * 
np.random.rand(w.shape[0]), -Nsample)[-Nsample:] 269 | C_R_G_hat, C_t_G_hat = P3PKe(C_b_f_hm[:, min_set], G_p_f[:, min_set], inlier_thres=thres) 270 | 271 | if C_R_G_hat is None or C_t_G_hat is None: 272 | continue 273 | 274 | # Get inlier 275 | C_b_f_hat = C_R_G_hat @ G_p_f + C_t_G_hat 276 | C_b_f_hat = C_b_f_hat / np.linalg.norm(C_b_f_hat, axis=0) 277 | inlier_mask = np.sum(C_b_f_hat * C_b_f_hm, axis=0) > (1.0 - inlier_thres) 278 | inlier_score = np.sum(w[inlier_mask]) 279 | 280 | if inlier_score > inlier_score_best: 281 | inlier_best = inlier_mask 282 | C_T_G_best = np.eye(4) 283 | C_T_G_best[:3, :3] = C_R_G_hat 284 | C_T_G_best[:3, 3:] = C_t_G_hat 285 | inlier_score_best = inlier_score 286 | 287 | return C_T_G_best, inlier_best 288 | 289 | 290 | def RunPnPNL(C_T_G, G_p_f, C_b_f, w, cutoff=0.01): 291 | ''' 292 | Weighted PnP based using weight w and bearing angular loss. 293 | Return optimized P_new = optimized C_T_G. 294 | ''' 295 | 296 | z0 = VectorizeInitialPose(C_T_G) 297 | res = least_squares( 298 | lambda x: MeasureReprojectionSinglePose(x, G_p_f, C_b_f, w), 299 | z0, 300 | verbose=0, 301 | ftol=1e-4, 302 | max_nfev=50, 303 | xtol=1e-8, 304 | loss='huber', 305 | f_scale=cutoff 306 | ) 307 | z = res.x 308 | 309 | P_new = UpdatePose(z) 310 | 311 | return P_new -------------------------------------------------------------------------------- /src/utils/read_write_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, ETH Zurich and UNC Chapel Hill. 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # 14 | # * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of 15 | # its contributors may be used to endorse or promote products derived 16 | # from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE 22 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 | # POSSIBILITY OF SUCH DAMAGE. 29 | # 30 | # Author: Johannes L. 
Schoenberger (jsch-at-demuc-dot-de) 31 | 32 | import os 33 | import collections 34 | import numpy as np 35 | import struct 36 | import argparse 37 | import logging 38 | 39 | logger = logging.getLogger(__name__) 40 | 41 | 42 | CameraModel = collections.namedtuple( 43 | "CameraModel", ["model_id", "model_name", "num_params"]) 44 | Camera = collections.namedtuple( 45 | "Camera", ["id", "model", "width", "height", "params"]) 46 | BaseImage = collections.namedtuple( 47 | "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"]) 48 | Point3D = collections.namedtuple( 49 | "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"]) 50 | 51 | 52 | class Image(BaseImage): 53 | def qvec2rotmat(self): 54 | return qvec2rotmat(self.qvec) 55 | 56 | 57 | CAMERA_MODELS = { 58 | CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3), 59 | CameraModel(model_id=1, model_name="PINHOLE", num_params=4), 60 | CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4), 61 | CameraModel(model_id=3, model_name="RADIAL", num_params=5), 62 | CameraModel(model_id=4, model_name="OPENCV", num_params=8), 63 | CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8), 64 | CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12), 65 | CameraModel(model_id=7, model_name="FOV", num_params=5), 66 | CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4), 67 | CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5), 68 | CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12) 69 | } 70 | CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) 71 | for camera_model in CAMERA_MODELS]) 72 | CAMERA_MODEL_NAMES = dict([(camera_model.model_name, camera_model) 73 | for camera_model in CAMERA_MODELS]) 74 | 75 | 76 | def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"): 77 | """Read and unpack the next bytes from a binary file. 78 | :param fid: 79 | :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. 80 | :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. 81 | :param endian_character: Any of {@, =, <, >, !} 82 | :return: Tuple of read and unpacked values. 83 | """ 84 | data = fid.read(num_bytes) 85 | return struct.unpack(endian_character + format_char_sequence, data) 86 | 87 | 88 | def write_next_bytes(fid, data, format_char_sequence, endian_character="<"): 89 | """pack and write to a binary file. 90 | :param fid: 91 | :param data: data to send, if multiple elements are sent at the same time, 92 | they should be encapsuled either in a list or a tuple 93 | :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. 
94 | should be the same length as the data list or tuple 95 | :param endian_character: Any of {@, =, <, >, !} 96 | """ 97 | if isinstance(data, (list, tuple)): 98 | bytes = struct.pack(endian_character + format_char_sequence, *data) 99 | else: 100 | bytes = struct.pack(endian_character + format_char_sequence, data) 101 | fid.write(bytes) 102 | 103 | 104 | def read_cameras_text(path): 105 | """ 106 | see: src/base/reconstruction.cc 107 | void Reconstruction::WriteCamerasText(const std::string& path) 108 | void Reconstruction::ReadCamerasText(const std::string& path) 109 | """ 110 | cameras = {} 111 | with open(path, "r") as fid: 112 | while True: 113 | line = fid.readline() 114 | if not line: 115 | break 116 | line = line.strip() 117 | if len(line) > 0 and line[0] != "#": 118 | elems = line.split() 119 | camera_id = int(elems[0]) 120 | model = elems[1] 121 | width = int(elems[2]) 122 | height = int(elems[3]) 123 | params = np.array(tuple(map(float, elems[4:]))) 124 | cameras[camera_id] = Camera(id=camera_id, model=model, 125 | width=width, height=height, 126 | params=params) 127 | return cameras 128 | 129 | 130 | def read_cameras_binary(path_to_model_file): 131 | """ 132 | see: src/base/reconstruction.cc 133 | void Reconstruction::WriteCamerasBinary(const std::string& path) 134 | void Reconstruction::ReadCamerasBinary(const std::string& path) 135 | """ 136 | cameras = {} 137 | with open(path_to_model_file, "rb") as fid: 138 | num_cameras = read_next_bytes(fid, 8, "Q")[0] 139 | for _ in range(num_cameras): 140 | camera_properties = read_next_bytes( 141 | fid, num_bytes=24, format_char_sequence="iiQQ") 142 | camera_id = camera_properties[0] 143 | model_id = camera_properties[1] 144 | model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name 145 | width = camera_properties[2] 146 | height = camera_properties[3] 147 | num_params = CAMERA_MODEL_IDS[model_id].num_params 148 | params = read_next_bytes(fid, num_bytes=8*num_params, 149 | format_char_sequence="d"*num_params) 150 | cameras[camera_id] = Camera(id=camera_id, 151 | model=model_name, 152 | width=width, 153 | height=height, 154 | params=np.array(params)) 155 | assert len(cameras) == num_cameras 156 | return cameras 157 | 158 | 159 | def write_cameras_text(cameras, path): 160 | """ 161 | see: src/base/reconstruction.cc 162 | void Reconstruction::WriteCamerasText(const std::string& path) 163 | void Reconstruction::ReadCamerasText(const std::string& path) 164 | """ 165 | HEADER = "# Camera list with one line of data per camera:\n" + \ 166 | "# CAMERA_ID, MODEL, WIDTH, HEIGHT, PARAMS[]\n" + \ 167 | "# Number of cameras: {}\n".format(len(cameras)) 168 | with open(path, "w") as fid: 169 | fid.write(HEADER) 170 | for _, cam in cameras.items(): 171 | to_write = [cam.id, cam.model, cam.width, cam.height, *cam.params] 172 | line = " ".join([str(elem) for elem in to_write]) 173 | fid.write(line + "\n") 174 | 175 | 176 | def write_cameras_binary(cameras, path_to_model_file): 177 | """ 178 | see: src/base/reconstruction.cc 179 | void Reconstruction::WriteCamerasBinary(const std::string& path) 180 | void Reconstruction::ReadCamerasBinary(const std::string& path) 181 | """ 182 | with open(path_to_model_file, "wb") as fid: 183 | write_next_bytes(fid, len(cameras), "Q") 184 | for _, cam in cameras.items(): 185 | model_id = CAMERA_MODEL_NAMES[cam.model].model_id 186 | camera_properties = [cam.id, 187 | model_id, 188 | cam.width, 189 | cam.height] 190 | write_next_bytes(fid, camera_properties, "iiQQ") 191 | for p in cam.params: 192 | 
write_next_bytes(fid, float(p), "d") 193 | return cameras 194 | 195 | 196 | def read_images_text(path): 197 | """ 198 | see: src/base/reconstruction.cc 199 | void Reconstruction::ReadImagesText(const std::string& path) 200 | void Reconstruction::WriteImagesText(const std::string& path) 201 | """ 202 | images = {} 203 | with open(path, "r") as fid: 204 | while True: 205 | line = fid.readline() 206 | if not line: 207 | break 208 | line = line.strip() 209 | if len(line) > 0 and line[0] != "#": 210 | elems = line.split() 211 | image_id = int(elems[0]) 212 | qvec = np.array(tuple(map(float, elems[1:5]))) 213 | tvec = np.array(tuple(map(float, elems[5:8]))) 214 | camera_id = int(elems[8]) 215 | image_name = elems[9] 216 | elems = fid.readline().split() 217 | xys = np.column_stack([tuple(map(float, elems[0::3])), 218 | tuple(map(float, elems[1::3]))]) 219 | point3D_ids = np.array(tuple(map(int, elems[2::3]))) 220 | images[image_id] = Image( 221 | id=image_id, qvec=qvec, tvec=tvec, 222 | camera_id=camera_id, name=image_name, 223 | xys=xys, point3D_ids=point3D_ids) 224 | return images 225 | 226 | 227 | def read_images_binary(path_to_model_file): 228 | """ 229 | see: src/base/reconstruction.cc 230 | void Reconstruction::ReadImagesBinary(const std::string& path) 231 | void Reconstruction::WriteImagesBinary(const std::string& path) 232 | """ 233 | images = {} 234 | with open(path_to_model_file, "rb") as fid: 235 | num_reg_images = read_next_bytes(fid, 8, "Q")[0] 236 | for _ in range(num_reg_images): 237 | binary_image_properties = read_next_bytes( 238 | fid, num_bytes=64, format_char_sequence="idddddddi") 239 | image_id = binary_image_properties[0] 240 | qvec = np.array(binary_image_properties[1:5]) 241 | tvec = np.array(binary_image_properties[5:8]) 242 | camera_id = binary_image_properties[8] 243 | image_name = "" 244 | current_char = read_next_bytes(fid, 1, "c")[0] 245 | while current_char != b"\x00": # look for the ASCII 0 entry 246 | image_name += current_char.decode("utf-8") 247 | current_char = read_next_bytes(fid, 1, "c")[0] 248 | num_points2D = read_next_bytes(fid, num_bytes=8, 249 | format_char_sequence="Q")[0] 250 | x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D, 251 | format_char_sequence="ddq"*num_points2D) 252 | xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])), 253 | tuple(map(float, x_y_id_s[1::3]))]) 254 | point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3]))) 255 | images[image_id] = Image( 256 | id=image_id, qvec=qvec, tvec=tvec, 257 | camera_id=camera_id, name=image_name, 258 | xys=xys, point3D_ids=point3D_ids) 259 | return images 260 | 261 | 262 | def write_images_text(images, path): 263 | """ 264 | see: src/base/reconstruction.cc 265 | void Reconstruction::ReadImagesText(const std::string& path) 266 | void Reconstruction::WriteImagesText(const std::string& path) 267 | """ 268 | if len(images) == 0: 269 | mean_observations = 0 270 | else: 271 | mean_observations = sum((len(img.point3D_ids) for _, img in images.items()))/len(images) 272 | HEADER = "# Image list with two lines of data per image:\n" + \ 273 | "# IMAGE_ID, QW, QX, QY, QZ, TX, TY, TZ, CAMERA_ID, NAME\n" + \ 274 | "# POINTS2D[] as (X, Y, POINT3D_ID)\n" + \ 275 | "# Number of images: {}, mean observations per image: {}\n".format(len(images), mean_observations) 276 | 277 | with open(path, "w") as fid: 278 | fid.write(HEADER) 279 | for _, img in images.items(): 280 | image_header = [img.id, *img.qvec, *img.tvec, img.camera_id, img.name] 281 | first_line = " ".join(map(str, image_header)) 282 | 
283 |
284 |             points_strings = []
285 |             for xy, point3D_id in zip(img.xys, img.point3D_ids):
286 |                 points_strings.append(" ".join(map(str, [*xy, point3D_id])))
287 |             fid.write(" ".join(points_strings) + "\n")
288 |
289 |
290 | def write_images_binary(images, path_to_model_file):
291 |     """
292 |     see: src/base/reconstruction.cc
293 |         void Reconstruction::ReadImagesBinary(const std::string& path)
294 |         void Reconstruction::WriteImagesBinary(const std::string& path)
295 |     """
296 |     with open(path_to_model_file, "wb") as fid:
297 |         write_next_bytes(fid, len(images), "Q")
298 |         for _, img in images.items():
299 |             write_next_bytes(fid, img.id, "i")
300 |             write_next_bytes(fid, img.qvec.tolist(), "dddd")
301 |             write_next_bytes(fid, img.tvec.tolist(), "ddd")
302 |             write_next_bytes(fid, img.camera_id, "i")
303 |             for char in img.name:
304 |                 write_next_bytes(fid, char.encode("utf-8"), "c")
305 |             write_next_bytes(fid, b"\x00", "c")
306 |             write_next_bytes(fid, len(img.point3D_ids), "Q")
307 |             for xy, p3d_id in zip(img.xys, img.point3D_ids):
308 |                 write_next_bytes(fid, [*xy, p3d_id], "ddq")
309 |
310 |
311 | def read_points3D_text(path):
312 |     """
313 |     see: src/base/reconstruction.cc
314 |         void Reconstruction::ReadPoints3DText(const std::string& path)
315 |         void Reconstruction::WritePoints3DText(const std::string& path)
316 |     """
317 |     points3D = {}
318 |     with open(path, "r") as fid:
319 |         while True:
320 |             line = fid.readline()
321 |             if not line:
322 |                 break
323 |             line = line.strip()
324 |             if len(line) > 0 and line[0] != "#":
325 |                 elems = line.split()
326 |                 point3D_id = int(elems[0])
327 |                 xyz = np.array(tuple(map(float, elems[1:4])))
328 |                 rgb = np.array(tuple(map(int, elems[4:7])))
329 |                 error = float(elems[7])
330 |                 image_ids = np.array(tuple(map(int, elems[8::2])))
331 |                 point2D_idxs = np.array(tuple(map(int, elems[9::2])))
332 |                 points3D[point3D_id] = Point3D(id=point3D_id, xyz=xyz, rgb=rgb,
333 |                                                error=error, image_ids=image_ids,
334 |                                                point2D_idxs=point2D_idxs)
335 |     return points3D
336 |
337 |
338 | def read_points3D_binary(path_to_model_file):
339 |     """
340 |     see: src/base/reconstruction.cc
341 |         void Reconstruction::ReadPoints3DBinary(const std::string& path)
342 |         void Reconstruction::WritePoints3DBinary(const std::string& path)
343 |     """
344 |     points3D = {}
345 |     with open(path_to_model_file, "rb") as fid:
346 |         num_points = read_next_bytes(fid, 8, "Q")[0]
347 |         for _ in range(num_points):
348 |             binary_point_line_properties = read_next_bytes(
349 |                 fid, num_bytes=43, format_char_sequence="QdddBBBd")  # point3D_id, xyz, rgb, error
350 |             point3D_id = binary_point_line_properties[0]
351 |             xyz = np.array(binary_point_line_properties[1:4])
352 |             rgb = np.array(binary_point_line_properties[4:7])
353 |             error = np.array(binary_point_line_properties[7])
354 |             track_length = read_next_bytes(
355 |                 fid, num_bytes=8, format_char_sequence="Q")[0]
356 |             track_elems = read_next_bytes(
357 |                 fid, num_bytes=8*track_length,
358 |                 format_char_sequence="ii"*track_length)
359 |             image_ids = np.array(tuple(map(int, track_elems[0::2])))
360 |             point2D_idxs = np.array(tuple(map(int, track_elems[1::2])))
361 |             points3D[point3D_id] = Point3D(
362 |                 id=point3D_id, xyz=xyz, rgb=rgb,
363 |                 error=error, image_ids=image_ids,
364 |                 point2D_idxs=point2D_idxs)
365 |     return points3D
366 |
367 |
368 | def write_points3D_text(points3D, path):
369 |     """
370 |     see: src/base/reconstruction.cc
371 |         void Reconstruction::ReadPoints3DText(const std::string& path)
372 |         void Reconstruction::WritePoints3DText(const std::string& path)
373 |     """
""" 374 | if len(points3D) == 0: 375 | mean_track_length = 0 376 | else: 377 | mean_track_length = sum((len(pt.image_ids) for _, pt in points3D.items()))/len(points3D) 378 | HEADER = "# 3D point list with one line of data per point:\n" + \ 379 | "# POINT3D_ID, X, Y, Z, R, G, B, ERROR, TRACK[] as (IMAGE_ID, POINT2D_IDX)\n" + \ 380 | "# Number of points: {}, mean track length: {}\n".format(len(points3D), mean_track_length) 381 | 382 | with open(path, "w") as fid: 383 | fid.write(HEADER) 384 | for _, pt in points3D.items(): 385 | point_header = [pt.id, *pt.xyz, *pt.rgb, pt.error] 386 | fid.write(" ".join(map(str, point_header)) + " ") 387 | track_strings = [] 388 | for image_id, point2D in zip(pt.image_ids, pt.point2D_idxs): 389 | track_strings.append(" ".join(map(str, [image_id, point2D]))) 390 | fid.write(" ".join(track_strings) + "\n") 391 | 392 | 393 | def write_points3D_binary(points3D, path_to_model_file): 394 | """ 395 | see: src/base/reconstruction.cc 396 | void Reconstruction::ReadPoints3DBinary(const std::string& path) 397 | void Reconstruction::WritePoints3DBinary(const std::string& path) 398 | """ 399 | with open(path_to_model_file, "wb") as fid: 400 | write_next_bytes(fid, len(points3D), "Q") 401 | for _, pt in points3D.items(): 402 | write_next_bytes(fid, pt.id, "Q") 403 | write_next_bytes(fid, pt.xyz.tolist(), "ddd") 404 | write_next_bytes(fid, pt.rgb.tolist(), "BBB") 405 | write_next_bytes(fid, pt.error, "d") 406 | track_length = pt.image_ids.shape[0] 407 | write_next_bytes(fid, track_length, "Q") 408 | for image_id, point2D_id in zip(pt.image_ids, pt.point2D_idxs): 409 | write_next_bytes(fid, [image_id, point2D_id], "ii") 410 | 411 | 412 | def detect_model_format(path, ext): 413 | if os.path.isfile(os.path.join(path, "cameras" + ext)) and \ 414 | os.path.isfile(os.path.join(path, "images" + ext)) and \ 415 | os.path.isfile(os.path.join(path, "points3D" + ext)): 416 | return True 417 | 418 | return False 419 | 420 | 421 | def read_model(path, ext=""): 422 | # try to detect the extension automatically 423 | if ext == "": 424 | if detect_model_format(path, ".bin"): 425 | ext = ".bin" 426 | elif detect_model_format(path, ".txt"): 427 | ext = ".txt" 428 | else: 429 | try: 430 | cameras, images, points3D = read_model(os.path.join(path, "model/")) 431 | logger.warning( 432 | "This SfM file structure was deprecated in hloc v1.1") 433 | return cameras, images, points3D 434 | except FileNotFoundError: 435 | raise FileNotFoundError( 436 | f"Could not find binary or text COLMAP model at {path}") 437 | 438 | if ext == ".txt": 439 | cameras = read_cameras_text(os.path.join(path, "cameras" + ext)) 440 | images = read_images_text(os.path.join(path, "images" + ext)) 441 | points3D = read_points3D_text(os.path.join(path, "points3D") + ext) 442 | else: 443 | cameras = read_cameras_binary(os.path.join(path, "cameras" + ext)) 444 | images = read_images_binary(os.path.join(path, "images" + ext)) 445 | points3D = read_points3D_binary(os.path.join(path, "points3D") + ext) 446 | return cameras, images, points3D 447 | 448 | 449 | def write_model(cameras, images, points3D, path, ext=".bin"): 450 | if ext == ".txt": 451 | write_cameras_text(cameras, os.path.join(path, "cameras" + ext)) 452 | write_images_text(images, os.path.join(path, "images" + ext)) 453 | write_points3D_text(points3D, os.path.join(path, "points3D") + ext) 454 | else: 455 | write_cameras_binary(cameras, os.path.join(path, "cameras" + ext)) 456 | write_images_binary(images, os.path.join(path, "images" + ext)) 457 | 
458 |     return cameras, images, points3D
459 |
460 |
461 | def qvec2rotmat(qvec):
462 |     return np.array([
463 |         [1 - 2 * qvec[2]**2 - 2 * qvec[3]**2,
464 |          2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
465 |          2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]],
466 |         [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
467 |          1 - 2 * qvec[1]**2 - 2 * qvec[3]**2,
468 |          2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]],
469 |         [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
470 |          2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
471 |          1 - 2 * qvec[1]**2 - 2 * qvec[2]**2]])
472 |
473 |
474 | def rotmat2qvec(R):
475 |     Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat
476 |     K = np.array([
477 |         [Rxx - Ryy - Rzz, 0, 0, 0],
478 |         [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0],
479 |         [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0],
480 |         [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0
481 |     eigvals, eigvecs = np.linalg.eigh(K)
482 |     qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)]
483 |     if qvec[0] < 0:
484 |         qvec *= -1
485 |     return qvec
486 |
487 |
488 | def main():
489 |     parser = argparse.ArgumentParser(description="Read and write COLMAP binary and text models")
490 |     parser.add_argument("--input_model", help="path to input model folder")
491 |     parser.add_argument("--input_format", choices=[".bin", ".txt"],
492 |                         help="input model format", default="")
493 |     parser.add_argument("--output_model",
494 |                         help="path to output model folder")
495 |     parser.add_argument("--output_format", choices=[".bin", ".txt"],
496 |                         help="output model format", default=".txt")
497 |     args = parser.parse_args()
498 |
499 |     cameras, images, points3D = read_model(path=args.input_model, ext=args.input_format)
500 |
501 |     print("num_cameras:", len(cameras))
502 |     print("num_images:", len(images))
503 |     print("num_points3D:", len(points3D))
504 |
505 |     if args.output_model is not None:
506 |         write_model(cameras, images, points3D, path=args.output_model, ext=args.output_format)
507 |
508 |
509 | if __name__ == "__main__":
510 |     main()
--------------------------------------------------------------------------------