├── LICENSE
├── README.md
├── hubconf.py
└── model
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-39.pyc
    │   └── network.cpython-39.pyc
    └── network.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Gabriele Berton, Carlo Masone, Barbara Caputo

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Indoor-VPR
This repo contains VPR models that have been fine-tuned for indoor usage.
The currently available models come from the following repositories:
- [EigenPlaces: Training Viewpoint Robust Models for Visual Place Recognition](https://github.com/gmberton/EigenPlaces);
- [Optimal Transport Aggregation for Visual Place Recognition (SALAD)](https://github.com/serizba/salad).

The fine-tuning procedure is inspired by the [Deep Visual Geo-localization Benchmark](https://github.com/gmberton/deep-visual-geo-localization-benchmark).

The indoor datasets used for fine-tuning are:
- [Gangnam Station B1-B2 and Hyundai Department Store B1-1F-4F](https://openaccess.thecvf.com/content/CVPR2021/papers/Lee_Large-Scale_Localization_Datasets_in_Crowded_Indoor_Spaces_CVPR_2021_paper.pdf);
- [Baidu Mall](https://openaccess.thecvf.com/content_cvpr_2017/html/Sun_A_Dataset_for_CVPR_2017_paper.html).
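
## Usage
Models can be loaded through `torch.hub`. The snippet below is only a minimal sketch: it assumes network access (both the base model and the fine-tuned weights are downloaded at load time) and uses one of the method/backbone/dimension combinations listed in `hubconf.py`; see `AVAILABLE_VARIATIONS` there for the meaning of `variation`.

```python
import torch

# EigenPlaces ResNet50 fine-tuned for indoor VPR; variation 0 picks the first
# of the two available fine-tuning variants.
model = torch.hub.load(
    "Enrico-Chiavassa/Indoor-VPR",
    "get_trained_model",
    method="eigenplaces",
    backbone="ResNet50",
    fc_output_dim=2048,
    variation=0,
)
model.eval()

# Global descriptors for a batch of RGB images (random data here, just to show shapes).
with torch.no_grad():
    descriptors = model(torch.rand(1, 3, 512, 512))
print(descriptors.shape)  # expected: torch.Size([1, 2048])
```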

--------------------------------------------------------------------------------
/hubconf.py:
--------------------------------------------------------------------------------
dependencies = ['torch', 'torchvision']

import torch
from model.network import ModelSelecter

AVAILABLE_TRAINED_MODELS = {
    # backbone : list of available fc_output_dim, which is equivalent to the descriptors dimensionality
    "eigenplaces": {
        "ResNet50": [2048]
    },
    "salad": {
        "Dinov2": [8448]
    },
}

# For each combination, 2 different models are available. They differ in the dataset used for the fine-tuning,
# the thresholds used to select soft and hard positives, and the way the images are handled during the training procedure.

AVAILABLE_VARIATIONS = {
    "eigenplaces_ResNet50_2048": ["GB1_BAI_5_10", "GB1_BAI_10_25_S"],
    "salad_Dinov2_8448": ["GB1_10_25", "HB1_GB1_2_5"],
}


def get_trained_model(method: str = "eigenplaces", backbone: str = "ResNet50", fc_output_dim: int = 2048, variation: int = 0) -> torch.nn.Module:
    """Return a model fine-tuned on indoor datasets.

    Args:
        method (str): the method used to originally train the model ("eigenplaces" or "salad").
        backbone (str): the backbone to use ("ResNet50" for eigenplaces, "Dinov2" for salad).
        fc_output_dim (int): the output dimension of the last fc layer, equivalent to the
            descriptors dimensionality. Must be one of the dimensions available for the chosen
            method and backbone (see AVAILABLE_TRAINED_MODELS).
        variation (int): which of the two fine-tuning variations to load, 0 or 1
            (see AVAILABLE_VARIATIONS).

    Return:
        model (torch.nn.Module): a trained model.
    """
    print(f"Returning {method} model with backbone {backbone} and descriptors dimension {fc_output_dim}")
    if method not in AVAILABLE_TRAINED_MODELS:
        raise ValueError(f"Parameter `method` is set to {method} but it must be one of {list(AVAILABLE_TRAINED_MODELS.keys())}")

    if backbone not in AVAILABLE_TRAINED_MODELS[method]:
        raise ValueError(f"Parameter `backbone` is set to {backbone} but it must be one of {list(AVAILABLE_TRAINED_MODELS[method].keys())}")

    try:
        fc_output_dim = int(fc_output_dim)
    except (ValueError, TypeError):
        raise ValueError(f"Parameter `fc_output_dim` must be an integer, but it is set to {fc_output_dim}")

    if fc_output_dim not in AVAILABLE_TRAINED_MODELS[method][backbone]:
        raise ValueError(f"Parameter `fc_output_dim` is set to {fc_output_dim}, but for backbone {backbone} "
                         f"it must be one of {AVAILABLE_TRAINED_MODELS[method][backbone]}")

    if variation not in [0, 1]:
        raise ValueError(f"Parameter `variation` is set to {variation}, but it must be 0 or 1")

    file_name = f"{method}_{backbone}_{fc_output_dim}"
    var_name = AVAILABLE_VARIATIONS[file_name][variation]
    file_name += f"_{var_name}"

    fetched_model = ModelSelecter(backbone, fc_output_dim).return_model()
    fetched_model.load_state_dict(
        torch.hub.load_state_dict_from_url(
            f'https://github.com/Enrico-Chiavassa/Indoor-VPR/releases/download/v0.1.0/{file_name}.pth',
            map_location=torch.device('cpu'))
    )
    return fetched_model
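
# Illustrative sketch, not one of the published hub entrypoints: requesting the
# second fine-tuning variation of the SALAD model. Running this needs network
# access, since both the base SALAD model and the fine-tuned weights are
# downloaded on the fly.
if __name__ == "__main__":
    demo_model = get_trained_model(method="salad", backbone="Dinov2",
                                   fc_output_dim=8448, variation=1)
    demo_model.eval()
    num_params = sum(p.numel() for p in demo_model.parameters())
    print(f"Loaded SALAD Dinov2 variation 'HB1_GB1_2_5' with {num_params} parameters")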

--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/model/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Enrico-Chiavassa/Indoor-VPR/c939be706e3e5db34d618b1defb5055b8136ac9f/model/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/model/__pycache__/network.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Enrico-Chiavassa/Indoor-VPR/c939be706e3e5db34d618b1defb5055b8136ac9f/model/__pycache__/network.cpython-39.pyc
--------------------------------------------------------------------------------
/model/network.py:
--------------------------------------------------------------------------------
import torch
import torchvision
import torch.nn.functional as F
from torch.nn.parameter import Parameter


class ModelSelecter():

    def __init__(self, backbone: str, fc_output_dim: int):
        self.model = get_model(backbone=backbone, fc_output_dim=fc_output_dim)

    def return_model(self):
        return self.model


def get_model(backbone: str, fc_output_dim: int = 2048):
    if backbone == "ResNet50":
        return GeoLocalizationNet(backbone=backbone, fc_output_dim=fc_output_dim)
    elif backbone == "Dinov2":
        return GeoLocalizationViT()
    raise ValueError(f"Unknown backbone {backbone}, it must be 'ResNet50' or 'Dinov2'")


class GeoLocalizationNet(torch.nn.Module):
    """The used networks are composed of a backbone and an aggregation layer."""

    def __init__(self, backbone: str, fc_output_dim: int):
        super().__init__()
        self.model = torch.hub.load("gmberton/eigenplaces", "get_trained_model",
                                    backbone=backbone, fc_output_dim=fc_output_dim)

    def forward(self, x):
        x = self.model(x)
        return x


class GeoLocalizationViT(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = torch.hub.load("serizba/salad", "dinov2_salad", backbone="dinov2_vitb14")

    def forward(self, images):
        b, c, h, w = images.shape
        # DINO wants height and width as multiples of 14, therefore resize them
        # to the nearest multiple of 14
        h = round(h / 14) * 14
        w = round(w / 14) * 14
        images = torchvision.transforms.functional.resize(images, [h, w], antialias=True)
        return self.model(images)

--------------------------------------------------------------------------------
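
The classes in `network.py` only wrap the upstream EigenPlaces and SALAD hub models, so the object returned by `ModelSelecter(...).return_model()` is a plain `torch.nn.Module` that maps a batch of images to one global descriptor per image, of size `fc_output_dim`. The sketch below shows how such descriptors are typically compared for place recognition; it is a self-contained illustration in which random tensors stand in for descriptors of real query and database images, and the 2048-dimensional size matches the EigenPlaces ResNet50 model.

```python
import torch
import torch.nn.functional as F

# Hypothetical descriptors: 1 query vs. a database of 5 places. With a real model
# they would come from model(images) instead of torch.rand.
query_desc = F.normalize(torch.rand(1, 2048), dim=-1)
db_descs = F.normalize(torch.rand(5, 2048), dim=-1)

# Cosine similarity between the query and every database descriptor;
# the most similar database image is the predicted place.
similarities = query_desc @ db_descs.T          # shape: (1, 5)
best_match = similarities.argmax(dim=-1).item()
print(f"Predicted place index: {best_match}")
```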