├── LICENSE
├── README.md
├── hubconf.py
└── model
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-39.pyc
    │   └── network.cpython-39.pyc
    └── network.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Gabriele Berton, Carlo Masone, Barbara Caputo

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Indoor-VPR
This repo contains VPR models that have been fine-tuned for indoor usage.
The currently available models come from the following repositories:
- [EigenPlaces: Training Viewpoint Robust Models for Visual Place Recognition](https://github.com/gmberton/EigenPlaces);
- [Optimal Transport Aggregation for Visual Place Recognition (SALAD)](https://github.com/serizba/salad).

The fine-tuning procedure is inspired by the [Deep Visual Geo-localization Benchmark](https://github.com/gmberton/deep-visual-geo-localization-benchmark).

The indoor datasets used for fine-tuning are:
- [Gangnam Station B1-B2 and Hyundai Department Store B1-1F-4F](https://openaccess.thecvf.com/content/CVPR2021/papers/Lee_Large-Scale_Localization_Datasets_in_Crowded_Indoor_Spaces_CVPR_2021_paper.pdf);
- [Baidu Mall](https://openaccess.thecvf.com/content_cvpr_2017/html/Sun_A_Dataset_for_CVPR_2017_paper.html).
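
## Usage
Models can be loaded through `torch.hub`. The snippet below is only a minimal sketch: it assumes network access (both the base model and the fine-tuned weights are downloaded at load time) and uses one of the method/backbone/dimension combinations listed in `hubconf.py`; see `AVAILABLE_VARIATIONS` there for the meaning of `variation`.

```python
import torch

# EigenPlaces ResNet50 fine-tuned for indoor VPR; variation 0 picks the first
# of the two available fine-tuning variants.
model = torch.hub.load(
    "Enrico-Chiavassa/Indoor-VPR",
    "get_trained_model",
    method="eigenplaces",
    backbone="ResNet50",
    fc_output_dim=2048,
    variation=0,
)
model.eval()

# Global descriptors for a batch of RGB images (random data here, just to show shapes).
with torch.no_grad():
    descriptors = model(torch.rand(1, 3, 512, 512))
print(descriptors.shape)  # expected: torch.Size([1, 2048])
```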

--------------------------------------------------------------------------------
/hubconf.py:
--------------------------------------------------------------------------------
dependencies = ['torch', 'torchvision']

import torch
from model.network import ModelSelecter

AVAILABLE_TRAINED_MODELS = {
    # backbone : list of available fc_output_dim, which is equivalent to the descriptors dimensionality
    "eigenplaces": {
        "ResNet50": [2048]
    },
    "salad": {
        "Dinov2": [8448]
    },
}

# For each combination, 2 different models are available. They differ in the dataset used for the fine-tuning,
# the thresholds used to select soft and hard positives, and the way the images are handled during the training procedure.

AVAILABLE_VARIATIONS = {
    "eigenplaces_ResNet50_2048": ["GB1_BAI_5_10", "GB1_BAI_10_25_S"],
    "salad_Dinov2_8448": ["GB1_10_25", "HB1_GB1_2_5"],
}


def get_trained_model(method: str = "eigenplaces", backbone: str = "ResNet50", fc_output_dim: int = 2048, variation: int = 0) -> torch.nn.Module:
    """Return a model fine-tuned on indoor datasets.

    Args:
        method (str): the method used to originally train the model ("eigenplaces" or "salad").
        backbone (str): the backbone to use ("ResNet50" for eigenplaces, "Dinov2" for salad).
        fc_output_dim (int): the output dimension of the last fc layer, equivalent to the
            descriptors dimensionality. Must be one of the dimensions available for the chosen
            method and backbone (see AVAILABLE_TRAINED_MODELS).
        variation (int): which of the two fine-tuning variations to load, 0 or 1
            (see AVAILABLE_VARIATIONS).

    Return:
        model (torch.nn.Module): a trained model.
    """
    print(f"Returning {method} model with backbone {backbone} and descriptors dimension {fc_output_dim}")
    if method not in AVAILABLE_TRAINED_MODELS:
        raise ValueError(f"Parameter `method` is set to {method} but it must be one of {list(AVAILABLE_TRAINED_MODELS.keys())}")

    if backbone not in AVAILABLE_TRAINED_MODELS[method]:
        raise ValueError(f"Parameter `backbone` is set to {backbone} but it must be one of {list(AVAILABLE_TRAINED_MODELS[method].keys())}")

    try:
        fc_output_dim = int(fc_output_dim)
    except (ValueError, TypeError):
        raise ValueError(f"Parameter `fc_output_dim` must be an integer, but it is set to {fc_output_dim}")

    if fc_output_dim not in AVAILABLE_TRAINED_MODELS[method][backbone]:
        raise ValueError(f"Parameter `fc_output_dim` is set to {fc_output_dim}, but for backbone {backbone} "
                         f"it must be one of {AVAILABLE_TRAINED_MODELS[method][backbone]}")

    if variation not in [0, 1]:
        raise ValueError(f"Parameter `variation` is set to {variation}, but it must be 0 or 1")

    file_name = f"{method}_{backbone}_{fc_output_dim}"
    var_name = AVAILABLE_VARIATIONS[file_name][variation]
    file_name += f"_{var_name}"

    fetched_model = ModelSelecter(backbone, fc_output_dim).return_model()
    fetched_model.load_state_dict(
        torch.hub.load_state_dict_from_url(
            f'https://github.com/Enrico-Chiavassa/Indoor-VPR/releases/download/v0.1.0/{file_name}.pth',
            map_location=torch.device('cpu'))
    )
    return fetched_model
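
# Illustrative sketch, not one of the published hub entrypoints: requesting the
# second fine-tuning variation of the SALAD model. Running this needs network
# access, since both the base SALAD model and the fine-tuned weights are
# downloaded on the fly.
if __name__ == "__main__":
    demo_model = get_trained_model(method="salad", backbone="Dinov2",
                                   fc_output_dim=8448, variation=1)
    demo_model.eval()
    num_params = sum(p.numel() for p in demo_model.parameters())
    print(f"Loaded SALAD Dinov2 variation 'HB1_GB1_2_5' with {num_params} parameters")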

--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/model/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Enrico-Chiavassa/Indoor-VPR/c939be706e3e5db34d618b1defb5055b8136ac9f/model/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/model/__pycache__/network.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Enrico-Chiavassa/Indoor-VPR/c939be706e3e5db34d618b1defb5055b8136ac9f/model/__pycache__/network.cpython-39.pyc
--------------------------------------------------------------------------------
/model/network.py:
--------------------------------------------------------------------------------
import torch
import torchvision
import torch.nn.functional as F
from torch.nn.parameter import Parameter


class ModelSelecter():

    def __init__(self, backbone: str, fc_output_dim: int):
        self.model = get_model(backbone=backbone, fc_output_dim=fc_output_dim)

    def return_model(self):
        return self.model


def get_model(backbone: str, fc_output_dim: int = 2048):
    if backbone == "ResNet50":
        return GeoLocalizationNet(backbone=backbone, fc_output_dim=fc_output_dim)
    elif backbone == "Dinov2":
        return GeoLocalizationViT()
    raise ValueError(f"Unknown backbone {backbone}, it must be 'ResNet50' or 'Dinov2'")


class GeoLocalizationNet(torch.nn.Module):
    """The used networks are composed of a backbone and an aggregation layer."""

    def __init__(self, backbone: str, fc_output_dim: int):
        super().__init__()
        self.model = torch.hub.load("gmberton/eigenplaces", "get_trained_model",
                                    backbone=backbone, fc_output_dim=fc_output_dim)

    def forward(self, x):
        x = self.model(x)
        return x


class GeoLocalizationViT(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = torch.hub.load("serizba/salad", "dinov2_salad", backbone="dinov2_vitb14")

    def forward(self, images):
        b, c, h, w = images.shape
        # DINO wants height and width as multiples of 14, therefore resize them
        # to the nearest multiple of 14
        h = round(h / 14) * 14
        w = round(w / 14) * 14
        images = torchvision.transforms.functional.resize(images, [h, w], antialias=True)
        return self.model(images)

--------------------------------------------------------------------------------
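
The classes in `network.py` only wrap the upstream EigenPlaces and SALAD hub models, so the object returned by `ModelSelecter(...).return_model()` is a plain `torch.nn.Module` that maps a batch of images to one global descriptor per image, of size `fc_output_dim`. The sketch below shows how such descriptors are typically compared for place recognition; it is a self-contained illustration in which random tensors stand in for descriptors of real query and database images, and the 2048-dimensional size matches the EigenPlaces ResNet50 model.

```python
import torch
import torch.nn.functional as F

# Hypothetical descriptors: 1 query vs. a database of 5 places. With a real model
# they would come from model(images) instead of torch.rand.
query_desc = F.normalize(torch.rand(1, 2048), dim=-1)
db_descs = F.normalize(torch.rand(5, 2048), dim=-1)

# Cosine similarity between the query and every database descriptor;
# the most similar database image is the predicted place.
similarities = query_desc @ db_descs.T          # shape: (1, 5)
best_match = similarities.argmax(dim=-1).item()
print(f"Predicted place index: {best_match}")
```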