├── .github
│   └── copilot-instructions.md
├── .gitignore
├── .gitmodules
├── README.md
├── checkpoints
│   └── .gitkeep
├── configs
│   ├── dataset
│   │   ├── nclt
│   │   │   ├── all_camera_lidar.yaml
│   │   │   ├── all_camera_semantic.yaml
│   │   │   ├── all_camera_semantic_lidar.yaml
│   │   │   ├── all_camera_semantic_text.yaml
│   │   │   ├── all_camera_semantic_text_lidar.yaml
│   │   │   ├── all_camera_text.yaml
│   │   │   ├── all_camera_text_lidar.yaml
│   │   │   ├── camera1.yaml
│   │   │   ├── camera1_lidar.yaml
│   │   │   ├── camera2-front-back.yaml
│   │   │   ├── camera2-left-right.yaml
│   │   │   ├── camera5.yaml
│   │   │   ├── lidar.yaml
│   │   │   ├── semantic1.yaml
│   │   │   ├── semantic2-front-back.yaml
│   │   │   ├── semantic2-left-right.yaml
│   │   │   ├── semantic5.yaml
│   │   │   ├── text1_clip-base.yaml
│   │   │   ├── text1_clip-large.yaml
│   │   │   ├── text1_tfidf.yaml
│   │   │   ├── text2-front-back_clip-base.yaml
│   │   │   ├── text2-front-back_clip-large.yaml
│   │   │   ├── text2-front-back_tfidf.yaml
│   │   │   ├── text2-left-right_clip-base.yaml
│   │   │   ├── text2-left-right_clip-large.yaml
│   │   │   ├── text2-left-right_tfidf.yaml
│   │   │   ├── text5_clip-base.yaml
│   │   │   ├── text5_clip-large.yaml
│   │   │   └── text5_tfidf.yaml
│   │   └── oxford
│   │       ├── all_camera_lidar.yaml
│   │       ├── all_camera_semantic.yaml
│   │       ├── all_camera_semantic_lidar.yaml
│   │       ├── all_camera_semantic_text.yaml
│   │       ├── all_camera_semantic_text_lidar.yaml
│   │       ├── all_camera_text.yaml
│   │       ├── all_camera_text_lidar.yaml
│   │       ├── camera1.yaml
│   │       ├── camera1_lidar.yaml
│   │       ├── camera2-front-back.yaml
│   │       ├── camera2-left-right.yaml
│   │       ├── camera4.yaml
│   │       ├── lidar.yaml
│   │       ├── semantic1.yaml
│   │       ├── semantic2-front-back.yaml
│   │       ├── semantic2-left-right.yaml
│   │       ├── semantic4.yaml
│   │       ├── text1_clip-base.yaml
│   │       ├── text1_clip-large.yaml
│   │       ├── text1_tfidf.yaml
│   │       ├── text2-front-back_clip-base.yaml
│   │       ├── text2-front-back_clip-large.yaml
│   │       ├── text2-front-back_tfidf.yaml
│   │       ├── text2-left-right_clip-base.yaml
│   │       ├── text2-left-right_clip-large.yaml
│   │       ├── text2-left-right_tfidf.yaml
│   │       ├── text4_clip-base.yaml
│   │       ├── text4_clip-large.yaml
│   │       └── text4_tfidf.yaml
│   ├── loss
│   │   └── batch_hard_triplet_margin.yaml
│   ├── model
│   │   ├── camera1.yaml
│   │   ├── camera2_add.yaml
│   │   ├── camera2_concat.yaml
│   │   ├── camera2_gem.yaml
│   │   ├── camera2_mlp-full.yaml
│   │   ├── camera2_mlp-half.yaml
│   │   ├── camera2_sa-add.yaml
│   │   ├── camera2_sa-concat.yaml
│   │   ├── convnext_camera1.yaml
│   │   ├── convnext_semantic1.yaml
│   │   ├── lidar.yaml
│   │   ├── minkloc-multimodal.yaml
│   │   ├── minkloc3dv2.yaml
│   │   ├── mssplace-i.yaml
│   │   ├── mssplace-li.yaml
│   │   ├── mssplace-lis.yaml
│   │   ├── mssplace-list.yaml
│   │   ├── mssplace-lit.yaml
│   │   ├── semantic1.yaml
│   │   ├── semantic2_add.yaml
│   │   ├── semantic2_concat.yaml
│   │   ├── semantic2_mlp-full.yaml
│   │   ├── semantic2_mlp-half.yaml
│   │   ├── semantic2_sa-add.yaml
│   │   ├── semantic2_sa-concat.yaml
│   │   ├── text1_clip-base-mlp.yaml
│   │   ├── text1_clip-large-mlp.yaml
│   │   ├── text1_tfidf-mlp.yaml
│   │   ├── text2_clip-base-mlp-add.yaml
│   │   ├── text2_clip-base-mlp-concat.yaml
│   │   ├── text2_clip-large-mlp-add.yaml
│   │   ├── text2_clip-large-mlp-concat.yaml
│   │   ├── text2_tfidf-mlp-add.yaml
│   │   └── text2_tfidf-mlp-concat.yaml
│   ├── optimizer
│   │   └── adam.yaml
│   ├── sampler
│   │   └── batch_sampler.yaml
│   ├── scheduler
│   │   └── multi_step.yaml
│   └── train_unimodal.yaml
├── docker
│   ├── Dockerfile.cuda
│   ├── build.sh
│   ├── into.sh
│   └── start.sh
├── images
│   └── mssplace_overview.jpg
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── scripts
│   └── evaluation
│       ├── evaluate_checkpoints.py
│       └── failure_cases.ipynb
├── src
│   └── mssplace
│       ├── __init__.py
│       ├── datasets.py
│       ├── modality_interaction_layers.py
│       └── models.py
└── train_unimodal.py

--------------------------------------------------------------------------------
/.github/copilot-instructions.md:
--------------------------------------------------------------------------------
---
applyTo: '**'
---

# Project-Specific Guidance – Research Reproducibility

## 🧪 Research Code Clarity

- Prioritize code **readability and clarity** over performance.
- Use **explicit, well-named variables** and **clear control flow**.

## 🧾 Reproducibility and Documentation

- Ensure all code is **well-documented** with in-line comments and Google-style docstrings.
- Write **comprehensive Markdown instructions** for reproducing experiments.

## 🔬 PyTorch and Python Modernity

- Use **Python 3.10+** and **PyTorch 2.1+** features idiomatically.
- Prefer built-in types and modern syntax for clarity.

## Goal

Support reproducible research with clean, understandable code and thorough documentation.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
checkpoints/*
!checkpoints/.gitkeep
outputs/
.ruff_cache/

# Development and organization files (keep local, don't publish)
work_in_progress/

### Defaults for Python:

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "third_party/OpenPlaceRecognition"]
	path = third_party/OpenPlaceRecognition
	url = https://github.com/alexmelekhin/OpenPlaceRecognition.git

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MSSPlace: Multi-Sensor Place Recognition with Visual and Text Semantics

This repository contains the code for the paper "MSSPlace: Multi-Sensor Place Recognition with Visual and Text Semantics".

![mssplace_overview](./images/mssplace_overview.jpg)

_High-level overview of the proposed multimodal MSSPlace method. The MSSPlace model has a modular architecture consisting of four branches: the Image Encoder, Semantic Masks Encoder, Text Encoder, and Point Cloud Encoder. Each branch encodes its input data into a descriptor that captures the essential information of its modality. A descriptor aggregation step then combines these individual descriptors into a global place descriptor that represents the comprehensive characteristics of the vehicle's location._

## Installation

Initialize submodules and build the Docker environment:

```bash
git submodule update --init --recursive
bash docker/build.sh
bash docker/start.sh [DATASETS_DIR]  # DATASETS_DIR will be mounted at /home/docker_mssplace/Datasets
bash docker/into.sh
```

### Package Installation

Install the MSSPlace package in editable mode for development:

```bash
pip install -e .
```

This installs the package from the `src/mssplace` directory, allowing you to import `mssplace` modules directly without path modifications.

## Quick Start

Evaluate pre-trained models on the Oxford RobotCar or NCLT datasets:

```bash
# Download checkpoints and datasets first (see sections below)
python scripts/evaluation/evaluate_checkpoints.py --dataset oxford --model mssplace-li
python scripts/evaluation/evaluate_checkpoints.py --dataset nclt --model mssplace-list --verbose
```

## Evaluation

### Performance Metrics

- **AR@1**: Average recall (%) when only the top-1 retrieved match is considered
- **AR@1%**: Average recall (%) when the top 1% of the database is considered as potential matches

An illustrative sketch of how these metrics are computed is given in the "Computing the Metrics" subsection below.

### Model Variants

| Model | Modalities | AR@1 (Oxford) | AR@1% (Oxford) | AR@1 (NCLT) | AR@1% (NCLT) | Description |
|-------|------------|---------------|----------------|-------------|--------------|-------------|
| `mssplace-li` | LiDAR + Images | 98.21% | 99.53% | 94.67% | 97.72% | Basic multimodal |
| `mssplace-lis` | LiDAR + Images + Semantics | **98.55%** | **99.64%** | **95.37%** | **97.84%** | Adds semantic segmentation |
| `mssplace-lit` | LiDAR + Images + Text | 98.22% | 99.53% | 92.36% | 96.51% | Adds text descriptions |
| `mssplace-list` | LiDAR + Images + Semantics + Text | **98.55%** | **99.64%** | 94.15% | 96.97% | Complete multimodal |

*Performance metrics measured on the Oxford RobotCar and NCLT datasets. Best results per dataset are highlighted in bold.*

**Key Insights:**
- `mssplace-lis` achieves the best performance on NCLT, while both `mssplace-lis` and `mssplace-list` tie for best on Oxford
- Semantic segmentation consistently helps place recognition on both datasets
- The text modality shows dataset-dependent behavior: it hurts performance on NCLT but is roughly neutral on Oxford
- The Oxford dataset appears easier than NCLT (all models achieve >98% AR@1, versus 92–95% on NCLT)
- The complete multimodal `mssplace-list` performs well but does not consistently exceed the semantic-only variant (`mssplace-lis`)

### Pre-trained Checkpoints

⚠️ **Work in Progress**: Checkpoint download links will be updated soon. Please check back later for access to pre-trained models.

### Datasets

⚠️ **Work in Progress**: Preprocessed datasets will be made publicly available for download soon. Please check back later for dataset access.
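
### Computing the Metrics (Illustrative)

The snippet below is a minimal sketch of how AR@1 and AR@1% can be computed from global descriptors. It is illustrative only, not the exact code used in `scripts/evaluation/evaluate_checkpoints.py`, and it assumes `positives[i]` (the set of database indices that are true matches for query `i`, e.g. within the 10 m positive threshold used in the dataset configs) has been precomputed.

```python
import numpy as np


def average_recall(
    query_desc: np.ndarray,      # (num_queries, dim) query descriptors
    db_desc: np.ndarray,         # (num_database, dim) database descriptors
    positives: list[set[int]],   # true-match database indices per query
    top_percent: float | None = None,
) -> float:
    """Return AR@1 if `top_percent` is None, else AR@top_percent% (e.g. 1.0 for AR@1%)."""
    # Number of candidates to retrieve: 1 for AR@1, the top 1% of the database for AR@1%.
    k = 1 if top_percent is None else max(1, round(len(db_desc) * top_percent / 100))
    hits = 0
    for query, true_matches in zip(query_desc, positives):
        distances = np.linalg.norm(db_desc - query, axis=1)  # L2 distance to every database entry
        retrieved = np.argsort(distances)[:k]                # indices of the k nearest descriptors
        hits += bool(set(retrieved.tolist()) & true_matches)  # success if any candidate is a true match
    return 100.0 * hits / len(query_desc)
```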

### Directory Structure

```
/home/docker_mssplace/
├── MSSPlace/                      # This repository
│   ├── checkpoints/               # Downloaded checkpoints
│   ├── configs/                   # Configuration files
│   ├── docker/                    # Docker environment setup
│   ├── docs/                      # Documentation and examples
│   ├── images/                    # Example images and figures
│   ├── scripts/                   # Organized scripts
│   ├── src/                       # Core source code
│   └── third_party/               # External dependencies
│       └── OpenPlaceRecognition/  # Core OPR library
└── Datasets/                      # Dataset directory (configurable with --datasets-dir)
    ├── pnvlad_oxford_robotcar/
    └── NCLT_preprocessed/
```

### Key Arguments

| Argument | Default | Description |
|----------|---------|-------------|
| `--dataset` | *Required* | `oxford` or `nclt` |
| `--model` | *Required* | Model variant (see table above) |
| `--datasets-dir` | `/home/docker_mssplace/Datasets` | Path to datasets directory |
| `--checkpoints-dir` | `./checkpoints` | Path to model checkpoints |
| `--configs-dir` | `./configs/model` | Path to model configurations |
| `--batch-size` | `32` | Evaluation batch size |
| `--verbose` | `False` | Enable detailed logging |

**Example Usage:**
```bash
# Basic evaluation
python scripts/evaluation/evaluate_checkpoints.py --dataset oxford --model mssplace-li

# Custom dataset location
python scripts/evaluation/evaluate_checkpoints.py \
    --dataset nclt --model mssplace-lis \
    --datasets-dir /path/to/your/datasets \
    --verbose
```

## Training (Optional)

⚠️ **Work in Progress**: Training documentation and scripts will be updated soon. Please check back later for training instructions. In the meantime, the sketch below shows how the Hydra-style configs in `configs/` can be loaded and instantiated.
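
### Config Instantiation (Illustrative)

The configs under `configs/` follow Hydra's `_target_` convention, so datasets and models can typically be built directly from the YAML files. The snippet below is a minimal sketch under the following assumptions: Hydra and OmegaConf are available, the code runs from the repository root (so that targets such as `src.datasets.NCLTDatasetWithText` resolve), and `dataset_root` is overridden to point at real data.

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Load a dataset config and point it at an actual dataset location.
cfg = OmegaConf.load("configs/dataset/nclt/camera1_lidar.yaml")
cfg.dataset_root = "/home/docker_mssplace/Datasets/NCLT_preprocessed"

# Hydra builds the object named by `_target_`, passing the remaining keys as kwargs.
# Some classes may require extra keyword arguments (e.g. a dataset split).
dataset = instantiate(cfg)
print(f"Loaded {len(dataset)} samples")
```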

## Troubleshooting

- **Missing checkpoints**: Download all `.pth` files to `checkpoints/`
- **Dataset errors**: Verify that the directory structure matches the expected format
- **CUDA memory**: Reduce `--batch-size` if you run out of memory
- **Dependencies**: Use the provided Docker environment

--------------------------------------------------------------------------------
/checkpoints/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmelekhin/MSSPlace/9ac48de75ee6a4ea01dac99d336d3a529bd73b61/checkpoints/.gitkeep

--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null

--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_semantic.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null

--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_semantic_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null

--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_semantic_text.yaml:
-------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5, 5 | mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5, 6 | text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5] 7 | positive_threshold: 10.0 8 | negative_threshold: 50.0 9 | images_dirname: images_small 10 | masks_dirname: segmentation_masks_small 11 | text_embeddings_dirname: clip-vit-base-patch32 12 | pointclouds_dirname: velodyne_data 13 | pointcloud_quantization_size: 0.5 14 | max_point_distance: 100.0 15 | spherical_coords: False 16 | use_intensity_values: False 17 | image_transform: null 18 | semantic_transform: null 19 | pointcloud_transform: null 20 | pointcloud_set_transform: null 21 | -------------------------------------------------------------------------------- /configs/dataset/nclt/all_camera_semantic_text_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5, 5 | mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5, 6 | text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5, 7 | pointcloud_lidar,] 8 | positive_threshold: 10.0 9 | negative_threshold: 50.0 10 | images_dirname: images_small 11 | masks_dirname: segmentation_masks_small 12 | text_embeddings_dirname: clip-vit-base-patch32 13 | pointclouds_dirname: velodyne_data 14 | pointcloud_quantization_size: 0.5 15 | max_point_distance: 100.0 16 | spherical_coords: False 17 | use_intensity_values: False 18 | image_transform: null 19 | semantic_transform: null 20 | pointcloud_transform: null 21 | pointcloud_set_transform: null 22 | -------------------------------------------------------------------------------- /configs/dataset/nclt/all_camera_text.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5, 5 | text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: velodyne_data 12 | pointcloud_quantization_size: 0.5 13 | max_point_distance: 100.0 14 | spherical_coords: False 15 | use_intensity_values: False 16 | image_transform: null 17 | semantic_transform: null 18 | pointcloud_transform: null 19 | pointcloud_set_transform: null 20 | -------------------------------------------------------------------------------- /configs/dataset/nclt/all_camera_text_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5, 5 | text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5, 6 | pointcloud_lidar,] 7 | positive_threshold: 10.0 8 | negative_threshold: 50.0 9 | images_dirname: images_small 10 | masks_dirname: segmentation_masks_small 11 | text_embeddings_dirname: clip-vit-base-patch32 12 | pointclouds_dirname: velodyne_data 13 | pointcloud_quantization_size: 0.5 14 | max_point_distance: 100.0 15 | 
spherical_coords: False 16 | use_intensity_values: False 17 | image_transform: null 18 | semantic_transform: null 19 | pointcloud_transform: null 20 | pointcloud_set_transform: null 21 | -------------------------------------------------------------------------------- /configs/dataset/nclt/camera1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/camera1_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, 5 | pointcloud_lidar,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: velodyne_data 12 | pointcloud_quantization_size: 0.5 13 | max_point_distance: 100.0 14 | spherical_coords: False 15 | use_intensity_values: False 16 | image_transform: null 17 | semantic_transform: null 18 | pointcloud_transform: null 19 | pointcloud_set_transform: null 20 | -------------------------------------------------------------------------------- /configs/dataset/nclt/camera2-front-back.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam5, image_Cam2] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/camera2-left-right.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam4] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/camera5.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 
| 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [pointcloud_lidar,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/semantic1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/semantic2-front-back.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_Cam5, mask_Cam2] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/semantic2-left-right.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_Cam1, mask_Cam4] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: 
null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/semantic5.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text1_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text1_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text1_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-front-back_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: 
/path/to/dataset 4 | data_to_load: [text_Cam5, text_Cam2,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-front-back_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam5, text_Cam2,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-front-back_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam5, text_Cam2,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-left-right_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam4,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-left-right_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam4,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: 
clip-vit-large-patch14 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-left-right_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam4,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text5_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text5_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text5_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | 
use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | pointcloud_lidar,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: null 12 | pointcloud_quantization_size: 0.01 13 | max_point_distance: null 14 | spherical_coords: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_semantic.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: null 12 | pointcloud_quantization_size: 0.01 13 | max_point_distance: null 14 | spherical_coords: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_semantic_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right, 6 | pointcloud_lidar,] 7 | positive_threshold: 10.0 8 | negative_threshold: 50.0 9 | images_dirname: images_small 10 | masks_dirname: segmentation_masks_small 11 | text_embeddings_dirname: clip-vit-base-patch32 12 | pointclouds_dirname: null 13 | pointcloud_quantization_size: 0.01 14 | max_point_distance: null 15 | spherical_coords: False 16 | image_transform: null 17 | semantic_transform: null 18 | pointcloud_transform: null 19 | pointcloud_set_transform: null 20 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_semantic_text.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right, 6 | text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,] 7 | positive_threshold: 10.0 8 | negative_threshold: 50.0 9 | images_dirname: images_small 10 | masks_dirname: segmentation_masks_small 11 
| text_embeddings_dirname: clip-vit-base-patch32 12 | pointclouds_dirname: null 13 | pointcloud_quantization_size: 0.01 14 | max_point_distance: null 15 | spherical_coords: False 16 | image_transform: null 17 | semantic_transform: null 18 | pointcloud_transform: null 19 | pointcloud_set_transform: null 20 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_semantic_text_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right, 6 | text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right, 7 | pointcloud_lidar,] 8 | positive_threshold: 10.0 9 | negative_threshold: 50.0 10 | images_dirname: images_small 11 | masks_dirname: segmentation_masks_small 12 | text_embeddings_dirname: clip-vit-base-patch32 13 | pointclouds_dirname: null 14 | pointcloud_quantization_size: 0.01 15 | max_point_distance: null 16 | spherical_coords: False 17 | image_transform: null 18 | semantic_transform: null 19 | pointcloud_transform: null 20 | pointcloud_set_transform: null 21 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_text.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: null 12 | pointcloud_quantization_size: 0.01 13 | max_point_distance: null 14 | spherical_coords: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_text_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right, 6 | pointcloud_lidar,] 7 | positive_threshold: 10.0 8 | negative_threshold: 50.0 9 | images_dirname: images_small 10 | masks_dirname: segmentation_masks_small 11 | text_embeddings_dirname: clip-vit-base-patch32 12 | pointclouds_dirname: null 13 | pointcloud_quantization_size: 0.01 14 | max_point_distance: null 15 | spherical_coords: False 16 | image_transform: null 17 | semantic_transform: null 18 | pointcloud_transform: null 19 | pointcloud_set_transform: null 20 | -------------------------------------------------------------------------------- /configs/dataset/oxford/camera1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre,] 5 | positive_threshold: 10.0 6 | negative_threshold: 
50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/camera1_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, 5 | pointcloud_lidar,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: null 12 | pointcloud_quantization_size: 0.01 13 | max_point_distance: null 14 | spherical_coords: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/oxford/camera2-front-back.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_rear] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/camera2-left-right.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_mono_left, image_mono_right] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/camera4.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/lidar.yaml: 
-------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [pointcloud_lidar,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/semantic1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_stereo_centre,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/semantic2-front-back.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_stereo_centre, mask_mono_rear] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/semantic2-left-right.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_mono_left, mask_mono_right] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/semantic4.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | 
-------------------------------------------------------------------------------- /configs/dataset/oxford/text1_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text1_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text1_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-front-back_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_rear,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-front-back_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_rear,] 5 | positive_threshold: 10.0 6 | 
negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-front-back_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_rear,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-left-right_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_mono_left, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-left-right_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_mono_left, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-left-right_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_mono_left, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | 
pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text4_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text4_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text4_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/loss/batch_hard_triplet_margin.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.losses.BatchHardTripletMarginLoss 2 | 3 | margin: 0.2 4 | -------------------------------------------------------------------------------- /configs/model/camera1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.ResNet18 2 | 3 | in_channels: 3 4 | out_channels: 256 5 | num_top_down: 0 6 | pooling: gem 7 | -------------------------------------------------------------------------------- /configs/model/camera2_add.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | 
fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: opr.modules.Add 14 | -------------------------------------------------------------------------------- /configs/model/camera2_concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: opr.modules.Concat 14 | -------------------------------------------------------------------------------- /configs/model/camera2_gem.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: opr.modules.GeMFusion 14 | -------------------------------------------------------------------------------- /configs/model/camera2_mlp-full.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.Concat 16 | - _target_: opr.modules.MLP 17 | in_features: 512 18 | out_features: 512 19 | -------------------------------------------------------------------------------- /configs/model/camera2_mlp-half.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.Concat 16 | - _target_: opr.modules.MLP 17 | in_features: 512 18 | out_features: 256 19 | -------------------------------------------------------------------------------- /configs/model/camera2_sa-add.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.SelfAttention 16 | embed_size: 256 17 | - _target_: opr.modules.Add 18 | -------------------------------------------------------------------------------- /configs/model/camera2_sa-concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 
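# Added note: this two-camera variant pools per-camera ResNet18-FPN features with
# GeM, then fuses the two descriptors by self-attention followed by concatenation
# (the fusion block below).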
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.SelfAttention 16 | embed_size: 256 17 | - _target_: opr.modules.Concat 18 | -------------------------------------------------------------------------------- /configs/model/convnext_camera1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ConvNeXtTinyFeatureExtractor 5 | in_channels: 3 6 | pretrained: True 7 | head: 8 | _target_: torch.nn.Sequential 9 | _args_: 10 | - _target_: opr.modules.GeM 11 | - _target_: torch.nn.Linear 12 | in_features: 768 13 | out_features: 256 14 | -------------------------------------------------------------------------------- /configs/model/convnext_semantic1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ConvNeXtTinyFeatureExtractor 5 | in_channels: 1 6 | pretrained: False 7 | head: 8 | _target_: torch.nn.Sequential 9 | _args_: 10 | - _target_: opr.modules.GeM 11 | - _target_: torch.nn.Linear 12 | in_features: 768 13 | out_features: 256 14 | -------------------------------------------------------------------------------- /configs/model/lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.MinkLoc3Dv2 2 | 3 | in_channels: 1 4 | out_channels: 256 5 | num_top_down: 2 6 | conv0_kernel_size: 5 7 | block: ECABasicBlock 8 | layers: [1, 1, 1, 1] 9 | planes: [64, 128, 64, 32] 10 | pooling: gem 11 | -------------------------------------------------------------------------------- /configs/model/minkloc-multimodal.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.LateFusionModel 2 | 3 | image_module: 4 | _target_: opr.models.place_recognition.ResNet18 5 | in_channels: 3 6 | out_channels: 128 7 | num_top_down: 0 8 | pooling: gem 9 | 10 | cloud_module: 11 | _target_: opr.models.place_recognition.MinkLoc3D 12 | in_channels: 1 13 | out_channels: 128 14 | num_top_down: 1 15 | conv0_kernel_size: 5 16 | block: ECABasicBlock 17 | layers: [1, 1, 1] 18 | planes: [32, 64, 64] 19 | pooling: gem 20 | 21 | fusion_module: 22 | _target_: opr.modules.Concat 23 | -------------------------------------------------------------------------------- /configs/model/minkloc3dv2.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.MinkLoc3Dv2 2 | 3 | in_channels: 1 4 | out_channels: 256 5 | num_top_down: 2 6 | conv0_kernel_size: 5 7 | block: ECABasicBlock 8 | layers: [1, 1, 1, 1] 9 | planes: [64, 128, 64, 32] 10 | pooling: gem 11 | -------------------------------------------------------------------------------- /configs/model/mssplace-i.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | backbone: 3 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 4 | in_channels: 3 5 | lateral_dim: 256 6 | fh_num_bottom_up: 4 7 | 
fh_num_top_down: 0 8 | pretrained: True 9 | head: 10 | _target_: opr.modules.GeM 11 | fusion: 12 | _target_: opr.modules.Add 13 | -------------------------------------------------------------------------------- /configs/model/mssplace-li.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.LateFusionModel 2 | 3 | image_module: 4 | _target_: opr.models.place_recognition.base.ImageModel 5 | backbone: 6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 7 | in_channels: 3 8 | lateral_dim: 256 9 | fh_num_bottom_up: 4 10 | fh_num_top_down: 0 11 | pretrained: True 12 | head: 13 | _target_: opr.modules.GeM 14 | fusion: 15 | _target_: opr.modules.Add 16 | 17 | cloud_module: 18 | _target_: opr.models.place_recognition.MinkLoc3Dv2 19 | in_channels: 1 20 | out_channels: 256 21 | num_top_down: 2 22 | conv0_kernel_size: 5 23 | block: ECABasicBlock 24 | layers: [1, 1, 1, 1] 25 | planes: [64, 128, 64, 32] 26 | pooling: gem 27 | 28 | fusion_module: 29 | _target_: opr.modules.Concat 30 | -------------------------------------------------------------------------------- /configs/model/mssplace-lis.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.LateFusionModel 2 | 3 | image_module: 4 | _target_: opr.models.place_recognition.base.ImageModel 5 | backbone: 6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 7 | in_channels: 3 8 | lateral_dim: 256 9 | fh_num_bottom_up: 4 10 | fh_num_top_down: 0 11 | pretrained: True 12 | head: 13 | _target_: opr.modules.GeM 14 | fusion: 15 | _target_: opr.modules.Add 16 | 17 | cloud_module: 18 | _target_: opr.models.place_recognition.MinkLoc3Dv2 19 | in_channels: 1 20 | out_channels: 256 21 | num_top_down: 2 22 | conv0_kernel_size: 5 23 | block: ECABasicBlock 24 | layers: [1, 1, 1, 1] 25 | planes: [64, 128, 64, 32] 26 | pooling: gem 27 | 28 | semantic_module: 29 | _target_: opr.models.place_recognition.base.SemanticModel 30 | backbone: 31 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 32 | in_channels: 1 33 | lateral_dim: 256 34 | fh_num_bottom_up: 4 35 | fh_num_top_down: 0 36 | pretrained: False 37 | head: 38 | _target_: opr.modules.GeM 39 | fusion: 40 | _target_: opr.modules.Add 41 | 42 | fusion_module: 43 | _target_: opr.modules.Concat 44 | -------------------------------------------------------------------------------- /configs/model/mssplace-list.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.LateFusionModel 2 | 3 | image_module: 4 | _target_: opr.models.place_recognition.base.ImageModel 5 | backbone: 6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 7 | in_channels: 3 8 | lateral_dim: 256 9 | fh_num_bottom_up: 4 10 | fh_num_top_down: 0 11 | pretrained: True 12 | head: 13 | _target_: opr.modules.GeM 14 | fusion: 15 | _target_: opr.modules.Add 16 | 17 | cloud_module: 18 | _target_: opr.models.place_recognition.MinkLoc3Dv2 19 | in_channels: 1 20 | out_channels: 256 21 | num_top_down: 2 22 | conv0_kernel_size: 5 23 | block: ECABasicBlock 24 | layers: [1, 1, 1, 1] 25 | planes: [64, 128, 64, 32] 26 | pooling: gem 27 | 28 | semantic_module: 29 | _target_: opr.models.place_recognition.base.SemanticModel 30 | backbone: 31 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 32 | in_channels: 1 33 | lateral_dim: 256 34 | fh_num_bottom_up: 4 35 | 
fh_num_top_down: 0 36 | pretrained: False 37 | head: 38 | _target_: opr.modules.GeM 39 | fusion: 40 | _target_: opr.modules.Add 41 | 42 | text_module: 43 | _target_: src.models.TextModel 44 | model: 45 | _target_: opr.modules.MLP 46 | in_features: 512 47 | out_features: 256 48 | drop: 0.5 49 | fusion: 50 | _target_: opr.modules.Add 51 | 52 | fusion_module: 53 | _target_: opr.modules.Concat 54 | -------------------------------------------------------------------------------- /configs/model/mssplace-lit.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.LateFusionModel 2 | 3 | image_module: 4 | _target_: opr.models.place_recognition.base.ImageModel 5 | backbone: 6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 7 | in_channels: 3 8 | lateral_dim: 256 9 | fh_num_bottom_up: 4 10 | fh_num_top_down: 0 11 | pretrained: True 12 | head: 13 | _target_: opr.modules.GeM 14 | fusion: 15 | _target_: opr.modules.Add 16 | 17 | cloud_module: 18 | _target_: opr.models.place_recognition.MinkLoc3Dv2 19 | in_channels: 1 20 | out_channels: 256 21 | num_top_down: 2 22 | conv0_kernel_size: 5 23 | block: ECABasicBlock 24 | layers: [1, 1, 1, 1] 25 | planes: [64, 128, 64, 32] 26 | pooling: gem 27 | 28 | text_module: 29 | _target_: src.models.TextModel 30 | model: 31 | _target_: opr.modules.MLP 32 | in_features: 512 33 | out_features: 256 34 | drop: 0.5 35 | fusion: 36 | _target_: opr.modules.Add 37 | 38 | fusion_module: 39 | _target_: opr.modules.Concat 40 | -------------------------------------------------------------------------------- /configs/model/semantic1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.SemanticResNet18 2 | 3 | in_channels: 1 4 | out_channels: 256 5 | num_top_down: 0 6 | pooling: gem 7 | -------------------------------------------------------------------------------- /configs/model/semantic2_add.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: opr.modules.Add 14 | -------------------------------------------------------------------------------- /configs/model/semantic2_concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: opr.modules.Concat 14 | -------------------------------------------------------------------------------- /configs/model/semantic2_mlp-full.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 
| - _target_: opr.modules.Concat 16 | - _target_: opr.modules.MLP 17 | in_features: 512 18 | out_features: 512 19 | -------------------------------------------------------------------------------- /configs/model/semantic2_mlp-half.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.Concat 16 | - _target_: opr.modules.MLP 17 | in_features: 512 18 | out_features: 256 19 | -------------------------------------------------------------------------------- /configs/model/semantic2_sa-add.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.SelfAttention 16 | embed_size: 256 17 | - _target_: opr.modules.Add 18 | -------------------------------------------------------------------------------- /configs/model/semantic2_sa-concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.SelfAttention 16 | embed_size: 256 17 | - _target_: opr.modules.Concat 18 | -------------------------------------------------------------------------------- /configs/model/text1_clip-base-mlp.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 512 6 | out_features: 256 7 | drop: 0.5 8 | fusion: null 9 | -------------------------------------------------------------------------------- /configs/model/text1_clip-large-mlp.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 768 6 | out_features: 256 7 | drop: 0.5 8 | fusion: null 9 | -------------------------------------------------------------------------------- /configs/model/text1_tfidf-mlp.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 128 6 | out_features: 128 7 | drop: 0.5 8 | fusion: null 9 | -------------------------------------------------------------------------------- /configs/model/text2_clip-base-mlp-add.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 512 6 | out_features: 256 7 
| drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Add 10 | -------------------------------------------------------------------------------- /configs/model/text2_clip-base-mlp-concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 512 6 | out_features: 256 7 | drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Concat 10 | -------------------------------------------------------------------------------- /configs/model/text2_clip-large-mlp-add.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 768 6 | out_features: 256 7 | drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Add 10 | -------------------------------------------------------------------------------- /configs/model/text2_clip-large-mlp-concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 768 6 | out_features: 256 7 | drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Concat 10 | -------------------------------------------------------------------------------- /configs/model/text2_tfidf-mlp-add.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 128 6 | out_features: 128 7 | drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Add 10 | -------------------------------------------------------------------------------- /configs/model/text2_tfidf-mlp-concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 128 6 | out_features: 128 7 | drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Concat 10 | -------------------------------------------------------------------------------- /configs/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | _convert_: all 3 | 4 | lr: 0.001 5 | weight_decay: 0.0001 6 | -------------------------------------------------------------------------------- /configs/sampler/batch_sampler.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.samplers.BatchSampler 2 | 3 | batch_size: 16 4 | batch_size_limit: 128 5 | batch_expansion_rate: 1.4 6 | max_batches: null 7 | positives_per_group: 2 8 | seed: ${seed} 9 | drop_last: True 10 | -------------------------------------------------------------------------------- /configs/scheduler/multi_step.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.lr_scheduler.MultiStepLR 2 | gamma: 0.1 3 | milestones: [40, 60] 4 | -------------------------------------------------------------------------------- /configs/train_unimodal.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - dataset: nclt/lidar 4 | - sampler: batch_sampler 5 | - model: lidar 6 | - loss: batch_hard_triplet_margin 7 | - optimizer: adam 8 | - scheduler: multi_step 9 | 10 | wandb: 11 | disabled: false 12 | project: CVPR2024 13 | 14 | debug: false 15 | device: cuda 16 | seed: 3121999 17 | num_workers: 4 18 | 19 | 
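# Added note: ??? is OmegaConf's mandatory-value marker; exp_name has no default
# and must be supplied as an override, e.g. python train_unimodal.py exp_name=my_run.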
exp_name: ??? 20 | 21 | epochs: 80 22 | batch_expansion_threshold: 0.7 23 | -------------------------------------------------------------------------------- /docker/Dockerfile.cuda: -------------------------------------------------------------------------------- 1 | FROM alexmelekhin/open-place-recognition:base 2 | 3 | RUN apt-get update && apt-get upgrade -y && apt-get install -y \ 4 | libcairo2-dev \ 5 | libgirepository1.0-dev \ 6 | libdbus-1-dev \ 7 | libdbus-glib-1-dev \ 8 | && rm -rf /var/lib/apt/lists/* 9 | 10 | # to install "dvc[gdrive]" we need to install "distro" package first 11 | ARG DISTRO_VERSION=1.9.0 12 | RUN pip install distro==${DISTRO_VERSION} 13 | 14 | # install other requirements from requirements.txt 15 | COPY requirements.txt . 16 | RUN pip install -r requirements.txt && \ 17 | rm requirements.txt 18 | 19 | # add user and his password 20 | ENV USER=docker_mssplace 21 | ARG UID=1000 22 | ARG GID=1000 23 | # default password 24 | ARG PW=user 25 | 26 | RUN useradd -m ${USER} --uid=${UID} && echo "${USER}:${PW}" | chpasswd && adduser ${USER} sudo 27 | WORKDIR /home/${USER} 28 | 29 | # create some directories for mounting volumes 30 | RUN mkdir MSSPlace && chown -R ${UID}:${GID} /home/${USER} 31 | RUN mkdir Datasets && chown -R ${UID}:${GID} /home/${USER} 32 | 33 | USER ${UID}:${GID} 34 | 35 | # install OpenPlaceRecognition library 36 | COPY --chown=${UID}:${GID} ./third_party/OpenPlaceRecognition ./OpenPlaceRecognition 37 | RUN cd OpenPlaceRecognition && \ 38 | pip install --user . && \ 39 | cd .. && \ 40 | rm -rf OpenPlaceRecognition 41 | -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | orange=`tput setaf 3` 4 | reset_color=`tput sgr0` 5 | 6 | ARCH=`uname -m` 7 | if [ $ARCH != "x86_64" ]; then 8 | echo "${orange}${ARCH}${reset_color} architecture is not supported" 9 | exit 1 10 | fi 11 | 12 | if command -v nvidia-smi &> /dev/null; then 13 | echo "Detected ${orange}CUDA${reset_color} hardware" 14 | DOCKERFILE=Dockerfile.cuda 15 | DEVICE=cuda 16 | else 17 | echo "${orange}CPU-only${reset_color} build is not supported" 18 | exit 1 19 | fi 20 | 21 | echo "Building for ${orange}${ARCH}${reset_color} with ${orange}${DEVICE}${reset_color}" 22 | 23 | PROJECT_ROOT_DIR=$(cd ./"`dirname $0`"/.. 
|| exit; pwd) 24 | 25 | docker build $PROJECT_ROOT_DIR \ 26 | -f $PROJECT_ROOT_DIR/docker/$DOCKERFILE \ 27 | --build-arg UID=$(id -u) \ 28 | --build-arg GID=$(id -g) \ 29 | -t mssplace:latest 30 | -------------------------------------------------------------------------------- /docker/into.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker exec --user docker_mssplace -it ${USER}_mssplace \ 4 | /bin/bash -c "cd /home/docker_mssplace; echo ${USER}_mssplace container; echo ; /bin/bash" 5 | -------------------------------------------------------------------------------- /docker/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | orange=`tput setaf 3` 4 | reset_color=`tput sgr0` 5 | 6 | get_real_path(){ 7 | if [ "${1:0:1}" == "/" ]; then 8 | echo "$1" 9 | else 10 | realpath -m "$PWD"/"$1" 11 | fi 12 | } 13 | 14 | ARCH=`uname -m` 15 | if [ $ARCH == "x86_64" ]; then 16 | if command -v nvidia-smi &> /dev/null; then 17 | DEVICE=cuda 18 | ARGS="--ipc host --gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" 19 | else 20 | echo "${orange}CPU-only${reset_color} build is not supported" 21 | exit 1 22 | fi 23 | else 24 | echo "${orange}${ARCH}${reset_color} architecture is not supported" 25 | exit 1 26 | fi 27 | 28 | if [ $# != 1 ]; then 29 | echo "Usage: 30 | bash start.sh [DATASETS_DIR] 31 | " 32 | exit 1 33 | fi 34 | 35 | DATASETS_DIR=$(get_real_path "$1") 36 | 37 | if [ ! -d $DATASETS_DIR ]; then 38 | echo "Error: DATASETS_DIR=$DATASETS_DIR is not an existing directory." 39 | exit 1 40 | fi 41 | 42 | PROJECT_ROOT_DIR=$(cd ./"`dirname $0`"/.. || exit; pwd) 43 | 44 | echo "Running on ${orange}${ARCH}${reset_color} with ${orange}${DEVICE}${reset_color}" 45 | 46 | docker run -it -d --rm \ 47 | $ARGS \ 48 | --privileged \ 49 | --name ${USER}_mssplace \ 50 | --net host \ 51 | -v $PROJECT_ROOT_DIR:/home/docker_mssplace/MSSPlace:rw \ 52 | -v $DATASETS_DIR:/home/docker_mssplace/Datasets:rw \ 53 | mssplace:latest 54 | 55 | docker exec --user root \ 56 | ${USER}_mssplace bash -c "/etc/init.d/ssh start" 57 | -------------------------------------------------------------------------------- /images/mssplace_overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmelekhin/MSSPlace/9ac48de75ee6a4ea01dac99d336d3a529bd73b61/images/mssplace_overview.jpg -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "mssplace" 7 | version = "0.1.0" 8 | description = "Multi-Sensor Place Recognition with Visual and Text Semantics" 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | license = {text = "MIT"} 12 | authors = [ 13 | {name = "MSSPlace Team"} 14 | ] 15 | keywords = ["place recognition", "multimodal", "computer vision", "robotics"] 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Developers", 19 | "Intended Audience :: Science/Research", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.10", 24 | "Programming Language :: Python :: 3.11", 25 | "Programming Language :: Python :: 3.12", 26 | "Topic :: 
Scientific/Engineering :: Artificial Intelligence", 27 | "Topic :: Scientific/Engineering :: Image Recognition", 28 | ] 29 | 30 | # Dependencies - using requirements.txt for now, can be moved here later 31 | dependencies = [] 32 | 33 | [tool.setuptools.packages.find] 34 | where = ["src"] 35 | 36 | [tool.setuptools.package-dir] 37 | "" = "src" 38 | 39 | [tool.ruff] 40 | line-length = 110 41 | src = ["src"] 42 | 43 | [tool.ruff.format] 44 | quote-style = "double" 45 | indent-style = "space" 46 | skip-magic-trailing-comma = false 47 | line-ending = "auto" 48 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | ruff==0.11.11 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | albumentations==1.3.1 2 | antlr4-python3-runtime==4.9.3 3 | appdirs==1.4.4 4 | certifi==2025.4.26 5 | chardet==3.0.4 6 | charset-normalizer==3.3.2 7 | click==8.1.7 8 | contourpy==1.1.1 9 | cycler==0.12.1 10 | dbus-python==1.2.16 11 | distro==1.4.0 12 | docker-pycreds==0.4.0 13 | fonttools==4.45.1 14 | gitdb==4.0.11 15 | GitPython==3.1.44 16 | hydra-core==1.3.2 17 | idna==3.10 18 | imageio==2.33.0 19 | importlib-resources==6.1.1 20 | joblib==1.3.2 21 | kaleido==0.2.1 22 | kiwisolver==1.4.5 23 | lazy_loader==0.3 24 | loguru==0.7.2 25 | matplotlib==3.7.4 26 | networkx==3.1 27 | ninja==1.11.1.1 28 | numpy==1.24.4 29 | numpy-quaternion==2024.0.8 30 | omegaconf==2.3.0 31 | opencv-python==4.8.1.78 32 | opencv-python-headless==4.8.1.78 33 | packaging==23.2 34 | pandas==2.0.3 35 | Pillow==11.2.1 36 | plotly==5.18.0 37 | protobuf==4.25.1 38 | psutil==5.9.6 39 | PyGObject==3.36.0 40 | pyparsing==3.1.1 41 | python-dateutil==2.8.2 42 | pytorch-metric-learning==2.3.0 43 | pytz==2023.3.post1 44 | PyWavelets==1.4.1 45 | PyYAML==6.0.1 46 | qudida==0.0.4 47 | requests==2.32.3 48 | scikit-image==0.21.0 49 | scikit-learn==1.6.1 50 | scipy==1.10.1 51 | seaborn==0.13.2 52 | sentry-sdk==2.29.1 53 | setproctitle==1.3.3 54 | six==1.14.0 55 | smmap==5.0.1 56 | ssh-import-id==5.10 57 | tenacity==8.2.3 58 | threadpoolctl==3.2.0 59 | tifffile==2023.7.10 60 | tqdm==4.67.1 61 | typing_extensions==4.8.0 62 | tzdata==2023.3 63 | urllib3==2.4.0 64 | wandb==0.16.0 65 | zipp==3.21.0 66 | -------------------------------------------------------------------------------- /scripts/evaluation/evaluate_checkpoints.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Checkpoint Evaluation Script for MSSPlace Models 4 | 5 | This script evaluates pre-trained MSSPlace model checkpoints on Oxford and NCLT 6 | datasets to verify the results reported in our paper. It provides a clean interface 7 | for testing different model variants with comprehensive logging and error handling. 8 | 9 | Note: This script evaluates released checkpoints only. For full experimental 10 | reproduction including training from scratch, refer to the training scripts 11 | (not publicly released). 
12 | 13 | Key Features: 14 | - Supports text-enabled datasets for all model variants 15 | - Dynamic sensor setup configuration per model type 16 | - Loguru-based logging with colored output 17 | - Automatic sensor setup selection based on model name 18 | 19 | Usage: 20 | python evaluate_checkpoints.py --dataset oxford --model mssplace-li 21 | python evaluate_checkpoints.py --dataset nclt --model mssplace-lis --batch-size 16 22 | python evaluate_checkpoints.py --dataset oxford --model mssplace-lit --verbose 23 | 24 | Requirements: 25 | - PyTorch 2.1+ 26 | - Python 3.10+ 27 | - Hydra/OmegaConf for configuration management 28 | - Custom OPR (Open Place Recognition) library 29 | - loguru for enhanced logging 30 | - Custom datasets module with text support 31 | 32 | Author: Generated from what_is_in_checkpoint.ipynb 33 | Date: May 23, 2025 34 | """ 35 | 36 | import argparse 37 | import sys 38 | from datetime import datetime 39 | from pathlib import Path 40 | 41 | import torch 42 | from omegaconf import OmegaConf 43 | from hydra.utils import instantiate 44 | from torch.utils.data import DataLoader 45 | from loguru import logger 46 | 47 | # Import custom modules from the OPR library 48 | from opr.testers.place_recognition.model import ModelTester, RetrievalResultsCollection 49 | 50 | # Import text-enabled datasets from the installed mssplace package 51 | from mssplace.datasets import NCLTDatasetWithText, OxfordDatasetWithText 52 | 53 | 54 | # Configuration constants following the original notebook structure 55 | DATASET_CHOICES = ["oxford", "nclt"] 56 | 57 | MODEL_CHOICES = [ 58 | "mssplace-li", 59 | "mssplace-lis", 60 | "mssplace-lit", 61 | "mssplace-list", 62 | "mssplace-i", 63 | "minkloc-multimodal", 64 | "minkloc3dv2", 65 | ] 66 | 67 | MODEL_CONFIG_NAMES = { 68 | "mssplace-li": "mssplace-li.yaml", 69 | "mssplace-lis": "mssplace-lis.yaml", 70 | "mssplace-lit": "mssplace-lit.yaml", 71 | "mssplace-list": "mssplace-list.yaml", 72 | "mssplace-i": "mssplace-i.yaml", 73 | "minkloc-multimodal": "minkloc-multimodal.yaml", 74 | "minkloc3dv2": "minkloc3dv2.yaml", 75 | } 76 | 77 | CHECKPOINT_NAMES = { 78 | "oxford": { 79 | "mssplace-li": "oxford_mssplace_li.pth", 80 | "mssplace-lis": "oxford_mssplace_lis.pth", 81 | "mssplace-lit": "oxford_mssplace_lit.pth", 82 | "mssplace-list": "oxford_mssplace_list.pth", 83 | }, 84 | "nclt": { 85 | "mssplace-li": "nclt_mssplace_li.pth", 86 | "mssplace-lis": "nclt_mssplace_lis.pth", 87 | "mssplace-lit": "nclt_mssplace_lit.pth", 88 | "mssplace-list": "nclt_mssplace_list.pth", 89 | "mssplace-i": "nclt_mssplace_i.pth", 90 | "minkloc-multimodal": "nclt_minkloc_multimodal.pth", 91 | "minkloc3dv2": "nclt_minkloc3dv2.pth", 92 | }, 93 | } 94 | 95 | SENSOR_SETUPS = { 96 | "oxford": { 97 | "mssplace-li": [ 98 | "pointcloud_lidar", 99 | "image_stereo_centre", 100 | "image_mono_left", 101 | "image_mono_rear", 102 | "image_mono_right" 103 | ], 104 | "mssplace-lis": [ 105 | "pointcloud_lidar", 106 | "image_stereo_centre", 107 | "image_mono_left", 108 | "image_mono_rear", 109 | "image_mono_right", 110 | "mask_stereo_centre", 111 | "mask_mono_left", 112 | "mask_mono_rear", 113 | "mask_mono_right", 114 | ], 115 | "mssplace-lit": [ 116 | "pointcloud_lidar", 117 | "image_stereo_centre", 118 | "image_mono_left", 119 | "image_mono_rear", 120 | "image_mono_right", 121 | "text_stereo_centre", 122 | "text_mono_left", 123 | "text_mono_rear", 124 | "text_mono_right", 125 | ], 126 | "mssplace-list": [ 127 | "pointcloud_lidar", 128 | "image_stereo_centre", 129 | "image_mono_left", 130 | 
"image_mono_rear", 131 | "image_mono_right", 132 | "mask_stereo_centre", 133 | "mask_mono_left", 134 | "mask_mono_rear", 135 | "mask_mono_right", 136 | "text_stereo_centre", 137 | "text_mono_left", 138 | "text_mono_rear", 139 | "text_mono_right", 140 | ], 141 | }, 142 | "nclt": { 143 | "mssplace-li": [ 144 | "pointcloud_lidar", 145 | "image_Cam1", 146 | "image_Cam2", 147 | "image_Cam3", 148 | "image_Cam4", 149 | "image_Cam5" 150 | ], 151 | "mssplace-lis": [ 152 | "pointcloud_lidar", 153 | "image_Cam1", 154 | "image_Cam2", 155 | "image_Cam3", 156 | "image_Cam4", 157 | "image_Cam5", 158 | "mask_Cam1", 159 | "mask_Cam2", 160 | "mask_Cam3", 161 | "mask_Cam4", 162 | "mask_Cam5" 163 | ], 164 | "mssplace-lit": [ 165 | "pointcloud_lidar", 166 | "image_Cam1", 167 | "image_Cam2", 168 | "image_Cam3", 169 | "image_Cam4", 170 | "image_Cam5", 171 | "text_Cam1", 172 | "text_Cam2", 173 | "text_Cam3", 174 | "text_Cam4", 175 | "text_Cam5" 176 | ], 177 | "mssplace-list": [ 178 | "pointcloud_lidar", 179 | "image_Cam1", 180 | "image_Cam2", 181 | "image_Cam3", 182 | "image_Cam4", 183 | "image_Cam5", 184 | "mask_Cam1", 185 | "mask_Cam2", 186 | "mask_Cam3", 187 | "mask_Cam4", 188 | "mask_Cam5", 189 | "text_Cam1", 190 | "text_Cam2", 191 | "text_Cam3", 192 | "text_Cam4", 193 | "text_Cam5" 194 | ], 195 | "mssplace-i": [ 196 | "image_Cam1", 197 | "image_Cam2", 198 | "image_Cam3", 199 | "image_Cam4", 200 | "image_Cam5", 201 | ], 202 | "minkloc-multimodal": [ 203 | "pointcloud_lidar", 204 | "image_Cam5", 205 | ], 206 | "minkloc3dv2": [ 207 | "pointcloud_lidar", 208 | ], 209 | } 210 | } 211 | 212 | 213 | def setup_logging(verbose: bool = False) -> None: 214 | """ 215 | Configure loguru logging for the script. 216 | 217 | Args: 218 | verbose: If True, set logging level to DEBUG, otherwise INFO 219 | """ 220 | # Remove default logger 221 | logger.remove() 222 | 223 | # Configure loguru with appropriate level 224 | log_level = "DEBUG" if verbose else "INFO" 225 | logger.add( 226 | sys.stdout, 227 | level=log_level, 228 | format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}" 229 | ) 230 | 231 | 232 | def validate_paths(datasets_dir: Path, checkpoint_dir: Path, config_dir: Path) -> None: 233 | """ 234 | Validate that all required directories exist. 235 | 236 | Args: 237 | datasets_dir: Path to datasets directory 238 | checkpoint_dir: Path to checkpoints directory 239 | config_dir: Path to configs directory 240 | 241 | Raises: 242 | FileNotFoundError: If any required directory is missing 243 | """ 244 | if not datasets_dir.exists(): 245 | raise FileNotFoundError(f"Datasets directory does not exist: {datasets_dir}") 246 | if not checkpoint_dir.exists(): 247 | raise FileNotFoundError(f"Checkpoints directory does not exist: {checkpoint_dir}") 248 | if not config_dir.exists(): 249 | raise FileNotFoundError(f"Configs directory does not exist: {config_dir}") 250 | 251 | 252 | def get_dataset_path(dataset_name: str, datasets_dir: Path) -> Path: 253 | """ 254 | Get the specific dataset path based on dataset name. 
255 | 256 | Args: 257 | dataset_name: Name of the dataset ('oxford' or 'nclt') 258 | datasets_dir: Base datasets directory path 259 | 260 | Returns: 261 | Path to the specific dataset directory 262 | """ 263 | if dataset_name == "oxford": 264 | return datasets_dir / "pnvlad_oxford_robotcar" 265 | elif dataset_name == "nclt": 266 | return datasets_dir / "NCLT_preprocessed" 267 | else: 268 | raise ValueError(f"Unknown dataset: {dataset_name}") 269 | 270 | 271 | def load_checkpoint(checkpoint_path: Path, device: str = "cpu") -> dict: 272 | """ 273 | Load model checkpoint from file. 274 | 275 | Args: 276 | checkpoint_path: Path to checkpoint file 277 | device: Device to load checkpoint to 278 | 279 | Returns: 280 | Dictionary containing model state dict 281 | 282 | Raises: 283 | FileNotFoundError: If checkpoint file doesn't exist 284 | """ 285 | if not checkpoint_path.exists(): 286 | raise FileNotFoundError(f"Checkpoint not found at {checkpoint_path}") 287 | 288 | logger.info(f"Loading checkpoint from: {checkpoint_path}") 289 | checkpoint = torch.load(checkpoint_path, map_location=device) 290 | 291 | # Handle different checkpoint formats 292 | if "model_state_dict" in checkpoint: 293 | checkpoint = checkpoint["model_state_dict"] 294 | 295 | logger.info(f"Checkpoint loaded with {len(checkpoint.keys())} parameter groups") 296 | return checkpoint 297 | 298 | 299 | def load_model_config(config_path: Path) -> OmegaConf: 300 | """ 301 | Load model configuration from YAML file. 302 | 303 | Args: 304 | config_path: Path to configuration file 305 | 306 | Returns: 307 | OmegaConf configuration object 308 | 309 | Raises: 310 | FileNotFoundError: If config file doesn't exist 311 | """ 312 | if not config_path.exists(): 313 | raise FileNotFoundError(f"Config not found at {config_path}") 314 | 315 | logger.info(f"Loading config from: {config_path}") 316 | config = OmegaConf.load(config_path) 317 | 318 | # Log config details for reproducibility 319 | config_dict = OmegaConf.to_container(config, resolve=True) 320 | logger.debug(f"Model configuration: {config_dict}") 321 | 322 | return config 323 | 324 | 325 | def create_dataset(dataset_name: str, data_dir: Path, sensor_setup: list[str]) -> torch.utils.data.Dataset: 326 | """ 327 | Create dataset instance based on dataset name with specified sensor setup. 328 | 329 | Always uses text-enabled dataset classes (*DatasetWithText) to ensure 330 | compatibility with all model variants, including text-based models. 
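For example, "minkloc3dv2" on NCLT loads only ["pointcloud_lidar"], while "mssplace-list" additionally loads the image_*, mask_* and text_* entries listed in SENSOR_SETUPS above.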
331 | 332 | Args: 333 | dataset_name: Name of dataset ('oxford' or 'nclt') 334 | data_dir: Path to dataset directory 335 | sensor_setup: List of sensors/modalities to load 336 | 337 | Returns: 338 | Dataset instance ready for testing (text-enabled) 339 | """ 340 | logger.info(f"Creating {dataset_name} dataset from: {data_dir}") 341 | logger.debug(f"Sensor setup: {sensor_setup}") 342 | 343 | if dataset_name == "oxford": 344 | dataset = OxfordDatasetWithText( 345 | dataset_root=data_dir, 346 | subset="test", 347 | data_to_load=sensor_setup, 348 | pointcloud_quantization_size=0.01, 349 | ) 350 | elif dataset_name == "nclt": 351 | dataset = NCLTDatasetWithText( 352 | dataset_root=data_dir, 353 | subset="test", 354 | data_to_load=sensor_setup, 355 | ) 356 | else: 357 | raise ValueError(f"Unknown dataset: {dataset_name}") 358 | 359 | logger.info(f"Dataset created with {len(dataset)} samples") 360 | return dataset 361 | 362 | 363 | def evaluate_model( 364 | model: torch.nn.Module, 365 | dataloader: DataLoader, 366 | device: str, 367 | distance_threshold: float = 25.0, 368 | memory_batch_size: int | None = None, 369 | verbose: bool = True, 370 | ) -> tuple[dict[str, float], RetrievalResultsCollection]: 371 | """ 372 | Evaluate model performance using comprehensive ModelTester. 373 | 374 | This function leverages the advanced ModelTester class to provide detailed 375 | place recognition analysis beyond simple aggregate metrics. It supports 376 | memory-efficient evaluation and returns comprehensive results suitable 377 | for research analysis and reproducibility. 378 | 379 | Args: 380 | model: PyTorch model to evaluate 381 | dataloader: DataLoader for test data 382 | device: Device to run evaluation on 383 | distance_threshold: Distance threshold for positive matches (meters) 384 | memory_batch_size: If specified, compute distance matrix in batches 385 | to reduce peak memory usage. Useful for large datasets. 386 | verbose: Whether to show detailed progress information 387 | 388 | Returns: 389 | Tuple containing: 390 | - dict: Aggregate metrics (recall_at_n, recall_at_one_percent, etc.) 391 | - RetrievalResultsCollection: Detailed per-query results for analysis 392 | 393 | Note: 394 | The memory_batch_size parameter trades computation speed for memory 395 | efficiency. For datasets with >10k samples, consider using batch 396 | sizes of 1000-5000 depending on available RAM. 
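Example (illustrative sketch; assumes model and dataloader were prepared as in main() below):
    >>> metrics, results = evaluate_model(model, dataloader, device="cuda")
    >>> print(metrics["recall_at_one_percent"])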
397 | """ 398 | logger.info("Starting comprehensive model evaluation with ModelTester...") 399 | 400 | # Initialize ModelTester with memory-efficient settings 401 | tester = ModelTester( 402 | model=model, 403 | dataloader=dataloader, 404 | dist_thresh=distance_threshold, 405 | at_n=25, # Standard benchmark value 406 | device=device, 407 | verbose=verbose, 408 | batch_size=memory_batch_size, # Enable memory-efficient computation 409 | ) 410 | 411 | # Run comprehensive evaluation 412 | results_collection = tester.run() 413 | 414 | # Extract aggregate metrics for backward compatibility 415 | aggregate_metrics = results_collection.aggregate_metrics() 416 | 417 | # Convert to format expected by existing display logic 418 | recall_at_n_array = aggregate_metrics["recall_at_n"] 419 | recall_at_one_percent = aggregate_metrics["recall_at_one_percent"] 420 | 421 | # For top1_distance, use the aggregate value or compute if None 422 | top1_distance = aggregate_metrics.get("top1_distance") 423 | if top1_distance is None: 424 | # Fallback: compute mean embedding distance of correct top-1 matches 425 | top1_distances = [] 426 | for result in results_collection.results: 427 | if result.queries_with_matches > 0 and result.top1_distance is not None: 428 | top1_distances.append(result.top1_distance) 429 | top1_distance = sum(top1_distances) / len(top1_distances) if top1_distances else 0.0 430 | 431 | # Create backward-compatible metrics dict 432 | backward_compatible_metrics = { 433 | "recall_at_n": recall_at_n_array, 434 | "recall_at_one_percent": recall_at_one_percent, 435 | "mean_top1_descriptor_distance": top1_distance, 436 | } 437 | 438 | logger.info("Comprehensive model evaluation completed") 439 | logger.info(f"Processed {results_collection.num_pairs} track pairs with " 440 | f"{results_collection.num_queries} total queries") 441 | 442 | return backward_compatible_metrics, results_collection 443 | 444 | 445 | def save_evaluation_results( 446 | results_collection: RetrievalResultsCollection, 447 | dataset_name: str, 448 | model_name: str, 449 | results_dir: Path, 450 | additional_metadata: dict | None = None, 451 | ) -> Path: 452 | """ 453 | Save detailed evaluation results to disk for later analysis. 454 | 455 | Creates a structured filename and saves both the raw results collection 456 | and additional metadata for research reproducibility. 457 | 458 | Args: 459 | results_collection: Detailed results from ModelTester 460 | dataset_name: Name of the evaluated dataset 461 | model_name: Name of the evaluated model 462 | results_dir: Directory to save results 463 | additional_metadata: Optional dict with extra information to save 464 | 465 | Returns: 466 | Path to the saved results file 467 | 468 | Note: 469 | Results are saved in JSON format with timestamp for uniqueness. 470 | The file includes both detailed per-query results and aggregate metrics. 
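Example (illustrative; results is a RetrievalResultsCollection):
    >>> path = save_evaluation_results(results, "oxford", "mssplace-lit",
    ...                                Path("./evaluation_results"))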
471 | """ 472 | from datetime import datetime 473 | 474 | # Create results directory if it doesn't exist 475 | results_dir.mkdir(parents=True, exist_ok=True) 476 | 477 | # Generate timestamped filename 478 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 479 | filename = f"{dataset_name}_{model_name}_results_{timestamp}.json" 480 | results_path = results_dir / filename 481 | 482 | logger.info(f"Saving detailed evaluation results to: {results_path}") 483 | 484 | # Save the results collection (includes built-in JSON serialization) 485 | results_collection.save(str(results_path)) 486 | 487 | # If metadata provided, save it alongside 488 | if additional_metadata: 489 | metadata_path = results_dir / f"{dataset_name}_{model_name}_metadata_{timestamp}.json" 490 | import json 491 | with open(metadata_path, 'w') as f: 492 | json.dump(additional_metadata, f, indent=2) 493 | logger.info(f"Saved evaluation metadata to: {metadata_path}") 494 | 495 | return results_path 496 | 497 | 498 | def format_percentage(value: float) -> str: 499 | """ 500 | Format a decimal value as a percentage with 2 decimal places (truncated, not rounded). 501 | 502 | Args: 503 | value: Decimal value between 0 and 1 504 | 505 | Returns: 506 | Formatted percentage string 507 | """ 508 | # Truncate to 2 decimal places without rounding, as in original notebook 509 | integer_part = int(value * 100) 510 | decimal_part = int((value * 100) % 1 * 100) 511 | return f"{integer_part}.{decimal_part:02d}%" 512 | 513 | 514 | def main() -> None: 515 | """ 516 | Main function that orchestrates the checkpoint testing process. 517 | """ 518 | parser = argparse.ArgumentParser( 519 | description="Test MSSPlace model checkpoints on Oxford and NCLT datasets", 520 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 521 | ) 522 | 523 | parser.add_argument( 524 | "--dataset", 525 | type=str, 526 | choices=DATASET_CHOICES, 527 | required=True, 528 | help="Dataset to test on" 529 | ) 530 | 531 | parser.add_argument( 532 | "--model", 533 | type=str, 534 | choices=MODEL_CHOICES, 535 | required=True, 536 | help="Model variant to test" 537 | ) 538 | 539 | parser.add_argument( 540 | "--datasets-dir", 541 | type=Path, 542 | default=Path("/home/docker_mssplace/Datasets"), 543 | help="Path to datasets directory" 544 | ) 545 | 546 | parser.add_argument( 547 | "--checkpoints-dir", 548 | type=Path, 549 | default=Path(__file__).parent.parent.parent / "checkpoints", 550 | help="Path to checkpoints directory" 551 | ) 552 | 553 | parser.add_argument( 554 | "--configs-dir", 555 | type=Path, 556 | default=Path(__file__).parent.parent.parent / "configs" / "model", 557 | help="Path to model configs directory" 558 | ) 559 | 560 | parser.add_argument( 561 | "--batch-size", 562 | type=int, 563 | default=32, 564 | help="Batch size for evaluation" 565 | ) 566 | 567 | parser.add_argument( 568 | "--num-workers", 569 | type=int, 570 | default=4, 571 | help="Number of dataloader workers" 572 | ) 573 | 574 | parser.add_argument( 575 | "--distance-threshold", 576 | type=float, 577 | default=25.0, 578 | help="Distance threshold for positive matches (meters)" 579 | ) 580 | 581 | parser.add_argument( 582 | "--device", 583 | type=str, 584 | default="cuda" if torch.cuda.is_available() else "cpu", 585 | help="Device to run evaluation on" 586 | ) 587 | 588 | parser.add_argument( 589 | "--verbose", 590 | action="store_true", 591 | help="Enable verbose logging" 592 | ) 593 | 594 | parser.add_argument( 595 | "--memory-batch-size", 596 | type=int, 597 | default=None, 598 | 
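# Added note: None keeps the full distance matrix in memory; see the
# memory_batch_size parameter of evaluate_model() for the trade-off.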
help="Batch size for memory-efficient distance computation (reduces memory usage)" 599 | ) 600 | 601 | parser.add_argument( 602 | "--save-results", 603 | action="store_true", 604 | help="Save detailed evaluation results to JSON file for later analysis" 605 | ) 606 | 607 | parser.add_argument( 608 | "--results-dir", 609 | type=Path, 610 | default=Path("./evaluation_results"), 611 | help="Directory to save detailed evaluation results" 612 | ) 613 | 614 | args = parser.parse_args() 615 | 616 | # Setup logging 617 | setup_logging(args.verbose) 618 | logger.info(f"Starting checkpoint testing for {args.model} on {args.dataset}") 619 | 620 | try: 621 | # Validate all required paths exist 622 | validate_paths(args.datasets_dir, args.checkpoints_dir, args.configs_dir) 623 | 624 | # Get specific paths for this dataset/model combination 625 | checkpoint_name = CHECKPOINT_NAMES[args.dataset][args.model] 626 | config_name = MODEL_CONFIG_NAMES[args.model] 627 | sensor_setup = SENSOR_SETUPS[args.dataset][args.model] 628 | 629 | checkpoint_path = args.checkpoints_dir / checkpoint_name 630 | config_path = args.configs_dir / config_name 631 | data_dir = get_dataset_path(args.dataset, args.datasets_dir) 632 | 633 | # Load checkpoint and config 634 | checkpoint = load_checkpoint(checkpoint_path, device="cpu") 635 | config = load_model_config(config_path) 636 | 637 | # Initialize model and load weights 638 | logger.info("Initializing model...") 639 | model = instantiate(config) 640 | model.load_state_dict(checkpoint, strict=True) 641 | 642 | num_parameters = sum(p.numel() for p in model.parameters()) 643 | logger.info(f"Model loaded successfully with {num_parameters:,} parameters") 644 | 645 | # Create dataset and dataloader 646 | dataset = create_dataset(args.dataset, data_dir, sensor_setup) 647 | dataloader = DataLoader( 648 | dataset=dataset, 649 | batch_size=args.batch_size, 650 | shuffle=False, 651 | num_workers=args.num_workers, 652 | pin_memory=True if args.device == "cuda" else False, 653 | collate_fn=dataset.collate_fn, 654 | drop_last=False, 655 | ) 656 | 657 | # Evaluate model using comprehensive ModelTester 658 | metrics, results_collection = evaluate_model( 659 | model=model, 660 | dataloader=dataloader, 661 | device=args.device, 662 | distance_threshold=args.distance_threshold, 663 | memory_batch_size=args.memory_batch_size, 664 | verbose=args.verbose, 665 | ) 666 | 667 | # Extract metrics for display (backward compatibility) 668 | recall_at_n = metrics["recall_at_n"] 669 | recall_at_one_percent = metrics["recall_at_one_percent"] 670 | mean_top1_descriptor_distance = metrics["mean_top1_descriptor_distance"] 671 | 672 | # Optionally save detailed results for research analysis 673 | if args.save_results: 674 | evaluation_metadata = { 675 | "dataset": args.dataset, 676 | "model": args.model, 677 | "device": args.device, 678 | "distance_threshold": args.distance_threshold, 679 | "batch_size": args.batch_size, 680 | "num_workers": args.num_workers, 681 | "memory_batch_size": args.memory_batch_size, 682 | "model_parameters": num_parameters, 683 | "dataset_size": len(dataset), 684 | "evaluation_timestamp": datetime.now().isoformat(), 685 | } 686 | 687 | results_path = save_evaluation_results( 688 | results_collection=results_collection, 689 | dataset_name=args.dataset, 690 | model_name=args.model, 691 | results_dir=args.results_dir, 692 | additional_metadata=evaluation_metadata, 693 | ) 694 | 695 | logger.info(f"Detailed results saved for future analysis: {results_path}") 696 | 697 | # Display results 
with enhanced information 698 | print("\n" + "="*60) 699 | print("COMPREHENSIVE EVALUATION RESULTS") 700 | print("="*60) 701 | print(f"Dataset: {args.dataset}") 702 | print(f"Model: {args.model}") 703 | print(f"Device: {args.device}") 704 | print(f"Distance threshold: {args.distance_threshold}m") 705 | if args.memory_batch_size: 706 | print(f"Memory batch size: {args.memory_batch_size} (memory-efficient mode)") 707 | print("-"*60) 708 | 709 | # Traditional metrics (backward compatibility) 710 | print("PLACE RECOGNITION METRICS:") 711 | print(f" AR@1 = {format_percentage(recall_at_n[0])}") 712 | print(f" AR@1% = {format_percentage(recall_at_one_percent)}") 713 | print(f" Mean top-1 descriptor distance: {mean_top1_descriptor_distance:.6f}") 714 | 715 | # Enhanced insights from detailed analysis 716 | print("\nDETAILED ANALYSIS:") 717 | aggregate_metrics = results_collection.aggregate_metrics() 718 | print(f" Total track pairs evaluated: {results_collection.num_pairs}") 719 | print(f" Total query samples: {results_collection.num_queries}") 720 | print(f" Queries with ground truth matches: {aggregate_metrics['queries_with_matches']}") 721 | print(f" Overall accuracy (correct top-1): {format_percentage(aggregate_metrics['overall_accuracy'])}") 722 | 723 | # Additional recall metrics for research insight 724 | if len(recall_at_n) >= 5: 725 | print(f" AR@5 = {format_percentage(recall_at_n[4])}") 726 | if len(recall_at_n) >= 10: 727 | print(f" AR@10 = {format_percentage(recall_at_n[9])}") 728 | 729 | print("="*60) 730 | 731 | logger.info("Checkpoint testing completed successfully") 732 | 733 | except Exception as e: 734 | logger.error(f"Error during checkpoint testing: {e}") 735 | sys.exit(1) 736 | 737 | 738 | if __name__ == "__main__": 739 | main() 740 | -------------------------------------------------------------------------------- /src/mssplace/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmelekhin/MSSPlace/9ac48de75ee6a4ea01dac99d336d3a529bd73b61/src/mssplace/__init__.py -------------------------------------------------------------------------------- /src/mssplace/datasets.py: -------------------------------------------------------------------------------- 1 | """Datasets implementation.""" 2 | from pathlib import Path 3 | from typing import Any, Dict, List, Literal, Optional, Tuple, Union 4 | 5 | import cv2 6 | import MinkowskiEngine as ME # type: ignore 7 | import numpy as np 8 | import torch 9 | from torch import Tensor 10 | from opr.datasets.base import BasePlaceRecognitionDataset 11 | from opr.utils import cartesian_to_spherical 12 | 13 | 14 | def collate_data_dict( 15 | dataset: BasePlaceRecognitionDataset, data_list: List[Dict[str, Tensor]] 16 | ) -> Dict[str, Tensor]: 17 | """Pack input data list into batch.""" 18 | result: Dict[str, Tensor] = {} 19 | result["idxs"] = torch.stack([e["idx"] for e in data_list], dim=0) 20 | for data_key in data_list[0].keys(): 21 | if data_key == "idx": 22 | continue 23 | elif data_key == "utm": 24 | result["utms"] = torch.stack([e["utm"] for e in data_list], dim=0) 25 | elif data_key.startswith("image_"): 26 | result[f"images_{data_key[6:]}"] = torch.stack([e[data_key] for e in data_list]) 27 | elif data_key.startswith("mask_"): 28 | result[f"masks_{data_key[5:]}"] = torch.stack([e[data_key] for e in data_list]) 29 | elif data_key.startswith("text_"): 30 | result[f"texts_{data_key[5:]}"] = torch.stack([e[data_key] for e in data_list]) 31 | elif data_key == 
"pointcloud_lidar_coords": 32 | coords_list = [e["pointcloud_lidar_coords"] for e in data_list] 33 | feats_list = [e["pointcloud_lidar_feats"] for e in data_list] 34 | n_points = [int(e.shape[0]) for e in coords_list] 35 | coords_tensor = torch.cat(coords_list, dim=0).unsqueeze(0) # (1,batch_size*n_points,3) 36 | if dataset.pointcloud_set_transform is not None: 37 | # Apply the same transformation on all dataset elements 38 | coords_tensor = dataset.pointcloud_set_transform(coords_tensor) 39 | coords_list = torch.split(coords_tensor.squeeze(0), split_size_or_sections=n_points, dim=0) 40 | quantized_coords_list = [] 41 | quantized_feats_list = [] 42 | for coords, feats in zip(coords_list, feats_list): 43 | quantized_coords, quantized_feats = ME.utils.sparse_quantize( 44 | coordinates=coords, 45 | features=feats, 46 | quantization_size=dataset._pointcloud_quantization_size, 47 | ) 48 | quantized_coords_list.append(quantized_coords) 49 | quantized_feats_list.append(quantized_feats) 50 | 51 | result["pointclouds_lidar_coords"] = ME.utils.batched_coordinates(quantized_coords_list) 52 | result["pointclouds_lidar_feats"] = torch.cat(quantized_feats_list) 53 | elif data_key == "pointcloud_lidar_feats": 54 | continue 55 | else: 56 | raise ValueError(f"Unknown data key: {data_key!r}") 57 | return result 58 | 59 | 60 | class NCLTDatasetWithText(BasePlaceRecognitionDataset): 61 | """NCLT dataset implementation with text embeddings.""" 62 | 63 | _images_dirname: str 64 | _masks_dirname: str 65 | _pointclouds_dirname: str 66 | _pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]] 67 | _max_point_distance: Optional[float] 68 | _spherical_coords: bool 69 | _use_intensity_values: bool 70 | _valid_data: Tuple[str, ...] = ( 71 | "image_Cam0", 72 | "image_Cam1", 73 | "image_Cam2", 74 | "image_Cam3", 75 | "image_Cam4", 76 | "image_Cam5", 77 | "pointcloud_lidar", 78 | "mask_Cam0", 79 | "mask_Cam1", 80 | "mask_Cam2", 81 | "mask_Cam3", 82 | "mask_Cam4", 83 | "mask_Cam5", 84 | "text_Cam0", 85 | "text_Cam1", 86 | "text_Cam2", 87 | "text_Cam3", 88 | "text_Cam4", 89 | "text_Cam5", 90 | ) 91 | 92 | def __init__( 93 | self, 94 | dataset_root: Union[str, Path], 95 | subset: Literal["train", "val", "test"], 96 | data_to_load: Union[str, Tuple[str, ...]], 97 | positive_threshold: float = 10.0, 98 | negative_threshold: float = 50.0, 99 | images_dirname: str = "images_small", 100 | masks_dirname: str = "segmentation_masks_small", 101 | text_embeddings_dirname: str = "clip-vit-base-patch32", 102 | pointclouds_dirname: str = "velodyne_data", 103 | pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]] = 0.5, 104 | max_point_distance: Optional[float] = None, 105 | spherical_coords: bool = False, 106 | use_intensity_values: bool = False, 107 | image_transform: Optional[Any] = None, 108 | semantic_transform: Optional[Any] = None, 109 | pointcloud_transform: Optional[Any] = None, 110 | pointcloud_set_transform: Optional[Any] = None, 111 | ) -> None: 112 | """NCLT dataset implementation. 113 | 114 | Args: 115 | dataset_root (Union[str, Path]): Path to the dataset root directory. 116 | subset (Literal["train", "val", "test"]): Current subset to load. Defaults to "train". 117 | data_to_load (Union[str, Tuple[str, ...]]): The list of data to load. 118 | Check the documentation for the list of available data. 119 | positive_threshold (float): The UTM distance threshold value for positive samples. 120 | Defaults to 10.0. 
121 | negative_threshold (float): The UTM distance threshold value for negative samples. 122 | Defaults to 50.0. 123 | images_dirname (str): Images directory name. It should be specified explicitly 124 | if custom preprocessing was done. Defaults to "images_small". 125 | masks_dirname (str): Masks directory name. It should be specified explicitly 126 | if custom preprocessing was done. Defaults to "segmentation_masks_small". 127 | text_embeddings_dirname (str): Text embeddings directory name. Defaults to "clip-vit-base-patch32". 128 | pointclouds_dirname (str): Point clouds directory name. It should be specified 129 | explicitly if custom preprocessing was done. Defaults to "velodyne_data". 130 | pointcloud_quantization_size (float, optional): The quantization size for point clouds. 131 | Defaults to 0.5. 132 | max_point_distance (float, optional): The maximum distance of points from the origin. 133 | Defaults to None. 134 | spherical_coords (bool): Whether to use spherical coordinates for point clouds. 135 | Defaults to False. 136 | use_intensity_values (bool): Whether to use intensity values for point clouds. Defaults to False. 137 | image_transform (Any, optional): Images transform. If None, DefaultImageTransform will be used. 138 | Defaults to None. 139 | semantic_transform (Any, optional): Semantic masks transform. If None, DefaultSemanticTransform 140 | will be used. Defaults to None. 141 | pointcloud_transform (Any, optional): Point clouds transform. If None, DefaultCloudTransform 142 | will be used. Defaults to None. 143 | pointcloud_set_transform (Any, optional): Point clouds set transform. If None, 144 | DefaultCloudSetTransform will be used. Defaults to None. 145 | 146 | Raises: 147 | ValueError: If data_to_load contains invalid data source names. 148 | FileNotFoundError: If images, masks or pointclouds directory does not exist. 149 | """ 150 | super().__init__( 151 | dataset_root, 152 | subset, 153 | data_to_load, 154 | positive_threshold, 155 | negative_threshold, 156 | image_transform, 157 | semantic_transform, 158 | pointcloud_transform, 159 | pointcloud_set_transform, 160 | ) 161 | 162 | if subset == "test": 163 | self.dataset_df["in_query"] = True # for compatibility with Oxford Dataset 164 | 165 | if any(elem not in self._valid_data for elem in self.data_to_load): 166 | raise ValueError(f"Invalid data_to_load argument. Valid data list: {self._valid_data!r}") 167 | 168 | _track_name = self.dataset_df.iloc[0]["track"] 169 | 170 | if any(elem.startswith("image") for elem in self.data_to_load): 171 | self._images_dirname = images_dirname 172 | if not (self.dataset_root / _track_name / self._images_dirname).exists(): 173 | raise FileNotFoundError(f"Images directory {self._images_dirname!r} does not exist.") 174 | 175 | if any(elem.startswith("mask") for elem in self.data_to_load): 176 | self._masks_dirname = masks_dirname 177 | if not (self.dataset_root / _track_name / self._masks_dirname).exists(): 178 | raise FileNotFoundError(f"Masks directory {self._masks_dirname!r} does not exist.") 179 | 180 | if any(elem.startswith("text") for elem in self.data_to_load): 181 | self._text_embeddings_dirname = text_embeddings_dirname 182 | if not (self.dataset_root / _track_name / self._text_embeddings_dirname).exists(): 183 | raise FileNotFoundError( 184 | f"Text embeddings directory {self._text_embeddings_dirname!r} does not exist."
185 | ) 186 | 187 | if "pointcloud_lidar" in self.data_to_load: 188 | self._pointclouds_dirname = pointclouds_dirname 189 | if not (self.dataset_root / _track_name / self._pointclouds_dirname).exists(): 190 | raise FileNotFoundError( 191 | f"Pointclouds directory {self._pointclouds_dirname!r} does not exist." 192 | ) 193 | 194 | self._pointcloud_quantization_size = pointcloud_quantization_size 195 | self._max_point_distance = max_point_distance 196 | self._spherical_coords = spherical_coords 197 | self._use_intensity_values = use_intensity_values 198 | 199 | def __getitem__(self, idx: int) -> Dict[str, Tensor]: # noqa: D105 200 | row = self.dataset_df.iloc[idx] 201 | data = {"idx": torch.tensor(idx, dtype=int)} 202 | data["utm"] = torch.tensor(row[["northing", "easting"]].to_numpy(dtype=np.float64)) 203 | track_dir = self.dataset_root / str(row["track"]) 204 | 205 | for data_source in self.data_to_load: 206 | if data_source.startswith("image_"): 207 | cam_name = data_source[6:] # remove "image_" prefix 208 | image_ts = int(row["image"]) 209 | im_filepath = track_dir / self._images_dirname / f"{cam_name}" / f"{image_ts}.png" 210 | im = cv2.imread(str(im_filepath)) 211 | im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 212 | im = self.image_transform(im) 213 | data[data_source] = im 214 | elif data_source.startswith("mask_"): 215 | cam_name = data_source[5:] # remove "mask_" prefix 216 | image_ts = int(row["image"]) 217 | mask_filepath = track_dir / self._masks_dirname / f"{cam_name}" / f"{image_ts}.png" 218 | mask = cv2.imread(str(mask_filepath), cv2.IMREAD_UNCHANGED) 219 | mask = self.semantic_transform(mask) 220 | data[data_source] = mask 221 | elif data_source.startswith("text_"): 222 | cam_name = data_source[5:] # remove "text_" prefix 223 | image_ts = int(row["image"]) 224 | text_filepath = track_dir / self._text_embeddings_dirname / f"{cam_name}" / f"{image_ts}.pt" 225 | text_embedding = torch.load(text_filepath, map_location="cpu").squeeze() 226 | data[data_source] = text_embedding 227 | elif data_source == "pointcloud_lidar": 228 | pc_filepath = track_dir / self._pointclouds_dirname / f"{row['pointcloud']}.bin" 229 | pointcloud = self._load_pc(pc_filepath) 230 | data[f"{data_source}_coords"] = self.pointcloud_transform(pointcloud[:, :3]) 231 | if self._use_intensity_values: 232 | data[f"{data_source}_feats"] = pointcloud[:, 3].unsqueeze(1) 233 | else: 234 | data[f"{data_source}_feats"] = torch.ones_like(pointcloud[:, :1]) 235 | 236 | return data 237 | 238 | def _load_pc(self, filepath: Union[str, Path]) -> Tensor: 239 | if self._use_intensity_values: 240 | raise NotImplementedError("Intensity values are not supported yet.") 241 | pc = np.fromfile(filepath, dtype=np.float32).reshape(-1, 3) 242 | if self._max_point_distance is not None: 243 | pc = pc[np.linalg.norm(pc, axis=1) < self._max_point_distance] 244 | if self._spherical_coords: 245 | pc = cartesian_to_spherical(pc, dataset_name="nclt") 246 | pc_tensor = torch.tensor(pc, dtype=torch.float) 247 | return pc_tensor 248 | 249 | def collate_fn(self, data_list: List[Dict[str, Tensor]]) -> Dict[str, Tensor]: 250 | """Pack input data list into batch. 251 | 252 | Args: 253 | data_list (List[Dict[str, Tensor]]): batch data list generated by DataLoader. 254 | 255 | Returns: 256 | Dict[str, Tensor]: dictionary of batched data. 
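Example (a minimal sketch, assuming the dataset was created with data_to_load=("image_Cam1", "pointcloud_lidar"); names and sizes are purely illustrative):

    loader = DataLoader(dataset, batch_size=4, collate_fn=dataset.collate_fn)
    batch = next(iter(loader))
    sorted(batch.keys())
    # ['idxs', 'images_Cam1', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'utms']

Note that singular per-sample keys are pluralized in the batch (e.g. "image_Cam1" becomes "images_Cam1").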
257 | """ 258 | return collate_data_dict(self, data_list) 259 | 260 | 261 | class OxfordDatasetWithText(BasePlaceRecognitionDataset): 262 | """PointNetVLAD Oxford RobotCar dataset implementation with text embeddings.""" 263 | 264 | _images_dirname: str 265 | _masks_dirname: str 266 | _pointclouds_dirname: str 267 | _pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]] 268 | _max_point_distance: Optional[float] 269 | _spherical_coords: bool 270 | _valid_data: Tuple[str, ...] = ( 271 | "image_stereo_centre", 272 | "image_mono_left", 273 | "image_mono_rear", 274 | "image_mono_right", 275 | "pointcloud_lidar", 276 | "mask_stereo_centre", 277 | "mask_mono_left", 278 | "mask_mono_rear", 279 | "mask_mono_right", 280 | "text_stereo_centre", 281 | "text_mono_left", 282 | "text_mono_rear", 283 | "text_mono_right", 284 | ) 285 | 286 | def __init__( 287 | self, 288 | dataset_root: Union[str, Path], 289 | subset: Literal["train", "val", "test"], 290 | data_to_load: Union[str, Tuple[str, ...]], 291 | positive_threshold: float = 10.0, 292 | negative_threshold: float = 50.0, 293 | images_dirname: str = "images_small", 294 | masks_dirname: str = "segmentation_masks_small", 295 | text_embeddings_dirname: str = "clip-vit-base-patch32", 296 | pointclouds_dirname: Optional[str] = None, 297 | pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]] = 0.01, 298 | max_point_distance: Optional[float] = None, 299 | spherical_coords: bool = False, 300 | image_transform: Optional[Any] = None, 301 | semantic_transform: Optional[Any] = None, 302 | pointcloud_transform: Optional[Any] = None, 303 | pointcloud_set_transform: Optional[Any] = None, 304 | ) -> None: 305 | """Oxford RobotCar dataset implementation. 306 | 307 | Original dataset site: https://robotcar-dataset.robots.ox.ac.uk/ 308 | 309 | We use the preprocessed version of the dataset that was introduced 310 | in PointNetVLAD paper: https://arxiv.org/abs/1804.03492. 311 | 312 | Args: 313 | dataset_root (Union[str, Path]): Path to the dataset root directory. 314 | subset (Literal["train", "val", "test"]): Current subset to load. Defaults to "train". 315 | data_to_load (Union[str, Tuple[str, ...]]): The list of data to load. 316 | Check the documentation for the list of available data. 317 | positive_threshold (float): The UTM distance threshold value for positive samples. 318 | Defaults to 10.0. 319 | negative_threshold (float): The UTM distance threshold value for negative samples. 320 | Defaults to 50.0. 321 | images_dirname (str): Images directory name. It should be specified explicitly 322 | if custom preprocessing was done. Defaults to "images_small". 323 | masks_dirname (str): Masks directory name. It should be specified explicitly 324 | if custom preprocessing was done. Defaults to "segmentation_masks_small". 325 | text_embeddings_dirname (str): Text embeddings directory name. Defaults to "clip-vit-base-patch32". 326 | pointclouds_dirname (Optional[str]): Point clouds directory name. It should be specified 327 | explicitly if custom preprocessing was done. Defaults to None, which sets the dirnames 328 | like in original PointNetVLAD dataset configuration. 329 | pointcloud_quantization_size (float, optional): The quantization size for point clouds. 330 | Defaults to 0.01. 331 | max_point_distance (float, optional): The maximum distance of points from the origin. 332 | Defaults to None. 333 | spherical_coords (bool): Whether to use spherical coordinates for point clouds. 334 | Defaults to False. 
335 | image_transform (Any, optional): Images transform. If None, DefaultImageTransform will be used. 336 | Defaults to None. 337 | semantic_transform (Any, optional): Semantic masks transform. If None, DefaultSemanticTransform 338 | will be used. Defaults to None. 339 | pointcloud_transform (Any, optional): Point clouds transform. If None, DefaultCloudTransform 340 | will be used. Defaults to None. 341 | pointcloud_set_transform (Any, optional): Point clouds set transform. If None, 342 | DefaultCloudSetTransform will be used. Defaults to None. 343 | 344 | Raises: 345 | ValueError: If data_to_load contains invalid data source names. 346 | FileNotFoundError: If images, masks or pointclouds directory does not exist. 347 | """ 348 | super().__init__( 349 | dataset_root, 350 | subset, 351 | data_to_load, 352 | positive_threshold, 353 | negative_threshold, 354 | image_transform, 355 | semantic_transform, 356 | pointcloud_transform, 357 | pointcloud_set_transform, 358 | ) 359 | 360 | if any(elem not in self._valid_data for elem in self.data_to_load): 361 | raise ValueError(f"Invalid data_to_load argument. Valid data list: {self._valid_data!r}") 362 | 363 | _track_name = self.dataset_df.iloc[0]["track"] 364 | 365 | if any(elem.startswith("image") for elem in self.data_to_load): 366 | self._images_dirname = images_dirname 367 | if not (self.dataset_root / _track_name / self._images_dirname).exists(): 368 | raise FileNotFoundError(f"Images directory {self._images_dirname!r} does not exist.") 369 | 370 | if any(elem.startswith("mask") for elem in self.data_to_load): 371 | self._masks_dirname = masks_dirname 372 | if not (self.dataset_root / _track_name / self._masks_dirname).exists(): 373 | raise FileNotFoundError(f"Masks directory {self._masks_dirname!r} does not exist.") 374 | 375 | if any(elem.startswith("text") for elem in self.data_to_load): 376 | self._text_embeddings_dirname = text_embeddings_dirname 377 | if not (self.dataset_root / _track_name / self._text_embeddings_dirname).exists(): 378 | raise FileNotFoundError( 379 | f"Text embeddings directory {self._text_embeddings_dirname!r} does not exist." 380 | ) 381 | 382 | if "pointcloud_lidar" in self.data_to_load: 383 | if pointclouds_dirname is not None: 384 | self._pointclouds_dirname = pointclouds_dirname 385 | elif subset in ("train", "val"): 386 | self._pointclouds_dirname = "pointcloud_20m_10overlap" 387 | else: 388 | self._pointclouds_dirname = "pointcloud_20m" 389 | if not (self.dataset_root / _track_name / self._pointclouds_dirname).exists(): 390 | raise FileNotFoundError( 391 | f"Pointclouds directory {self._pointclouds_dirname!r} does not exist." 
392 | ) 393 | 394 | self._pointcloud_quantization_size = pointcloud_quantization_size 395 | self._max_point_distance = max_point_distance 396 | self._spherical_coords = spherical_coords 397 | 398 | def __getitem__(self, idx: int) -> Dict[str, Tensor]: # noqa: D105 399 | row = self.dataset_df.iloc[idx] 400 | data = {"idx": torch.tensor(idx, dtype=int)} 401 | data["utm"] = torch.tensor(row[["northing", "easting"]].to_numpy(dtype=np.float64)) 402 | track_dir = self.dataset_root / str(row["track"]) 403 | 404 | for data_source in self.data_to_load: 405 | if data_source.startswith("image_"): 406 | cam_name = data_source[6:] # remove "image_" prefix 407 | image_ts = int(row[cam_name]) 408 | im_filepath = track_dir / self._images_dirname / f"{cam_name}" / f"{image_ts}.png" 409 | im = cv2.imread(str(im_filepath)) 410 | im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 411 | im = self.image_transform(im) 412 | data[data_source] = im 413 | elif data_source.startswith("mask_"): 414 | cam_name = data_source[5:] # remove "mask_" prefix 415 | image_ts = int(row[cam_name]) 416 | mask_filepath = track_dir / self._masks_dirname / f"{cam_name}" / f"{image_ts}.png" 417 | mask = cv2.imread(str(mask_filepath), cv2.IMREAD_UNCHANGED) 418 | mask = self.semantic_transform(mask) 419 | data[data_source] = mask 420 | elif data_source.startswith("text_"): 421 | cam_name = data_source[5:] # remove "text_" prefix 422 | image_ts = int(row[cam_name]) 423 | text_filepath = track_dir / self._text_embeddings_dirname / f"{cam_name}" / f"{image_ts}.pt" 424 | text_embedding = torch.load(text_filepath, map_location="cpu").squeeze() 425 | data[data_source] = text_embedding 426 | elif data_source == "pointcloud_lidar": 427 | pc_filepath = track_dir / self._pointclouds_dirname / f"{row['pointcloud']}.bin" 428 | coords = self._load_pc(pc_filepath) 429 | coords = self.pointcloud_transform(coords) 430 | if self._spherical_coords: 431 | raise NotImplementedError("Spherical coords are not implemented yet.") 432 | data[f"{data_source}_coords"] = coords 433 | data[f"{data_source}_feats"] = torch.ones_like(coords[:, :1]) 434 | 435 | return data 436 | 437 | def _load_pc(self, filepath: Union[str, Path]) -> Tensor: 438 | pc = np.fromfile(filepath, dtype=np.float64).reshape(-1, 3) 439 | if self._max_point_distance is not None: 440 | pc = pc[np.linalg.norm(pc, axis=1) < self._max_point_distance] 441 | pc_tensor = torch.tensor(pc, dtype=torch.float) 442 | return pc_tensor 443 | 444 | def collate_fn(self, data_list: List[Dict[str, Tensor]]) -> Dict[str, Tensor]: 445 | """Pack input data list into batch. 446 | 447 | Args: 448 | data_list (List[Dict[str, Tensor]]): batch data list generated by DataLoader. 449 | 450 | Returns: 451 | Dict[str, Tensor]: dictionary of batched data. 
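Example (a minimal sketch, assuming "pointcloud_lidar" was requested in data_to_load; shapes are illustrative):

    loader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collate_fn)
    batch = next(iter(loader))
    # Point clouds are quantized per sample and packed in MinkowskiEngine format:
    batch["pointclouds_lidar_coords"]  # (N_total, 4) int tensor; column 0 is the batch index
    batch["pointclouds_lidar_feats"]   # (N_total, 1) dummy features of ones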
452 | """ 453 | return collate_data_dict(self, data_list) 454 | -------------------------------------------------------------------------------- /src/mssplace/modality_interaction_layers.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import MinkowskiEngine as ME # noqa: N817 4 | import torch 5 | from torch import nn, Tensor 6 | from opr.models.place_recognition.base import ImageModel, SemanticModel, CloudModel 7 | from opr.modules import Concat 8 | 9 | _modalities = ("image", "cloud", "semantic", "text") 10 | 11 | 12 | class LateFusionModel(nn.Module): 13 | """Meta-model for multimodal Place Recognition architectures with late fusion.""" 14 | 15 | def __init__( 16 | self, 17 | image_module: Optional[ImageModel] = None, 18 | semantic_module: Optional[SemanticModel] = None, 19 | cloud_module: Optional[CloudModel] = None, 20 | text_module: Optional[nn.Module] = None, 21 | soc_module: Optional[nn.Module] = None, 22 | fusion_module: Optional[nn.Module] = None, 23 | ) -> None: 24 | """Meta-model for multimodal Place Recognition architectures with late fusion. 25 | 26 | Args: 27 | image_module (ImageModule, optional): Image modality branch. Defaults to None. 28 | semantic_module (SemanticModel, optional): Semantic modality branch. Defaults to None. 29 | cloud_module (CloudModule, optional): Cloud modality branch. Defaults to None. 30 | soc_module (nn.Module, optional): Module to fuse different modalities. 31 | fusion_module (FusionModule, optional): Module to fuse different modalities. 32 | If None, will be set to opr.modules.Concat(). Defaults to None. 33 | """ 34 | super().__init__() 35 | 36 | self.image_module = image_module 37 | self.semantic_module = semantic_module 38 | self.cloud_module = cloud_module 39 | self.text_module = text_module 40 | self.soc_module = soc_module 41 | if fusion_module: 42 | self.fusion_module = fusion_module 43 | else: 44 | self.fusion_module = Concat() 45 | 46 | def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: # noqa: D102 47 | out_dict: dict[str, Tensor] = {} 48 | 49 | if self.image_module is not None: 50 | out_dict["image"] = self.image_module(batch)["final_descriptor"] 51 | 52 | if self.semantic_module is not None: 53 | out_dict["semantic"] = self.semantic_module(batch)["final_descriptor"] 54 | 55 | if self.cloud_module is not None: 56 | out_dict["cloud"] = self.cloud_module(batch)["final_descriptor"] 57 | 58 | if self.text_module is not None: 59 | out_dict["text"] = self.text_module(batch)["final_descriptor"] 60 | 61 | if self.soc_module is not None: 62 | out_dict["soc"] = self.soc_module(batch)["final_descriptor"] 63 | 64 | out_dict = self.fusion_module(out_dict) 65 | 66 | if not isinstance(out_dict, dict): 67 | out_dict = {"final_descriptor": out_dict} 68 | 69 | return out_dict 70 | 71 | 72 | class MiddleFusionModel(LateFusionModel): 73 | def __init__( 74 | self, 75 | image_module: Optional[ImageModel] = None, 76 | semantic_module: Optional[SemanticModel] = None, 77 | cloud_module: Optional[CloudModel] = None, 78 | soc_module: Optional[nn.Module] = None, 79 | fusion_module: Optional[nn.Module] = None, 80 | ) -> None: 81 | super().__init__(image_module, semantic_module, cloud_module, soc_module, fusion_module) 82 | self.cloud_dim_reduction = ME.MinkowskiAvgPooling(kernel_size=3, stride=3, dimension=3) 83 | self.final_fusion = Concat() 84 | 85 | def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: # noqa: D102 86 | ### step 1: feature extraction 87 | if 
self.image_module is not None: 88 | img_features = {} 89 | img_features_shapes = {} 90 | for key, value in batch.items(): 91 | if key.startswith("images_"): 92 | img_features[key] = self.image_module.backbone(value) 93 | img_features_shapes[key] = img_features[key].shape 94 | img_features[key] = ( 95 | img_features[key] 96 | .view(img_features[key].shape[0], img_features[key].shape[1], -1) 97 | .permute(0, 2, 1) 98 | ) # (B, N_feats, Desc_dim) 99 | if self.semantic_module is not None: 100 | semantic_features = {} 101 | semantic_features_shapes = {} 102 | for key, value in batch.items(): 103 | if key.startswith("masks_"): 104 | semantic_features[key] = self.semantic_module.backbone(value) 105 | semantic_features_shapes[key] = semantic_features[key].shape 106 | semantic_features[key] = ( 107 | semantic_features[key] 108 | .view(semantic_features[key].shape[0], semantic_features[key].shape[1], -1) 109 | .permute(0, 2, 1) 110 | ) # (B, N_feats, Desc_dim) 111 | if self.cloud_module is not None: 112 | sparse_voxel = ME.SparseTensor( 113 | features=batch["pointclouds_lidar_feats"], coordinates=batch["pointclouds_lidar_coords"] 114 | ) 115 | sparse_cloud_features = self.cloud_module.backbone(sparse_voxel) 116 | sparse_cloud_features = self.cloud_dim_reduction(sparse_cloud_features) 117 | # TODO: add text model 118 | 119 | ### step 2: transformer interaction 120 | tokens_dict = {} 121 | if self.image_module is not None: 122 | tokens_dict["image"] = torch.cat(list(img_features.values()), dim=1) 123 | if self.semantic_module is not None: 124 | tokens_dict["semantic"] = torch.cat(list(semantic_features.values()), dim=1) 125 | if self.cloud_module is not None: 126 | min_coordinate = torch.tensor( 127 | [ 128 | torch.min(sparse_cloud_features.C[:, 1]), 129 | torch.min(sparse_cloud_features.C[:, 2]), 130 | torch.min(sparse_cloud_features.C[:, 3]), 131 | ] 132 | ) 133 | dense_cloud_features, _, _ = sparse_cloud_features.dense(min_coordinate=min_coordinate) 134 | dense_cloud_shape = dense_cloud_features.shape 135 | dense_cloud_features = dense_cloud_features.view( 136 | dense_cloud_features.shape[0], dense_cloud_features.shape[1], -1 137 | ).permute(0, 2, 1) # (B, N_feats, Desc_dim) 138 | tokens_dict["cloud"] = dense_cloud_features 139 | tokens_dict = self.fusion_module(tokens_dict) 140 | 141 | ### step 3: back into initial states and finish processing 142 | out_dict = {} 143 | if self.image_module is not None: 144 | image_feat_lens = [s[-1] * s[-2] for s in img_features_shapes.values()] 145 | img_features_list = torch.split(tokens_dict["image"], image_feat_lens, dim=1) 146 | for key, feats in zip(list(img_features.keys()), img_features_list): 147 | img_features[key] = feats.permute(0, 2, 1).view(*img_features_shapes[key]) 148 | img_features[key] = self.image_module.head(img_features[key]) 149 | out_dict["image"] = self.image_module.fusion(img_features) 150 | if self.cloud_module is not None: 151 | dense_cloud_features = tokens_dict["cloud"].permute(0, 2, 1).view(*dense_cloud_shape) 152 | out_dict["cloud"] = self.cloud_module.head(ME.to_sparse(dense_cloud_features)) 153 | out_dict["final_descriptor"] = self.final_fusion(out_dict) 154 | return out_dict 155 | 156 | 157 | class TransformerModalityInteraction(nn.Module): 158 | def __init__( 159 | self, 160 | desc_dim: int = 256, 161 | image: bool = True, 162 | cloud: bool = True, 163 | semantic: bool = False, 164 | text: bool = False, 165 | use_modality_embeddings: bool = False, 166 | n_heads: int = 4, 167 | n_layers: int = 4, 168 | hidden_dim: int = 1024, 169 
| dropout: float = 0.0, 170 | activation: str = "gelu", 171 | ) -> None: 172 | super().__init__() 173 | 174 | self.use_modality_embeddings = use_modality_embeddings 175 | 176 | self.modalities = [] 177 | if image: 178 | self.modalities.append("image") 179 | if cloud: 180 | self.modalities.append("cloud") 181 | if semantic: 182 | self.modalities.append("semantic") 183 | if text: 184 | self.modalities.append("text") 185 | 186 | if self.use_modality_embeddings: 187 | self.modality_embeddings = nn.ParameterDict( 188 | { 189 | "image": nn.Parameter(torch.randn(desc_dim) * 0.02) if image else None, 190 | "cloud": nn.Parameter(torch.randn(desc_dim) * 0.02) if cloud else None, 191 | "semantic": nn.Parameter(torch.randn(desc_dim) * 0.02) if semantic else None, 192 | "text": nn.Parameter(torch.randn(desc_dim) * 0.02) if text else None, 193 | } 194 | ) 195 | 196 | transformer_encoder_layer = nn.TransformerEncoderLayer( 197 | d_model=desc_dim, 198 | nhead=n_heads, 199 | dim_feedforward=hidden_dim, 200 | dropout=dropout, 201 | activation=activation, 202 | batch_first=True, 203 | ) 204 | self.transformer_encoder = nn.TransformerEncoder( 205 | transformer_encoder_layer, num_layers=n_layers, enable_nested_tensor=False 206 | ) 207 | 208 | def forward(self, data: dict[str, Tensor]) -> dict[str, Tensor]: 209 | descriptors = [] 210 | 211 | for key in self.modalities: 212 | if self.use_modality_embeddings: 213 | descriptors.append(data[key] + self.modality_embeddings[key]) 214 | else: 215 | descriptors.append(data[key]) 216 | 217 | descriptors = torch.stack(descriptors, dim=1) 218 | # desc_lens = [d.shape[1] for d in descriptors] 219 | # descriptors = torch.cat(descriptors, dim=1) 220 | descriptors = torch.unbind(self.transformer_encoder(descriptors), dim=1) 221 | # descriptors = torch.split(self.transformer_encoder(descriptors), desc_lens, dim=1) 222 | out_dict = {} 223 | for i, key in enumerate(self.modalities): 224 | out_dict[key] = descriptors[i] 225 | out_dict["final_descriptor"] = torch.cat(descriptors, dim=-1) 226 | return out_dict 227 | 228 | 229 | class SelfAttentionModalityInteraction(nn.Module): 230 | def __init__( 231 | self, 232 | desc_dim: int = 256, 233 | image: bool = True, 234 | cloud: bool = True, 235 | semantic: bool = False, 236 | text: bool = False, 237 | use_modality_embeddings: bool = False, 238 | n_heads: int = 4, 239 | dropout: float = 0.0, 240 | ) -> None: 241 | super().__init__() 242 | 243 | self.use_modality_embeddings = use_modality_embeddings 244 | 245 | self.modalities = [] 246 | if image: 247 | self.modalities.append("image") 248 | if cloud: 249 | self.modalities.append("cloud") 250 | if semantic: 251 | self.modalities.append("semantic") 252 | if text: 253 | self.modalities.append("text") 254 | 255 | if self.use_modality_embeddings: 256 | self.modality_embeddings = nn.ParameterDict( 257 | { 258 | "image": nn.Parameter(torch.randn(desc_dim) * 0.02) if image else None, 259 | "cloud": nn.Parameter(torch.randn(desc_dim) * 0.02) if cloud else None, 260 | "semantic": nn.Parameter(torch.randn(desc_dim) * 0.02) if semantic else None, 261 | "text": nn.Parameter(torch.randn(desc_dim) * 0.02) if text else None, 262 | } 263 | ) 264 | 265 | self.self_attention = nn.MultiheadAttention( 266 | embed_dim=desc_dim, num_heads=n_heads, dropout=dropout, batch_first=True 267 | ) 268 | 269 | def forward(self, data: dict[str, Tensor]) -> dict[str, Tensor]: 270 | descriptors = [] 271 | 272 | for key in self.modalities: 273 | if self.use_modality_embeddings: 274 | descriptors.append(data[key] + self.modality_embeddings[key])
275 | else: 276 | descriptors.append(data[key]) 277 | 278 | # descriptors = torch.stack(descriptors, dim=1) 279 | # descriptors = torch.unbind(self.self_attention(descriptors, descriptors, descriptors)[0], dim=1) 280 | desc_lens = [d.shape[1] for d in descriptors] 281 | descriptors = torch.cat(descriptors, dim=1) 282 | descriptors = torch.split( 283 | self.self_attention(descriptors, descriptors, descriptors, need_weights=False)[0], 284 | desc_lens, 285 | dim=1, 286 | ) 287 | out_dict = {} 288 | for i, key in enumerate(self.modalities): 289 | out_dict[key] = descriptors[i] 290 | # out_dict["final_descriptor"] = torch.cat(descriptors, dim=-1) 291 | return out_dict 292 | 293 | 294 | class CrossAttentionModalityInteraction(nn.Module): 295 | def __init__( 296 | self, 297 | desc_dim: int = 256, 298 | image: bool = True, 299 | cloud: bool = True, 300 | semantic: bool = False, 301 | text: bool = False, 302 | use_modality_embeddings: bool = False, 303 | n_heads: int = 4, 304 | dropout: float = 0.0, 305 | ) -> None: 306 | super().__init__() 307 | 308 | self.use_modality_embeddings = use_modality_embeddings 309 | 310 | self.modalities = [] 311 | if image: 312 | self.modalities.append("image") 313 | if cloud: 314 | self.modalities.append("cloud") 315 | if semantic: 316 | self.modalities.append("semantic") 317 | if text: 318 | self.modalities.append("text") 319 | 320 | if self.use_modality_embeddings: 321 | self.modality_embeddings = nn.ParameterDict( 322 | { 323 | "image": nn.Parameter(torch.randn(desc_dim) * 0.02) if image else None, 324 | "cloud": nn.Parameter(torch.randn(desc_dim) * 0.02) if cloud else None, 325 | "semantic": nn.Parameter(torch.randn(desc_dim) * 0.02) if semantic else None, 326 | "text": nn.Parameter(torch.randn(desc_dim) * 0.02) if text else None, 327 | } 328 | ) 329 | 330 | self.cross_attn_dict = nn.ModuleDict({}) 331 | for key in self.modalities: 332 | self.cross_attn_dict[key] = nn.MultiheadAttention( 333 | embed_dim=desc_dim, num_heads=n_heads, dropout=dropout, batch_first=True 334 | ) 335 | 336 | def forward(self, data: dict[str, Tensor]) -> dict[str, Tensor]: 337 | out_dict = {} 338 | 339 | for query_modality in self.modalities: 340 | query = data[query_modality].unsqueeze(1) 341 | if self.use_modality_embeddings: 342 | query = query + self.modality_embeddings[query_modality] # out-of-place: unsqueeze(1) returns a view, so "+=" would mutate data[query_modality] 343 | 344 | # Prepare keys and values from other modalities 345 | keys = [] 346 | for key_modality in self.modalities: 347 | if key_modality != query_modality: 348 | key_value = data[key_modality] 349 | if self.use_modality_embeddings: 350 | key_value = key_value + self.modality_embeddings[key_modality] # out-of-place to keep the input tensors unchanged across iterations 351 | keys.append(key_value) 352 | # Stack keys and values from all other modalities 353 | keys = values = torch.stack(keys, dim=1) 354 | 355 | # Apply cross-attention 356 | attn_output, _ = self.cross_attn_dict[query_modality](query=query, key=keys, value=values) 357 | out_dict[query_modality] = attn_output.squeeze(1) # (B, 1, D) -> (B, D) 358 | 359 | out_dict["final_descriptor"] = torch.cat(list(out_dict.values()), dim=-1) 360 | 361 | return out_dict 362 | -------------------------------------------------------------------------------- /src/mssplace/models.py: -------------------------------------------------------------------------------- 1 | """Models implementation.""" 2 | from typing import Dict, Optional 3 | 4 | import torch 5 | from torch import Tensor, nn 6 | from opr.modules import Concat 7 | from opr.modules.gem import SeqGeM 8 | 9 | 10 | class GeMMultiFeatureMapsFusion(nn.Module): 11 | """GeM fusion module for multiple 2D feature maps.""" 12 | 13 |
def __init__(self, p: int = 3, eps: float = 1e-6) -> None: 14 | """Generalized-Mean fusion module. 15 | 16 | Args: 17 | p (int): Initial value of learnable parameter 'p', see paper for more details. Defaults to 3. 18 | eps (float): Negative values will be clamped to `eps` (ReLU). Defaults to 1e-6. 19 | """ 20 | super().__init__() 21 | self.gem = SeqGeM(p=p, eps=eps) 22 | 23 | def forward(self, data: Dict[str, Tensor]) -> Tensor: # noqa: D102 24 | data = {key: value for key, value in data.items() if value is not None} 25 | features = list(data.values()) 26 | features = [f.view(f.shape[0], f.shape[1], -1) for f in features] 27 | features = torch.cat(features, dim=-1) 28 | out = self.gem(features) 29 | if len(out.shape) == 1: 30 | out = out.unsqueeze(0) 31 | return out 32 | 33 | 34 | class TextModel(nn.Module): 35 | """Meta-model for text-based Place Recognition.""" 36 | 37 | def __init__( 38 | self, 39 | model: nn.Module, 40 | fusion: Optional[nn.Module] = None, 41 | ) -> None: 42 | """Meta-model for text-based Place Recognition. 43 | 44 | Args: 45 | model (nn.Module): Text backbone. 46 | fusion (nn.Module, optional): Module to fuse descriptors for multiple texts in batch. 47 | Defaults to None. 48 | """ 49 | super().__init__() 50 | self.model = model 51 | self.fusion = fusion 52 | 53 | def forward(self, batch: Dict[str, Tensor]) -> Dict[str, Tensor]: # noqa: D102 54 | text_descriptors = {} 55 | for key, value in batch.items(): 56 | if key.startswith("texts_"): 57 | text_descriptors[key] = self.model(value) 58 | if len(text_descriptors) > 1: 59 | if self.fusion is None: 60 | raise ValueError("Fusion module is not defined but multiple texts are provided") 61 | descriptor = self.fusion(text_descriptors) 62 | else: 63 | if self.fusion is not None: 64 | raise ValueError("Fusion module is defined but only one text is provided") 65 | descriptor = list(text_descriptors.values())[0] 66 | out_dict: Dict[str, Tensor] = {"final_descriptor": descriptor} 67 | return out_dict 68 | 69 | 70 | class LateFusionModel(nn.Module): 71 | """Meta-model for multimodal Place Recognition architectures with late fusion.""" 72 | 73 | def __init__( 74 | self, 75 | image_module: Optional[nn.Module] = None, 76 | semantic_module: Optional[nn.Module] = None, 77 | cloud_module: Optional[nn.Module] = None, 78 | text_module: Optional[nn.Module] = None, 79 | fusion_module: Optional[nn.Module] = None, 80 | ) -> None: 81 | """Meta-model for multimodal Place Recognition architectures with late fusion. 82 | 83 | Args: 84 | image_module (nn.Module, optional): Image modality branch. Defaults to None. 85 | semantic_module (nn.Module, optional): Semantic modality branch. Defaults to None. 86 | cloud_module (nn.Module, optional): Cloud modality branch. Defaults to None. 87 | text_module (nn.Module, optional): Text modality branch. Defaults to None. 88 | fusion_module (nn.Module, optional): Module to fuse different modalities. 89 | If None, will be set to opr.modules.Concat(). Defaults to None. 
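Example (a minimal sketch; image_branch and cloud_branch are hypothetical modules that each return {"final_descriptor": Tensor of shape (B, D)}):

    model = LateFusionModel(image_module=image_branch, cloud_module=cloud_branch)
    out = model(batch)
    out["final_descriptor"]  # (B, D_image + D_cloud) with the default Concat fusion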
90 | """ 91 | super().__init__() 92 | 93 | self.image_module = image_module 94 | self.semantic_module = semantic_module 95 | self.cloud_module = cloud_module 96 | self.text_module = text_module 97 | if fusion_module: 98 | self.fusion_module = fusion_module 99 | else: 100 | self.fusion_module = Concat() 101 | 102 | def forward(self, batch: Dict[str, Tensor]) -> Dict[str, Tensor]: # noqa: D102 103 | out_dict: Dict[str, Tensor] = {} 104 | 105 | if self.image_module is not None: 106 | out_dict["image"] = self.image_module(batch)["final_descriptor"] 107 | 108 | if self.semantic_module is not None: 109 | out_dict["semantic"] = self.semantic_module(batch)["final_descriptor"] 110 | 111 | if self.cloud_module is not None: 112 | out_dict["cloud"] = self.cloud_module(batch)["final_descriptor"] 113 | 114 | if self.text_module is not None: 115 | out_dict["text"] = self.text_module(batch)["final_descriptor"] 116 | 117 | out_dict["final_descriptor"] = self.fusion_module(out_dict) 118 | 119 | return out_dict 120 | -------------------------------------------------------------------------------- /train_unimodal.py: -------------------------------------------------------------------------------- 1 | """Script to train a single-modal Place Recognition model.""" 2 | import logging 3 | import pprint 4 | import sys 5 | from datetime import datetime 6 | from pathlib import Path 7 | from typing import Dict, Literal 8 | 9 | import hydra 10 | import wandb 11 | from hydra.utils import instantiate 12 | from loguru import logger 13 | from omegaconf import DictConfig, OmegaConf 14 | from torch.utils.data import DataLoader 15 | 16 | from opr.datasets.dataloader_factory import make_dataloaders 17 | from opr.trainers.place_recognition import UnimodalPlaceRecognitionTrainer 18 | from opr.utils import set_seed 19 | 20 | REPO_ROOT = Path(__file__).resolve().parent 21 | 22 | @hydra.main(config_path="configs", config_name="train_unimodal", version_base=None) 23 | def main(cfg: DictConfig) -> None: 24 | """Training code. 
25 | 26 | Args: 27 | cfg (DictConfig): Hydra config to train with. 28 | """ 29 | config_dict = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 30 | logger.info(f"Config:\n{pprint.pformat(config_dict, compact=True)}") 31 | 32 | if not cfg.debug and not cfg.wandb.disabled: 33 | # reuse the already-resolved config_dict for the wandb run config 34 | wandb.init( 35 | dir=hydra.core.hydra_config.HydraConfig.get().runtime.output_dir, 36 | name=cfg.exp_name, 37 | project=cfg.wandb.project, 38 | settings=wandb.Settings(start_method="thread"), 39 | config=config_dict, 40 | ) 41 | logger.debug(f"Initialized wandb run with name: {wandb.run.name}") 42 | 43 | logger.info(f"Output directory: {hydra.core.hydra_config.HydraConfig.get().runtime.output_dir}") 44 | checkpoints_dir = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir) / "checkpoints" 45 | if not checkpoints_dir.exists(): 46 | checkpoints_dir.mkdir(parents=True) 47 | 48 | set_seed(seed=cfg.seed, make_deterministic=False) 49 | logger.info(f"=> Seed: {cfg.seed}") 50 | 51 | logger.debug("=> Instantiating model...") 52 | model = instantiate(cfg.model) 53 | 54 | logger.debug("=> Instantiating loss...") 55 | loss_fn = instantiate(cfg.loss) 56 | 57 | logger.debug("=> Making dataloaders...") 58 | dataloaders: Dict[Literal["train", "val", "test"], DataLoader] = make_dataloaders( 59 | dataset_cfg=cfg.dataset, 60 | batch_sampler_cfg=cfg.sampler, 61 | num_workers=cfg.num_workers, 62 | ) 63 | 64 | logger.debug("=> Instantiating optimizer...") 65 | optimizer = instantiate(cfg.optimizer, params=model.parameters()) 66 | logger.debug("=> Instantiating scheduler...") 67 | scheduler = instantiate(cfg.scheduler, optimizer=optimizer) 68 | 69 | logger.debug("=> Instantiating trainer...") 70 | trainer = UnimodalPlaceRecognitionTrainer( 71 | checkpoints_dir=checkpoints_dir, 72 | model=model, 73 | loss_fn=loss_fn, 74 | optimizer=optimizer, 75 | scheduler=scheduler, 76 | batch_expansion_threshold=cfg.batch_expansion_threshold, 77 | wandb_log=(not cfg.debug and not cfg.wandb.disabled), 78 | device=cfg.device, 79 | ) 80 | 81 | logger.info(f"=====> {trainer.__class__.__name__} is ready, starting training for {cfg.epochs} epochs.") 82 | 83 | trainer.train( 84 | epochs=cfg.epochs, 85 | train_dataloader=dataloaders["train"], 86 | val_dataloader=dataloaders["val"], 87 | test_dataloader=dataloaders["test"], 88 | ) 89 | 90 | 91 | if __name__ == "__main__": 92 | run_dir = REPO_ROOT / "outputs" / (r"${exp_name}" + f"_{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}") # "${exp_name}" stays literal so Hydra interpolates it against the config at runtime 93 | sys.argv.append(f"hydra.run.dir={run_dir}") 94 | main() 95 | --------------------------------------------------------------------------------
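To make the interaction modules in src/mssplace/modality_interaction_layers.py easier to reproduce, here is a minimal, self-contained usage sketch (not part of the repository); the batch size and descriptor dimension are arbitrary assumptions, and running it requires the repository's dependencies (including MinkowskiEngine and the opr package) plus the src/ package on the path:

```python
import torch

from mssplace.modality_interaction_layers import TransformerModalityInteraction

# Hypothetical 256-d global descriptors produced by two late-fusion branches.
descriptors = {
    "image": torch.randn(4, 256),  # (batch_size, desc_dim)
    "cloud": torch.randn(4, 256),
}

# Jointly refine the per-modality descriptors with a transformer encoder.
fusion = TransformerModalityInteraction(desc_dim=256, image=True, cloud=True)
out = fusion(descriptors)

print(out["image"].shape)             # torch.Size([4, 256]) - refined image descriptor
print(out["final_descriptor"].shape)  # torch.Size([4, 512]) - concatenation of both
```

The same dict-in/dict-out contract is shared by SelfAttentionModalityInteraction and CrossAttentionModalityInteraction, which is what lets them be swapped in as the fusion_module of the late- and middle-fusion meta-models.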