├── .github
│   └── copilot-instructions.md
├── .gitignore
├── .gitmodules
├── README.md
├── checkpoints
│   └── .gitkeep
├── configs
│   ├── dataset
│   │   ├── nclt
│   │   │   ├── all_camera_lidar.yaml
│   │   │   ├── all_camera_semantic.yaml
│   │   │   ├── all_camera_semantic_lidar.yaml
│   │   │   ├── all_camera_semantic_text.yaml
│   │   │   ├── all_camera_semantic_text_lidar.yaml
│   │   │   ├── all_camera_text.yaml
│   │   │   ├── all_camera_text_lidar.yaml
│   │   │   ├── camera1.yaml
│   │   │   ├── camera1_lidar.yaml
│   │   │   ├── camera2-front-back.yaml
│   │   │   ├── camera2-left-right.yaml
│   │   │   ├── camera5.yaml
│   │   │   ├── lidar.yaml
│   │   │   ├── semantic1.yaml
│   │   │   ├── semantic2-front-back.yaml
│   │   │   ├── semantic2-left-right.yaml
│   │   │   ├── semantic5.yaml
│   │   │   ├── text1_clip-base.yaml
│   │   │   ├── text1_clip-large.yaml
│   │   │   ├── text1_tfidf.yaml
│   │   │   ├── text2-front-back_clip-base.yaml
│   │   │   ├── text2-front-back_clip-large.yaml
│   │   │   ├── text2-front-back_tfidf.yaml
│   │   │   ├── text2-left-right_clip-base.yaml
│   │   │   ├── text2-left-right_clip-large.yaml
│   │   │   ├── text2-left-right_tfidf.yaml
│   │   │   ├── text5_clip-base.yaml
│   │   │   ├── text5_clip-large.yaml
│   │   │   └── text5_tfidf.yaml
│   │   └── oxford
│   │       ├── all_camera_lidar.yaml
│   │       ├── all_camera_semantic.yaml
│   │       ├── all_camera_semantic_lidar.yaml
│   │       ├── all_camera_semantic_text.yaml
│   │       ├── all_camera_semantic_text_lidar.yaml
│   │       ├── all_camera_text.yaml
│   │       ├── all_camera_text_lidar.yaml
│   │       ├── camera1.yaml
│   │       ├── camera1_lidar.yaml
│   │       ├── camera2-front-back.yaml
│   │       ├── camera2-left-right.yaml
│   │       ├── camera4.yaml
│   │       ├── lidar.yaml
│   │       ├── semantic1.yaml
│   │       ├── semantic2-front-back.yaml
│   │       ├── semantic2-left-right.yaml
│   │       ├── semantic4.yaml
│   │       ├── text1_clip-base.yaml
│   │       ├── text1_clip-large.yaml
│   │       ├── text1_tfidf.yaml
│   │       ├── text2-front-back_clip-base.yaml
│   │       ├── text2-front-back_clip-large.yaml
│   │       ├── text2-front-back_tfidf.yaml
│   │       ├── text2-left-right_clip-base.yaml
│   │       ├── text2-left-right_clip-large.yaml
│   │       ├── text2-left-right_tfidf.yaml
│   │       ├── text4_clip-base.yaml
│   │       ├── text4_clip-large.yaml
│   │       └── text4_tfidf.yaml
│   ├── loss
│   │   └── batch_hard_triplet_margin.yaml
│   ├── model
│   │   ├── camera1.yaml
│   │   ├── camera2_add.yaml
│   │   ├── camera2_concat.yaml
│   │   ├── camera2_gem.yaml
│   │   ├── camera2_mlp-full.yaml
│   │   ├── camera2_mlp-half.yaml
│   │   ├── camera2_sa-add.yaml
│   │   ├── camera2_sa-concat.yaml
│   │   ├── convnext_camera1.yaml
│   │   ├── convnext_semantic1.yaml
│   │   ├── lidar.yaml
│   │   ├── minkloc-multimodal.yaml
│   │   ├── minkloc3dv2.yaml
│   │   ├── mssplace-i.yaml
│   │   ├── mssplace-li.yaml
│   │   ├── mssplace-lis.yaml
│   │   ├── mssplace-list.yaml
│   │   ├── mssplace-lit.yaml
│   │   ├── semantic1.yaml
│   │   ├── semantic2_add.yaml
│   │   ├── semantic2_concat.yaml
│   │   ├── semantic2_mlp-full.yaml
│   │   ├── semantic2_mlp-half.yaml
│   │   ├── semantic2_sa-add.yaml
│   │   ├── semantic2_sa-concat.yaml
│   │   ├── text1_clip-base-mlp.yaml
│   │   ├── text1_clip-large-mlp.yaml
│   │   ├── text1_tfidf-mlp.yaml
│   │   ├── text2_clip-base-mlp-add.yaml
│   │   ├── text2_clip-base-mlp-concat.yaml
│   │   ├── text2_clip-large-mlp-add.yaml
│   │   ├── text2_clip-large-mlp-concat.yaml
│   │   ├── text2_tfidf-mlp-add.yaml
│   │   └── text2_tfidf-mlp-concat.yaml
│   ├── optimizer
│   │   └── adam.yaml
│   ├── sampler
│   │   └── batch_sampler.yaml
│   ├── scheduler
│   │   └── multi_step.yaml
│   └── train_unimodal.yaml
├── docker
│   ├── Dockerfile.cuda
│   ├── build.sh
│   ├── into.sh
│   └── start.sh
├── images
│   └── mssplace_overview.jpg
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── scripts
│   └── evaluation
│       ├── evaluate_checkpoints.py
│       └── failure_cases.ipynb
├── src
│   └── mssplace
│       ├── __init__.py
│       ├── datasets.py
│       ├── modality_interaction_layers.py
│       └── models.py
└── train_unimodal.py

--------------------------------------------------------------------------------
/.github/copilot-instructions.md:
--------------------------------------------------------------------------------
---
applyTo: '**'
---

# Project-Specific Guidance – Research Reproducibility

## 🧪 Research Code Clarity

- Prioritize code **readability and clarity** over performance.
- Use **explicit, well-named variables** and **clear control flow**.

## 🧾 Reproducibility and Documentation

- Ensure all code is **well-documented** with in-line comments and Google-style docstrings.
- Write **comprehensive Markdown instructions** for reproducing experiments.

## 🔬 PyTorch and Python Modernity

- Use **Python 3.10+** and **PyTorch 2.1+** features idiomatically.
- Prefer built-in types and modern syntax for clarity.

## Goal

Support reproducible research with clean, understandable code and thorough documentation.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
checkpoints/*
!checkpoints/.gitkeep
outputs/
.ruff_cache/

# Development and organization files (keep local, don't publish)
work_in_progress/

### Defaults for Python:

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "third_party/OpenPlaceRecognition"]
	path = third_party/OpenPlaceRecognition
	url = https://github.com/alexmelekhin/OpenPlaceRecognition.git

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MSSPlace: Multi-Sensor Place Recognition with Visual and Text Semantics

This repository contains the code for the paper "MSSPlace: Multi-Sensor Place Recognition with Visual and Text Semantics".

![mssplace_overview](./images/mssplace_overview.jpg)

_High-level overview of the proposed multimodal MSSPlace method. The MSSPlace model has a modular architecture consisting of four branches: the Image Encoder, Semantic Masks Encoder, Text Encoder, and Point Cloud Encoder. Each branch encodes its input data into a descriptor that captures the essential information of its modality. A descriptor aggregation step then combines these individual descriptors into a global place descriptor that represents the comprehensive characteristics of the vehicle's location._

## Installation

Initialize submodules and build the Docker environment:

```bash
git submodule update --init --recursive
bash docker/build.sh
bash docker/start.sh [DATASETS_DIR]  # DATASETS_DIR will be mounted at /home/docker_mssplace/Datasets
bash docker/into.sh
```

### Package Installation

Install the MSSPlace package in editable mode for development:

```bash
pip install -e .
```

This installs the package from the `src/mssplace` directory, allowing you to import `mssplace` modules directly without path modifications.

## Quick Start

Evaluate pre-trained models on the Oxford RobotCar or NCLT datasets:

```bash
# Download checkpoints and datasets first (see sections below)
python scripts/evaluation/evaluate_checkpoints.py --dataset oxford --model mssplace-li
python scripts/evaluation/evaluate_checkpoints.py --dataset nclt --model mssplace-list --verbose
```

## Evaluation

### Performance Metrics

- **AR@1**: Average recall (%) when only the top-1 retrieved match is considered
- **AR@1%**: Average recall (%) when the top 1% of the database is considered as potential matches

An illustrative sketch of how these metrics are computed is given in the "Computing the Metrics" subsection below.

### Model Variants

| Model | Modalities | AR@1 (Oxford) | AR@1% (Oxford) | AR@1 (NCLT) | AR@1% (NCLT) | Description |
|-------|------------|---------------|----------------|-------------|--------------|-------------|
| `mssplace-li` | LiDAR + Images | 98.21% | 99.53% | 94.67% | 97.72% | Basic multimodal |
| `mssplace-lis` | LiDAR + Images + Semantics | **98.55%** | **99.64%** | **95.37%** | **97.84%** | Adds semantic segmentation |
| `mssplace-lit` | LiDAR + Images + Text | 98.22% | 99.53% | 92.36% | 96.51% | Adds text descriptions |
| `mssplace-list` | LiDAR + Images + Semantics + Text | **98.55%** | **99.64%** | 94.15% | 96.97% | Complete multimodal |

*Performance metrics measured on the Oxford RobotCar and NCLT datasets. Best results per dataset are highlighted in bold.*

**Key Insights:**
- `mssplace-lis` achieves the best performance on NCLT, while both `mssplace-lis` and `mssplace-list` tie for best on Oxford
- Semantic segmentation consistently helps place recognition on both datasets
- The text modality shows dataset-dependent behavior: it hurts performance on NCLT but is roughly neutral on Oxford
- The Oxford dataset appears easier than NCLT (all models achieve >98% AR@1, versus 92–95% on NCLT)
- The complete multimodal `mssplace-list` performs well but does not consistently exceed the semantic-only variant (`mssplace-lis`)

### Pre-trained Checkpoints

⚠️ **Work in Progress**: Checkpoint download links will be updated soon. Please check back later for access to pre-trained models.

### Datasets

⚠️ **Work in Progress**: Preprocessed datasets will be made publicly available for download soon. Please check back later for dataset access.
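
### Computing the Metrics (Illustrative)

The snippet below is a minimal sketch of how AR@1 and AR@1% can be computed from global descriptors. It is illustrative only, not the exact code used in `scripts/evaluation/evaluate_checkpoints.py`, and it assumes `positives[i]` (the set of database indices that are true matches for query `i`, e.g. within the 10 m positive threshold used in the dataset configs) has been precomputed.

```python
import numpy as np


def average_recall(
    query_desc: np.ndarray,      # (num_queries, dim) query descriptors
    db_desc: np.ndarray,         # (num_database, dim) database descriptors
    positives: list[set[int]],   # true-match database indices per query
    top_percent: float | None = None,
) -> float:
    """Return AR@1 if `top_percent` is None, else AR@top_percent% (e.g. 1.0 for AR@1%)."""
    # Number of candidates to retrieve: 1 for AR@1, the top 1% of the database for AR@1%.
    k = 1 if top_percent is None else max(1, round(len(db_desc) * top_percent / 100))
    hits = 0
    for query, true_matches in zip(query_desc, positives):
        distances = np.linalg.norm(db_desc - query, axis=1)  # L2 distance to every database entry
        retrieved = np.argsort(distances)[:k]                # indices of the k nearest descriptors
        hits += bool(set(retrieved.tolist()) & true_matches)  # success if any candidate is a true match
    return 100.0 * hits / len(query_desc)
```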

### Directory Structure

```
/home/docker_mssplace/
├── MSSPlace/                      # This repository
│   ├── checkpoints/               # Downloaded checkpoints
│   ├── configs/                   # Configuration files
│   ├── docker/                    # Docker environment setup
│   ├── docs/                      # Documentation and examples
│   ├── images/                    # Example images and figures
│   ├── scripts/                   # Organized scripts
│   ├── src/                       # Core source code
│   └── third_party/               # External dependencies
│       └── OpenPlaceRecognition/  # Core OPR library
└── Datasets/                      # Dataset directory (configurable with --datasets-dir)
    ├── pnvlad_oxford_robotcar/
    └── NCLT_preprocessed/
```

### Key Arguments

| Argument | Default | Description |
|----------|---------|-------------|
| `--dataset` | *Required* | `oxford` or `nclt` |
| `--model` | *Required* | Model variant (see table above) |
| `--datasets-dir` | `/home/docker_mssplace/Datasets` | Path to datasets directory |
| `--checkpoints-dir` | `./checkpoints` | Path to model checkpoints |
| `--configs-dir` | `./configs/model` | Path to model configurations |
| `--batch-size` | `32` | Evaluation batch size |
| `--verbose` | `False` | Enable detailed logging |

**Example Usage:**
```bash
# Basic evaluation
python scripts/evaluation/evaluate_checkpoints.py --dataset oxford --model mssplace-li

# Custom dataset location
python scripts/evaluation/evaluate_checkpoints.py \
    --dataset nclt --model mssplace-lis \
    --datasets-dir /path/to/your/datasets \
    --verbose
```

## Training (Optional)

⚠️ **Work in Progress**: Training documentation and scripts will be updated soon. Please check back later for training instructions. In the meantime, the sketch below shows how the Hydra-style configs in `configs/` can be loaded and instantiated.
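
### Config Instantiation (Illustrative)

The configs under `configs/` follow Hydra's `_target_` convention, so datasets and models can typically be built directly from the YAML files. The snippet below is a minimal sketch under the following assumptions: Hydra and OmegaConf are available, the code runs from the repository root (so that targets such as `src.datasets.NCLTDatasetWithText` resolve), and `dataset_root` is overridden to point at real data.

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Load a dataset config and point it at an actual dataset location.
cfg = OmegaConf.load("configs/dataset/nclt/camera1_lidar.yaml")
cfg.dataset_root = "/home/docker_mssplace/Datasets/NCLT_preprocessed"

# Hydra builds the object named by `_target_`, passing the remaining keys as kwargs.
# Some classes may require extra keyword arguments (e.g. a dataset split).
dataset = instantiate(cfg)
print(f"Loaded {len(dataset)} samples")
```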

## Troubleshooting

- **Missing checkpoints**: Download all `.pth` files to `checkpoints/`
- **Dataset errors**: Verify that the directory structure matches the expected format
- **CUDA memory**: Reduce `--batch-size` if you run out of memory
- **Dependencies**: Use the provided Docker environment

--------------------------------------------------------------------------------
/checkpoints/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmelekhin/MSSPlace/9ac48de75ee6a4ea01dac99d336d3a529bd73b61/checkpoints/.gitkeep

--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null

--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_semantic.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null

--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_semantic_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null

--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_semantic_text.yaml:
-------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5, 5 | mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5, 6 | text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5] 7 | positive_threshold: 10.0 8 | negative_threshold: 50.0 9 | images_dirname: images_small 10 | masks_dirname: segmentation_masks_small 11 | text_embeddings_dirname: clip-vit-base-patch32 12 | pointclouds_dirname: velodyne_data 13 | pointcloud_quantization_size: 0.5 14 | max_point_distance: 100.0 15 | spherical_coords: False 16 | use_intensity_values: False 17 | image_transform: null 18 | semantic_transform: null 19 | pointcloud_transform: null 20 | pointcloud_set_transform: null 21 | -------------------------------------------------------------------------------- /configs/dataset/nclt/all_camera_semantic_text_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5, 5 | mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5, 6 | text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5, 7 | pointcloud_lidar,] 8 | positive_threshold: 10.0 9 | negative_threshold: 50.0 10 | images_dirname: images_small 11 | masks_dirname: segmentation_masks_small 12 | text_embeddings_dirname: clip-vit-base-patch32 13 | pointclouds_dirname: velodyne_data 14 | pointcloud_quantization_size: 0.5 15 | max_point_distance: 100.0 16 | spherical_coords: False 17 | use_intensity_values: False 18 | image_transform: null 19 | semantic_transform: null 20 | pointcloud_transform: null 21 | pointcloud_set_transform: null 22 | -------------------------------------------------------------------------------- /configs/dataset/nclt/all_camera_text.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5, 5 | text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: velodyne_data 12 | pointcloud_quantization_size: 0.5 13 | max_point_distance: 100.0 14 | spherical_coords: False 15 | use_intensity_values: False 16 | image_transform: null 17 | semantic_transform: null 18 | pointcloud_transform: null 19 | pointcloud_set_transform: null 20 | -------------------------------------------------------------------------------- /configs/dataset/nclt/all_camera_text_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5, 5 | text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5, 6 | pointcloud_lidar,] 7 | positive_threshold: 10.0 8 | negative_threshold: 50.0 9 | images_dirname: images_small 10 | masks_dirname: segmentation_masks_small 11 | text_embeddings_dirname: clip-vit-base-patch32 12 | pointclouds_dirname: velodyne_data 13 | pointcloud_quantization_size: 0.5 14 | max_point_distance: 100.0 15 | 
spherical_coords: False 16 | use_intensity_values: False 17 | image_transform: null 18 | semantic_transform: null 19 | pointcloud_transform: null 20 | pointcloud_set_transform: null 21 | -------------------------------------------------------------------------------- /configs/dataset/nclt/camera1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/camera1_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, 5 | pointcloud_lidar,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: velodyne_data 12 | pointcloud_quantization_size: 0.5 13 | max_point_distance: 100.0 14 | spherical_coords: False 15 | use_intensity_values: False 16 | image_transform: null 17 | semantic_transform: null 18 | pointcloud_transform: null 19 | pointcloud_set_transform: null 20 | -------------------------------------------------------------------------------- /configs/dataset/nclt/camera2-front-back.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam5, image_Cam2] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/camera2-left-right.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam4] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/camera5.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 
| 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [pointcloud_lidar,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/semantic1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/semantic2-front-back.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_Cam5, mask_Cam2] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/semantic2-left-right.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_Cam1, mask_Cam4] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: 
null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/semantic5.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.nclt.NCLTDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: velodyne_data 10 | pointcloud_quantization_size: 0.5 11 | max_point_distance: 100.0 12 | spherical_coords: False 13 | use_intensity_values: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text1_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text1_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text1_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-front-back_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: 
/path/to/dataset 4 | data_to_load: [text_Cam5, text_Cam2,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-front-back_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam5, text_Cam2,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-front-back_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam5, text_Cam2,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-left-right_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam4,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-left-right_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam4,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: 
clip-vit-large-patch14 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text2-left-right_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam4,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text5_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text5_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/nclt/text5_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.NCLTDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: velodyne_data 11 | pointcloud_quantization_size: 0.5 12 | max_point_distance: 100.0 13 | spherical_coords: False 14 | 
use_intensity_values: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | pointcloud_lidar,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: null 12 | pointcloud_quantization_size: 0.01 13 | max_point_distance: null 14 | spherical_coords: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_semantic.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: null 12 | pointcloud_quantization_size: 0.01 13 | max_point_distance: null 14 | spherical_coords: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_semantic_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right, 6 | pointcloud_lidar,] 7 | positive_threshold: 10.0 8 | negative_threshold: 50.0 9 | images_dirname: images_small 10 | masks_dirname: segmentation_masks_small 11 | text_embeddings_dirname: clip-vit-base-patch32 12 | pointclouds_dirname: null 13 | pointcloud_quantization_size: 0.01 14 | max_point_distance: null 15 | spherical_coords: False 16 | image_transform: null 17 | semantic_transform: null 18 | pointcloud_transform: null 19 | pointcloud_set_transform: null 20 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_semantic_text.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right, 6 | text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,] 7 | positive_threshold: 10.0 8 | negative_threshold: 50.0 9 | images_dirname: images_small 10 | masks_dirname: segmentation_masks_small 11 
| text_embeddings_dirname: clip-vit-base-patch32 12 | pointclouds_dirname: null 13 | pointcloud_quantization_size: 0.01 14 | max_point_distance: null 15 | spherical_coords: False 16 | image_transform: null 17 | semantic_transform: null 18 | pointcloud_transform: null 19 | pointcloud_set_transform: null 20 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_semantic_text_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right, 6 | text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right, 7 | pointcloud_lidar,] 8 | positive_threshold: 10.0 9 | negative_threshold: 50.0 10 | images_dirname: images_small 11 | masks_dirname: segmentation_masks_small 12 | text_embeddings_dirname: clip-vit-base-patch32 13 | pointclouds_dirname: null 14 | pointcloud_quantization_size: 0.01 15 | max_point_distance: null 16 | spherical_coords: False 17 | image_transform: null 18 | semantic_transform: null 19 | pointcloud_transform: null 20 | pointcloud_set_transform: null 21 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_text.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: null 12 | pointcloud_quantization_size: 0.01 13 | max_point_distance: null 14 | spherical_coords: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/oxford/all_camera_text_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right, 5 | text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right, 6 | pointcloud_lidar,] 7 | positive_threshold: 10.0 8 | negative_threshold: 50.0 9 | images_dirname: images_small 10 | masks_dirname: segmentation_masks_small 11 | text_embeddings_dirname: clip-vit-base-patch32 12 | pointclouds_dirname: null 13 | pointcloud_quantization_size: 0.01 14 | max_point_distance: null 15 | spherical_coords: False 16 | image_transform: null 17 | semantic_transform: null 18 | pointcloud_transform: null 19 | pointcloud_set_transform: null 20 | -------------------------------------------------------------------------------- /configs/dataset/oxford/camera1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre,] 5 | positive_threshold: 10.0 6 | negative_threshold: 
50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/camera1_lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, 5 | pointcloud_lidar,] 6 | positive_threshold: 10.0 7 | negative_threshold: 50.0 8 | images_dirname: images_small 9 | masks_dirname: segmentation_masks_small 10 | text_embeddings_dirname: clip-vit-base-patch32 11 | pointclouds_dirname: null 12 | pointcloud_quantization_size: 0.01 13 | max_point_distance: null 14 | spherical_coords: False 15 | image_transform: null 16 | semantic_transform: null 17 | pointcloud_transform: null 18 | pointcloud_set_transform: null 19 | -------------------------------------------------------------------------------- /configs/dataset/oxford/camera2-front-back.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_rear] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/camera2-left-right.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_mono_left, image_mono_right] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/camera4.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/lidar.yaml: 
-------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [pointcloud_lidar,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/semantic1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_stereo_centre,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/semantic2-front-back.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_stereo_centre, mask_mono_rear] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/semantic2-left-right.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_mono_left, mask_mono_right] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | -------------------------------------------------------------------------------- /configs/dataset/oxford/semantic4.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.datasets.OxfordDataset 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | pointclouds_dirname: null 10 | pointcloud_quantization_size: 0.01 11 | max_point_distance: null 12 | spherical_coords: False 13 | image_transform: null 14 | semantic_transform: null 15 | pointcloud_transform: null 16 | pointcloud_set_transform: null 17 | 
-------------------------------------------------------------------------------- /configs/dataset/oxford/text1_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text1_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text1_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-front-back_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_rear,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-front-back_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_rear,] 5 | positive_threshold: 10.0 6 | 
negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-front-back_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_rear,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-left-right_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_mono_left, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-left-right_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_mono_left, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text2-left-right_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_mono_left, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | 
pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text4_clip-base.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-base-patch32 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text4_clip-large.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: clip-vit-large-patch14 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/dataset/oxford/text4_tfidf.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.datasets.OxfordDatasetWithText 2 | 3 | dataset_root: /path/to/dataset 4 | data_to_load: [text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,] 5 | positive_threshold: 10.0 6 | negative_threshold: 50.0 7 | images_dirname: images_small 8 | masks_dirname: segmentation_masks_small 9 | text_embeddings_dirname: tfidf_pca 10 | pointclouds_dirname: null 11 | pointcloud_quantization_size: 0.01 12 | max_point_distance: null 13 | spherical_coords: False 14 | image_transform: null 15 | semantic_transform: null 16 | pointcloud_transform: null 17 | pointcloud_set_transform: null 18 | -------------------------------------------------------------------------------- /configs/loss/batch_hard_triplet_margin.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.losses.BatchHardTripletMarginLoss 2 | 3 | margin: 0.2 4 | -------------------------------------------------------------------------------- /configs/model/camera1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.ResNet18 2 | 3 | in_channels: 3 4 | out_channels: 256 5 | num_top_down: 0 6 | pooling: gem 7 | -------------------------------------------------------------------------------- /configs/model/camera2_add.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | 
fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: opr.modules.Add 14 | -------------------------------------------------------------------------------- /configs/model/camera2_concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: opr.modules.Concat 14 | -------------------------------------------------------------------------------- /configs/model/camera2_gem.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: opr.modules.GeMFusion 14 | -------------------------------------------------------------------------------- /configs/model/camera2_mlp-full.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.Concat 16 | - _target_: opr.modules.MLP 17 | in_features: 512 18 | out_features: 512 19 | -------------------------------------------------------------------------------- /configs/model/camera2_mlp-half.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.Concat 16 | - _target_: opr.modules.MLP 17 | in_features: 512 18 | out_features: 256 19 | -------------------------------------------------------------------------------- /configs/model/camera2_sa-add.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.SelfAttention 16 | embed_size: 256 17 | - _target_: opr.modules.Add 18 | -------------------------------------------------------------------------------- /configs/model/camera2_sa-concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 
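# Added note: this two-camera variant pools per-camera ResNet18-FPN features with
# GeM, then fuses the two descriptors by self-attention followed by concatenation
# (the fusion block below).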
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 3 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: True 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.SelfAttention 16 | embed_size: 256 17 | - _target_: opr.modules.Concat 18 | -------------------------------------------------------------------------------- /configs/model/convnext_camera1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ConvNeXtTinyFeatureExtractor 5 | in_channels: 3 6 | pretrained: True 7 | head: 8 | _target_: torch.nn.Sequential 9 | _args_: 10 | - _target_: opr.modules.GeM 11 | - _target_: torch.nn.Linear 12 | in_features: 768 13 | out_features: 256 14 | -------------------------------------------------------------------------------- /configs/model/convnext_semantic1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ConvNeXtTinyFeatureExtractor 5 | in_channels: 1 6 | pretrained: False 7 | head: 8 | _target_: torch.nn.Sequential 9 | _args_: 10 | - _target_: opr.modules.GeM 11 | - _target_: torch.nn.Linear 12 | in_features: 768 13 | out_features: 256 14 | -------------------------------------------------------------------------------- /configs/model/lidar.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.MinkLoc3Dv2 2 | 3 | in_channels: 1 4 | out_channels: 256 5 | num_top_down: 2 6 | conv0_kernel_size: 5 7 | block: ECABasicBlock 8 | layers: [1, 1, 1, 1] 9 | planes: [64, 128, 64, 32] 10 | pooling: gem 11 | -------------------------------------------------------------------------------- /configs/model/minkloc-multimodal.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.LateFusionModel 2 | 3 | image_module: 4 | _target_: opr.models.place_recognition.ResNet18 5 | in_channels: 3 6 | out_channels: 128 7 | num_top_down: 0 8 | pooling: gem 9 | 10 | cloud_module: 11 | _target_: opr.models.place_recognition.MinkLoc3D 12 | in_channels: 1 13 | out_channels: 128 14 | num_top_down: 1 15 | conv0_kernel_size: 5 16 | block: ECABasicBlock 17 | layers: [1, 1, 1] 18 | planes: [32, 64, 64] 19 | pooling: gem 20 | 21 | fusion_module: 22 | _target_: opr.modules.Concat 23 | -------------------------------------------------------------------------------- /configs/model/minkloc3dv2.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.MinkLoc3Dv2 2 | 3 | in_channels: 1 4 | out_channels: 256 5 | num_top_down: 2 6 | conv0_kernel_size: 5 7 | block: ECABasicBlock 8 | layers: [1, 1, 1, 1] 9 | planes: [64, 128, 64, 32] 10 | pooling: gem 11 | -------------------------------------------------------------------------------- /configs/model/mssplace-i.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.ImageModel 2 | backbone: 3 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 4 | in_channels: 3 5 | lateral_dim: 256 6 | fh_num_bottom_up: 4 7 | 
fh_num_top_down: 0 8 | pretrained: True 9 | head: 10 | _target_: opr.modules.GeM 11 | fusion: 12 | _target_: opr.modules.Add 13 | -------------------------------------------------------------------------------- /configs/model/mssplace-li.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.LateFusionModel 2 | 3 | image_module: 4 | _target_: opr.models.place_recognition.base.ImageModel 5 | backbone: 6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 7 | in_channels: 3 8 | lateral_dim: 256 9 | fh_num_bottom_up: 4 10 | fh_num_top_down: 0 11 | pretrained: True 12 | head: 13 | _target_: opr.modules.GeM 14 | fusion: 15 | _target_: opr.modules.Add 16 | 17 | cloud_module: 18 | _target_: opr.models.place_recognition.MinkLoc3Dv2 19 | in_channels: 1 20 | out_channels: 256 21 | num_top_down: 2 22 | conv0_kernel_size: 5 23 | block: ECABasicBlock 24 | layers: [1, 1, 1, 1] 25 | planes: [64, 128, 64, 32] 26 | pooling: gem 27 | 28 | fusion_module: 29 | _target_: opr.modules.Concat 30 | -------------------------------------------------------------------------------- /configs/model/mssplace-lis.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.LateFusionModel 2 | 3 | image_module: 4 | _target_: opr.models.place_recognition.base.ImageModel 5 | backbone: 6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 7 | in_channels: 3 8 | lateral_dim: 256 9 | fh_num_bottom_up: 4 10 | fh_num_top_down: 0 11 | pretrained: True 12 | head: 13 | _target_: opr.modules.GeM 14 | fusion: 15 | _target_: opr.modules.Add 16 | 17 | cloud_module: 18 | _target_: opr.models.place_recognition.MinkLoc3Dv2 19 | in_channels: 1 20 | out_channels: 256 21 | num_top_down: 2 22 | conv0_kernel_size: 5 23 | block: ECABasicBlock 24 | layers: [1, 1, 1, 1] 25 | planes: [64, 128, 64, 32] 26 | pooling: gem 27 | 28 | semantic_module: 29 | _target_: opr.models.place_recognition.base.SemanticModel 30 | backbone: 31 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 32 | in_channels: 1 33 | lateral_dim: 256 34 | fh_num_bottom_up: 4 35 | fh_num_top_down: 0 36 | pretrained: False 37 | head: 38 | _target_: opr.modules.GeM 39 | fusion: 40 | _target_: opr.modules.Add 41 | 42 | fusion_module: 43 | _target_: opr.modules.Concat 44 | -------------------------------------------------------------------------------- /configs/model/mssplace-list.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.LateFusionModel 2 | 3 | image_module: 4 | _target_: opr.models.place_recognition.base.ImageModel 5 | backbone: 6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 7 | in_channels: 3 8 | lateral_dim: 256 9 | fh_num_bottom_up: 4 10 | fh_num_top_down: 0 11 | pretrained: True 12 | head: 13 | _target_: opr.modules.GeM 14 | fusion: 15 | _target_: opr.modules.Add 16 | 17 | cloud_module: 18 | _target_: opr.models.place_recognition.MinkLoc3Dv2 19 | in_channels: 1 20 | out_channels: 256 21 | num_top_down: 2 22 | conv0_kernel_size: 5 23 | block: ECABasicBlock 24 | layers: [1, 1, 1, 1] 25 | planes: [64, 128, 64, 32] 26 | pooling: gem 27 | 28 | semantic_module: 29 | _target_: opr.models.place_recognition.base.SemanticModel 30 | backbone: 31 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 32 | in_channels: 1 33 | lateral_dim: 256 34 | fh_num_bottom_up: 4 35 | 
fh_num_top_down: 0 36 | pretrained: False 37 | head: 38 | _target_: opr.modules.GeM 39 | fusion: 40 | _target_: opr.modules.Add 41 | 42 | text_module: 43 | _target_: src.models.TextModel 44 | model: 45 | _target_: opr.modules.MLP 46 | in_features: 512 47 | out_features: 256 48 | drop: 0.5 49 | fusion: 50 | _target_: opr.modules.Add 51 | 52 | fusion_module: 53 | _target_: opr.modules.Concat 54 | -------------------------------------------------------------------------------- /configs/model/mssplace-lit.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.LateFusionModel 2 | 3 | image_module: 4 | _target_: opr.models.place_recognition.base.ImageModel 5 | backbone: 6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 7 | in_channels: 3 8 | lateral_dim: 256 9 | fh_num_bottom_up: 4 10 | fh_num_top_down: 0 11 | pretrained: True 12 | head: 13 | _target_: opr.modules.GeM 14 | fusion: 15 | _target_: opr.modules.Add 16 | 17 | cloud_module: 18 | _target_: opr.models.place_recognition.MinkLoc3Dv2 19 | in_channels: 1 20 | out_channels: 256 21 | num_top_down: 2 22 | conv0_kernel_size: 5 23 | block: ECABasicBlock 24 | layers: [1, 1, 1, 1] 25 | planes: [64, 128, 64, 32] 26 | pooling: gem 27 | 28 | text_module: 29 | _target_: src.models.TextModel 30 | model: 31 | _target_: opr.modules.MLP 32 | in_features: 512 33 | out_features: 256 34 | drop: 0.5 35 | fusion: 36 | _target_: opr.modules.Add 37 | 38 | fusion_module: 39 | _target_: opr.modules.Concat 40 | -------------------------------------------------------------------------------- /configs/model/semantic1.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.SemanticResNet18 2 | 3 | in_channels: 1 4 | out_channels: 256 5 | num_top_down: 0 6 | pooling: gem 7 | -------------------------------------------------------------------------------- /configs/model/semantic2_add.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: opr.modules.Add 14 | -------------------------------------------------------------------------------- /configs/model/semantic2_concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: opr.modules.Concat 14 | -------------------------------------------------------------------------------- /configs/model/semantic2_mlp-full.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 
| - _target_: opr.modules.Concat 16 | - _target_: opr.modules.MLP 17 | in_features: 512 18 | out_features: 512 19 | -------------------------------------------------------------------------------- /configs/model/semantic2_mlp-half.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.Concat 16 | - _target_: opr.modules.MLP 17 | in_features: 512 18 | out_features: 256 19 | -------------------------------------------------------------------------------- /configs/model/semantic2_sa-add.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.SelfAttention 16 | embed_size: 256 17 | - _target_: opr.modules.Add 18 | -------------------------------------------------------------------------------- /configs/model/semantic2_sa-concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.models.place_recognition.base.SemanticModel 2 | 3 | backbone: 4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor 5 | in_channels: 1 6 | lateral_dim: 256 7 | fh_num_bottom_up: 4 8 | fh_num_top_down: 0 9 | pretrained: False 10 | head: 11 | _target_: opr.modules.GeM 12 | fusion: 13 | _target_: torch.nn.Sequential 14 | _args_: 15 | - _target_: opr.modules.SelfAttention 16 | embed_size: 256 17 | - _target_: opr.modules.Concat 18 | -------------------------------------------------------------------------------- /configs/model/text1_clip-base-mlp.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 512 6 | out_features: 256 7 | drop: 0.5 8 | fusion: null 9 | -------------------------------------------------------------------------------- /configs/model/text1_clip-large-mlp.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 768 6 | out_features: 256 7 | drop: 0.5 8 | fusion: null 9 | -------------------------------------------------------------------------------- /configs/model/text1_tfidf-mlp.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 128 6 | out_features: 128 7 | drop: 0.5 8 | fusion: null 9 | -------------------------------------------------------------------------------- /configs/model/text2_clip-base-mlp-add.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 512 6 | out_features: 256 7 
| drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Add 10 | -------------------------------------------------------------------------------- /configs/model/text2_clip-base-mlp-concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 512 6 | out_features: 256 7 | drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Concat 10 | -------------------------------------------------------------------------------- /configs/model/text2_clip-large-mlp-add.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 768 6 | out_features: 256 7 | drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Add 10 | -------------------------------------------------------------------------------- /configs/model/text2_clip-large-mlp-concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 768 6 | out_features: 256 7 | drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Concat 10 | -------------------------------------------------------------------------------- /configs/model/text2_tfidf-mlp-add.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 128 6 | out_features: 128 7 | drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Add 10 | -------------------------------------------------------------------------------- /configs/model/text2_tfidf-mlp-concat.yaml: -------------------------------------------------------------------------------- 1 | _target_: src.models.TextModel 2 | 3 | model: 4 | _target_: opr.modules.MLP 5 | in_features: 128 6 | out_features: 128 7 | drop: 0.5 8 | fusion: 9 | _target_: opr.modules.Concat 10 | -------------------------------------------------------------------------------- /configs/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | _convert_: all 3 | 4 | lr: 0.001 5 | weight_decay: 0.0001 6 | -------------------------------------------------------------------------------- /configs/sampler/batch_sampler.yaml: -------------------------------------------------------------------------------- 1 | _target_: opr.samplers.BatchSampler 2 | 3 | batch_size: 16 4 | batch_size_limit: 128 5 | batch_expansion_rate: 1.4 6 | max_batches: null 7 | positives_per_group: 2 8 | seed: ${seed} 9 | drop_last: True 10 | -------------------------------------------------------------------------------- /configs/scheduler/multi_step.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.lr_scheduler.MultiStepLR 2 | gamma: 0.1 3 | milestones: [40, 60] 4 | -------------------------------------------------------------------------------- /configs/train_unimodal.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - dataset: nclt/lidar 4 | - sampler: batch_sampler 5 | - model: lidar 6 | - loss: batch_hard_triplet_margin 7 | - optimizer: adam 8 | - scheduler: multi_step 9 | 10 | wandb: 11 | disabled: false 12 | project: CVPR2024 13 | 14 | debug: false 15 | device: cuda 16 | seed: 3121999 17 | num_workers: 4 18 | 19 | 
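# Added note: ??? is OmegaConf's mandatory-value marker; exp_name has no default
# and must be supplied as an override, e.g. python train_unimodal.py exp_name=my_run.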
exp_name: ??? 20 | 21 | epochs: 80 22 | batch_expansion_threshold: 0.7 23 | -------------------------------------------------------------------------------- /docker/Dockerfile.cuda: -------------------------------------------------------------------------------- 1 | FROM alexmelekhin/open-place-recognition:base 2 | 3 | RUN apt-get update && apt-get upgrade -y && apt-get install -y \ 4 | libcairo2-dev \ 5 | libgirepository1.0-dev \ 6 | libdbus-1-dev \ 7 | libdbus-glib-1-dev \ 8 | && rm -rf /var/lib/apt/lists/* 9 | 10 | # to install "dvc[gdrive]" we need to install "distro" package first 11 | ARG DISTRO_VERSION=1.9.0 12 | RUN pip install distro==${DISTRO_VERSION} 13 | 14 | # install other requirements from requirements.txt 15 | COPY requirements.txt . 16 | RUN pip install -r requirements.txt && \ 17 | rm requirements.txt 18 | 19 | # add user and his password 20 | ENV USER=docker_mssplace 21 | ARG UID=1000 22 | ARG GID=1000 23 | # default password 24 | ARG PW=user 25 | 26 | RUN useradd -m ${USER} --uid=${UID} && echo "${USER}:${PW}" | chpasswd && adduser ${USER} sudo 27 | WORKDIR /home/${USER} 28 | 29 | # create some directories for mounting volumes 30 | RUN mkdir MSSPlace && chown -R ${UID}:${GID} /home/${USER} 31 | RUN mkdir Datasets && chown -R ${UID}:${GID} /home/${USER} 32 | 33 | USER ${UID}:${GID} 34 | 35 | # install OpenPlaceRecognition library 36 | COPY --chown=${UID}:${GID} ./third_party/OpenPlaceRecognition ./OpenPlaceRecognition 37 | RUN cd OpenPlaceRecognition && \ 38 | pip install --user . && \ 39 | cd .. && \ 40 | rm -rf OpenPlaceRecognition 41 | -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | orange=`tput setaf 3` 4 | reset_color=`tput sgr0` 5 | 6 | ARCH=`uname -m` 7 | if [ $ARCH != "x86_64" ]; then 8 | echo "${orange}${ARCH}${reset_color} architecture is not supported" 9 | exit 1 10 | fi 11 | 12 | if command -v nvidia-smi &> /dev/null; then 13 | echo "Detected ${orange}CUDA${reset_color} hardware" 14 | DOCKERFILE=Dockerfile.cuda 15 | DEVICE=cuda 16 | else 17 | echo "${orange}CPU-only${reset_color} build is not supported" 18 | exit 1 19 | fi 20 | 21 | echo "Building for ${orange}${ARCH}${reset_color} with ${orange}${DEVICE}${reset_color}" 22 | 23 | PROJECT_ROOT_DIR=$(cd ./"`dirname $0`"/.. 
|| exit; pwd) 24 | 25 | docker build $PROJECT_ROOT_DIR \ 26 | -f $PROJECT_ROOT_DIR/docker/$DOCKERFILE \ 27 | --build-arg UID=$(id -u) \ 28 | --build-arg GID=$(id -g) \ 29 | -t mssplace:latest 30 | -------------------------------------------------------------------------------- /docker/into.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker exec --user docker_mssplace -it ${USER}_mssplace \ 4 | /bin/bash -c "cd /home/docker_mssplace; echo ${USER}_mssplace container; echo ; /bin/bash" 5 | -------------------------------------------------------------------------------- /docker/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | orange=`tput setaf 3` 4 | reset_color=`tput sgr0` 5 | 6 | get_real_path(){ 7 | if [ "${1:0:1}" == "/" ]; then 8 | echo "$1" 9 | else 10 | realpath -m "$PWD"/"$1" 11 | fi 12 | } 13 | 14 | ARCH=`uname -m` 15 | if [ $ARCH == "x86_64" ]; then 16 | if command -v nvidia-smi &> /dev/null; then 17 | DEVICE=cuda 18 | ARGS="--ipc host --gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" 19 | else 20 | echo "${orange}CPU-only${reset_color} build is not supported" 21 | exit 1 22 | fi 23 | else 24 | echo "${orange}${ARCH}${reset_color} architecture is not supported" 25 | exit 1 26 | fi 27 | 28 | if [ $# != 1 ]; then 29 | echo "Usage: 30 | bash start.sh [DATASETS_DIR] 31 | " 32 | exit 1 33 | fi 34 | 35 | DATASETS_DIR=$(get_real_path "$1") 36 | 37 | if [ ! -d $DATASETS_DIR ]; then 38 | echo "Error: DATASETS_DIR=$DATASETS_DIR is not an existing directory." 39 | exit 1 40 | fi 41 | 42 | PROJECT_ROOT_DIR=$(cd ./"`dirname $0`"/.. || exit; pwd) 43 | 44 | echo "Running on ${orange}${ARCH}${reset_color} with ${orange}${DEVICE}${reset_color}" 45 | 46 | docker run -it -d --rm \ 47 | $ARGS \ 48 | --privileged \ 49 | --name ${USER}_mssplace \ 50 | --net host \ 51 | -v $PROJECT_ROOT_DIR:/home/docker_mssplace/MSSPlace:rw \ 52 | -v $DATASETS_DIR:/home/docker_mssplace/Datasets:rw \ 53 | mssplace:latest 54 | 55 | docker exec --user root \ 56 | ${USER}_mssplace bash -c "/etc/init.d/ssh start" 57 | -------------------------------------------------------------------------------- /images/mssplace_overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmelekhin/MSSPlace/9ac48de75ee6a4ea01dac99d336d3a529bd73b61/images/mssplace_overview.jpg -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "mssplace" 7 | version = "0.1.0" 8 | description = "Multi-Sensor Place Recognition with Visual and Text Semantics" 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | license = {text = "MIT"} 12 | authors = [ 13 | {name = "MSSPlace Team"} 14 | ] 15 | keywords = ["place recognition", "multimodal", "computer vision", "robotics"] 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Developers", 19 | "Intended Audience :: Science/Research", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.10", 24 | "Programming Language :: Python :: 3.11", 25 | "Programming Language :: Python :: 3.12", 26 | "Topic :: 
Scientific/Engineering :: Artificial Intelligence", 27 | "Topic :: Scientific/Engineering :: Image Recognition", 28 | ] 29 | 30 | # Dependencies - using requirements.txt for now, can be moved here later 31 | dependencies = [] 32 | 33 | [tool.setuptools.packages.find] 34 | where = ["src"] 35 | 36 | [tool.setuptools.package-dir] 37 | "" = "src" 38 | 39 | [tool.ruff] 40 | line-length = 110 41 | src = ["src"] 42 | 43 | [tool.ruff.format] 44 | quote-style = "double" 45 | indent-style = "space" 46 | skip-magic-trailing-comma = false 47 | line-ending = "auto" 48 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | ruff==0.11.11 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | albumentations==1.3.1 2 | antlr4-python3-runtime==4.9.3 3 | appdirs==1.4.4 4 | certifi==2025.4.26 5 | chardet==3.0.4 6 | charset-normalizer==3.3.2 7 | click==8.1.7 8 | contourpy==1.1.1 9 | cycler==0.12.1 10 | dbus-python==1.2.16 11 | distro==1.4.0 12 | docker-pycreds==0.4.0 13 | fonttools==4.45.1 14 | gitdb==4.0.11 15 | GitPython==3.1.44 16 | hydra-core==1.3.2 17 | idna==3.10 18 | imageio==2.33.0 19 | importlib-resources==6.1.1 20 | joblib==1.3.2 21 | kaleido==0.2.1 22 | kiwisolver==1.4.5 23 | lazy_loader==0.3 24 | loguru==0.7.2 25 | matplotlib==3.7.4 26 | networkx==3.1 27 | ninja==1.11.1.1 28 | numpy==1.24.4 29 | numpy-quaternion==2024.0.8 30 | omegaconf==2.3.0 31 | opencv-python==4.8.1.78 32 | opencv-python-headless==4.8.1.78 33 | packaging==23.2 34 | pandas==2.0.3 35 | Pillow==11.2.1 36 | plotly==5.18.0 37 | protobuf==4.25.1 38 | psutil==5.9.6 39 | PyGObject==3.36.0 40 | pyparsing==3.1.1 41 | python-dateutil==2.8.2 42 | pytorch-metric-learning==2.3.0 43 | pytz==2023.3.post1 44 | PyWavelets==1.4.1 45 | PyYAML==6.0.1 46 | qudida==0.0.4 47 | requests==2.32.3 48 | scikit-image==0.21.0 49 | scikit-learn==1.6.1 50 | scipy==1.10.1 51 | seaborn==0.13.2 52 | sentry-sdk==2.29.1 53 | setproctitle==1.3.3 54 | six==1.14.0 55 | smmap==5.0.1 56 | ssh-import-id==5.10 57 | tenacity==8.2.3 58 | threadpoolctl==3.2.0 59 | tifffile==2023.7.10 60 | tqdm==4.67.1 61 | typing_extensions==4.8.0 62 | tzdata==2023.3 63 | urllib3==2.4.0 64 | wandb==0.16.0 65 | zipp==3.21.0 66 | -------------------------------------------------------------------------------- /scripts/evaluation/evaluate_checkpoints.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Checkpoint Evaluation Script for MSSPlace Models 4 | 5 | This script evaluates pre-trained MSSPlace model checkpoints on Oxford and NCLT 6 | datasets to verify the results reported in our paper. It provides a clean interface 7 | for testing different model variants with comprehensive logging and error handling. 8 | 9 | Note: This script evaluates released checkpoints only. For full experimental 10 | reproduction including training from scratch, refer to the training scripts 11 | (not publicly released). 
12 | 13 | Key Features: 14 | - Supports text-enabled datasets for all model variants 15 | - Dynamic sensor setup configuration per model type 16 | - Loguru-based logging with colored output 17 | - Automatic sensor setup selection based on model name 18 | 19 | Usage: 20 | python evaluate_checkpoints.py --dataset oxford --model mssplace-li 21 | python evaluate_checkpoints.py --dataset nclt --model mssplace-lis --batch-size 16 22 | python evaluate_checkpoints.py --dataset oxford --model mssplace-lit --verbose 23 | 24 | Requirements: 25 | - PyTorch 2.1+ 26 | - Python 3.10+ 27 | - Hydra/OmegaConf for configuration management 28 | - Custom OPR (Open Place Recognition) library 29 | - loguru for enhanced logging 30 | - Custom datasets module with text support 31 | 32 | Author: Generated from what_is_in_checkpoint.ipynb 33 | Date: May 23, 2025 34 | """ 35 | 36 | import argparse 37 | import sys 38 | from datetime import datetime 39 | from pathlib import Path 40 | 41 | import torch 42 | from omegaconf import OmegaConf 43 | from hydra.utils import instantiate 44 | from torch.utils.data import DataLoader 45 | from loguru import logger 46 | 47 | # Import custom modules from the OPR library 48 | from opr.testers.place_recognition.model import ModelTester, RetrievalResultsCollection 49 | 50 | # Import text-enabled datasets from the installed mssplace package 51 | from mssplace.datasets import NCLTDatasetWithText, OxfordDatasetWithText 52 | 53 | 54 | # Configuration constants following the original notebook structure 55 | DATASET_CHOICES = ["oxford", "nclt"] 56 | 57 | MODEL_CHOICES = [ 58 | "mssplace-li", 59 | "mssplace-lis", 60 | "mssplace-lit", 61 | "mssplace-list", 62 | "mssplace-i", 63 | "minkloc-multimodal", 64 | "minkloc3dv2", 65 | ] 66 | 67 | MODEL_CONFIG_NAMES = { 68 | "mssplace-li": "mssplace-li.yaml", 69 | "mssplace-lis": "mssplace-lis.yaml", 70 | "mssplace-lit": "mssplace-lit.yaml", 71 | "mssplace-list": "mssplace-list.yaml", 72 | "mssplace-i": "mssplace-i.yaml", 73 | "minkloc-multimodal": "minkloc-multimodal.yaml", 74 | "minkloc3dv2": "minkloc3dv2.yaml", 75 | } 76 | 77 | CHECKPOINT_NAMES = { 78 | "oxford": { 79 | "mssplace-li": "oxford_mssplace_li.pth", 80 | "mssplace-lis": "oxford_mssplace_lis.pth", 81 | "mssplace-lit": "oxford_mssplace_lit.pth", 82 | "mssplace-list": "oxford_mssplace_list.pth", 83 | }, 84 | "nclt": { 85 | "mssplace-li": "nclt_mssplace_li.pth", 86 | "mssplace-lis": "nclt_mssplace_lis.pth", 87 | "mssplace-lit": "nclt_mssplace_lit.pth", 88 | "mssplace-list": "nclt_mssplace_list.pth", 89 | "mssplace-i": "nclt_mssplace_i.pth", 90 | "minkloc-multimodal": "nclt_minkloc_multimodal.pth", 91 | "minkloc3dv2": "nclt_minkloc3dv2.pth", 92 | }, 93 | } 94 | 95 | SENSOR_SETUPS = { 96 | "oxford": { 97 | "mssplace-li": [ 98 | "pointcloud_lidar", 99 | "image_stereo_centre", 100 | "image_mono_left", 101 | "image_mono_rear", 102 | "image_mono_right" 103 | ], 104 | "mssplace-lis": [ 105 | "pointcloud_lidar", 106 | "image_stereo_centre", 107 | "image_mono_left", 108 | "image_mono_rear", 109 | "image_mono_right", 110 | "mask_stereo_centre", 111 | "mask_mono_left", 112 | "mask_mono_rear", 113 | "mask_mono_right", 114 | ], 115 | "mssplace-lit": [ 116 | "pointcloud_lidar", 117 | "image_stereo_centre", 118 | "image_mono_left", 119 | "image_mono_rear", 120 | "image_mono_right", 121 | "text_stereo_centre", 122 | "text_mono_left", 123 | "text_mono_rear", 124 | "text_mono_right", 125 | ], 126 | "mssplace-list": [ 127 | "pointcloud_lidar", 128 | "image_stereo_centre", 129 | "image_mono_left", 130 | 
"image_mono_rear", 131 | "image_mono_right", 132 | "mask_stereo_centre", 133 | "mask_mono_left", 134 | "mask_mono_rear", 135 | "mask_mono_right", 136 | "text_stereo_centre", 137 | "text_mono_left", 138 | "text_mono_rear", 139 | "text_mono_right", 140 | ], 141 | }, 142 | "nclt": { 143 | "mssplace-li": [ 144 | "pointcloud_lidar", 145 | "image_Cam1", 146 | "image_Cam2", 147 | "image_Cam3", 148 | "image_Cam4", 149 | "image_Cam5" 150 | ], 151 | "mssplace-lis": [ 152 | "pointcloud_lidar", 153 | "image_Cam1", 154 | "image_Cam2", 155 | "image_Cam3", 156 | "image_Cam4", 157 | "image_Cam5", 158 | "mask_Cam1", 159 | "mask_Cam2", 160 | "mask_Cam3", 161 | "mask_Cam4", 162 | "mask_Cam5" 163 | ], 164 | "mssplace-lit": [ 165 | "pointcloud_lidar", 166 | "image_Cam1", 167 | "image_Cam2", 168 | "image_Cam3", 169 | "image_Cam4", 170 | "image_Cam5", 171 | "text_Cam1", 172 | "text_Cam2", 173 | "text_Cam3", 174 | "text_Cam4", 175 | "text_Cam5" 176 | ], 177 | "mssplace-list": [ 178 | "pointcloud_lidar", 179 | "image_Cam1", 180 | "image_Cam2", 181 | "image_Cam3", 182 | "image_Cam4", 183 | "image_Cam5", 184 | "mask_Cam1", 185 | "mask_Cam2", 186 | "mask_Cam3", 187 | "mask_Cam4", 188 | "mask_Cam5", 189 | "text_Cam1", 190 | "text_Cam2", 191 | "text_Cam3", 192 | "text_Cam4", 193 | "text_Cam5" 194 | ], 195 | "mssplace-i": [ 196 | "image_Cam1", 197 | "image_Cam2", 198 | "image_Cam3", 199 | "image_Cam4", 200 | "image_Cam5", 201 | ], 202 | "minkloc-multimodal": [ 203 | "pointcloud_lidar", 204 | "image_Cam5", 205 | ], 206 | "minkloc3dv2": [ 207 | "pointcloud_lidar", 208 | ], 209 | } 210 | } 211 | 212 | 213 | def setup_logging(verbose: bool = False) -> None: 214 | """ 215 | Configure loguru logging for the script. 216 | 217 | Args: 218 | verbose: If True, set logging level to DEBUG, otherwise INFO 219 | """ 220 | # Remove default logger 221 | logger.remove() 222 | 223 | # Configure loguru with appropriate level 224 | log_level = "DEBUG" if verbose else "INFO" 225 | logger.add( 226 | sys.stdout, 227 | level=log_level, 228 | format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}" 229 | ) 230 | 231 | 232 | def validate_paths(datasets_dir: Path, checkpoint_dir: Path, config_dir: Path) -> None: 233 | """ 234 | Validate that all required directories exist. 235 | 236 | Args: 237 | datasets_dir: Path to datasets directory 238 | checkpoint_dir: Path to checkpoints directory 239 | config_dir: Path to configs directory 240 | 241 | Raises: 242 | FileNotFoundError: If any required directory is missing 243 | """ 244 | if not datasets_dir.exists(): 245 | raise FileNotFoundError(f"Datasets directory does not exist: {datasets_dir}") 246 | if not checkpoint_dir.exists(): 247 | raise FileNotFoundError(f"Checkpoints directory does not exist: {checkpoint_dir}") 248 | if not config_dir.exists(): 249 | raise FileNotFoundError(f"Configs directory does not exist: {config_dir}") 250 | 251 | 252 | def get_dataset_path(dataset_name: str, datasets_dir: Path) -> Path: 253 | """ 254 | Get the specific dataset path based on dataset name. 
255 | 256 | Args: 257 | dataset_name: Name of the dataset ('oxford' or 'nclt') 258 | datasets_dir: Base datasets directory path 259 | 260 | Returns: 261 | Path to the specific dataset directory 262 | """ 263 | if dataset_name == "oxford": 264 | return datasets_dir / "pnvlad_oxford_robotcar" 265 | elif dataset_name == "nclt": 266 | return datasets_dir / "NCLT_preprocessed" 267 | else: 268 | raise ValueError(f"Unknown dataset: {dataset_name}") 269 | 270 | 271 | def load_checkpoint(checkpoint_path: Path, device: str = "cpu") -> dict: 272 | """ 273 | Load model checkpoint from file. 274 | 275 | Args: 276 | checkpoint_path: Path to checkpoint file 277 | device: Device to load checkpoint to 278 | 279 | Returns: 280 | Dictionary containing model state dict 281 | 282 | Raises: 283 | FileNotFoundError: If checkpoint file doesn't exist 284 | """ 285 | if not checkpoint_path.exists(): 286 | raise FileNotFoundError(f"Checkpoint not found at {checkpoint_path}") 287 | 288 | logger.info(f"Loading checkpoint from: {checkpoint_path}") 289 | checkpoint = torch.load(checkpoint_path, map_location=device) 290 | 291 | # Handle different checkpoint formats 292 | if "model_state_dict" in checkpoint: 293 | checkpoint = checkpoint["model_state_dict"] 294 | 295 | logger.info(f"Checkpoint loaded with {len(checkpoint.keys())} parameter groups") 296 | return checkpoint 297 | 298 | 299 | def load_model_config(config_path: Path) -> OmegaConf: 300 | """ 301 | Load model configuration from YAML file. 302 | 303 | Args: 304 | config_path: Path to configuration file 305 | 306 | Returns: 307 | OmegaConf configuration object 308 | 309 | Raises: 310 | FileNotFoundError: If config file doesn't exist 311 | """ 312 | if not config_path.exists(): 313 | raise FileNotFoundError(f"Config not found at {config_path}") 314 | 315 | logger.info(f"Loading config from: {config_path}") 316 | config = OmegaConf.load(config_path) 317 | 318 | # Log config details for reproducibility 319 | config_dict = OmegaConf.to_container(config, resolve=True) 320 | logger.debug(f"Model configuration: {config_dict}") 321 | 322 | return config 323 | 324 | 325 | def create_dataset(dataset_name: str, data_dir: Path, sensor_setup: list[str]) -> torch.utils.data.Dataset: 326 | """ 327 | Create dataset instance based on dataset name with specified sensor setup. 328 | 329 | Always uses text-enabled dataset classes (*DatasetWithText) to ensure 330 | compatibility with all model variants, including text-based models. 
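For example, "minkloc3dv2" on NCLT loads only ["pointcloud_lidar"], while "mssplace-list" additionally loads the image_*, mask_* and text_* entries listed in SENSOR_SETUPS above.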
331 | 332 | Args: 333 | dataset_name: Name of dataset ('oxford' or 'nclt') 334 | data_dir: Path to dataset directory 335 | sensor_setup: List of sensors/modalities to load 336 | 337 | Returns: 338 | Dataset instance ready for testing (text-enabled) 339 | """ 340 | logger.info(f"Creating {dataset_name} dataset from: {data_dir}") 341 | logger.debug(f"Sensor setup: {sensor_setup}") 342 | 343 | if dataset_name == "oxford": 344 | dataset = OxfordDatasetWithText( 345 | dataset_root=data_dir, 346 | subset="test", 347 | data_to_load=sensor_setup, 348 | pointcloud_quantization_size=0.01, 349 | ) 350 | elif dataset_name == "nclt": 351 | dataset = NCLTDatasetWithText( 352 | dataset_root=data_dir, 353 | subset="test", 354 | data_to_load=sensor_setup, 355 | ) 356 | else: 357 | raise ValueError(f"Unknown dataset: {dataset_name}") 358 | 359 | logger.info(f"Dataset created with {len(dataset)} samples") 360 | return dataset 361 | 362 | 363 | def evaluate_model( 364 | model: torch.nn.Module, 365 | dataloader: DataLoader, 366 | device: str, 367 | distance_threshold: float = 25.0, 368 | memory_batch_size: int | None = None, 369 | verbose: bool = True, 370 | ) -> tuple[dict[str, float], RetrievalResultsCollection]: 371 | """ 372 | Evaluate model performance using comprehensive ModelTester. 373 | 374 | This function leverages the advanced ModelTester class to provide detailed 375 | place recognition analysis beyond simple aggregate metrics. It supports 376 | memory-efficient evaluation and returns comprehensive results suitable 377 | for research analysis and reproducibility. 378 | 379 | Args: 380 | model: PyTorch model to evaluate 381 | dataloader: DataLoader for test data 382 | device: Device to run evaluation on 383 | distance_threshold: Distance threshold for positive matches (meters) 384 | memory_batch_size: If specified, compute distance matrix in batches 385 | to reduce peak memory usage. Useful for large datasets. 386 | verbose: Whether to show detailed progress information 387 | 388 | Returns: 389 | Tuple containing: 390 | - dict: Aggregate metrics (recall_at_n, recall_at_one_percent, etc.) 391 | - RetrievalResultsCollection: Detailed per-query results for analysis 392 | 393 | Note: 394 | The memory_batch_size parameter trades computation speed for memory 395 | efficiency. For datasets with >10k samples, consider using batch 396 | sizes of 1000-5000 depending on available RAM. 
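Example (illustrative sketch; assumes model and dataloader were prepared as in main() below):
    >>> metrics, results = evaluate_model(model, dataloader, device="cuda")
    >>> print(metrics["recall_at_one_percent"])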
397 | """ 398 | logger.info("Starting comprehensive model evaluation with ModelTester...") 399 | 400 | # Initialize ModelTester with memory-efficient settings 401 | tester = ModelTester( 402 | model=model, 403 | dataloader=dataloader, 404 | dist_thresh=distance_threshold, 405 | at_n=25, # Standard benchmark value 406 | device=device, 407 | verbose=verbose, 408 | batch_size=memory_batch_size, # Enable memory-efficient computation 409 | ) 410 | 411 | # Run comprehensive evaluation 412 | results_collection = tester.run() 413 | 414 | # Extract aggregate metrics for backward compatibility 415 | aggregate_metrics = results_collection.aggregate_metrics() 416 | 417 | # Convert to format expected by existing display logic 418 | recall_at_n_array = aggregate_metrics["recall_at_n"] 419 | recall_at_one_percent = aggregate_metrics["recall_at_one_percent"] 420 | 421 | # For top1_distance, use the aggregate value or compute if None 422 | top1_distance = aggregate_metrics.get("top1_distance") 423 | if top1_distance is None: 424 | # Fallback: compute mean embedding distance of correct top-1 matches 425 | top1_distances = [] 426 | for result in results_collection.results: 427 | if result.queries_with_matches > 0 and result.top1_distance is not None: 428 | top1_distances.append(result.top1_distance) 429 | top1_distance = sum(top1_distances) / len(top1_distances) if top1_distances else 0.0 430 | 431 | # Create backward-compatible metrics dict 432 | backward_compatible_metrics = { 433 | "recall_at_n": recall_at_n_array, 434 | "recall_at_one_percent": recall_at_one_percent, 435 | "mean_top1_descriptor_distance": top1_distance, 436 | } 437 | 438 | logger.info("Comprehensive model evaluation completed") 439 | logger.info(f"Processed {results_collection.num_pairs} track pairs with " 440 | f"{results_collection.num_queries} total queries") 441 | 442 | return backward_compatible_metrics, results_collection 443 | 444 | 445 | def save_evaluation_results( 446 | results_collection: RetrievalResultsCollection, 447 | dataset_name: str, 448 | model_name: str, 449 | results_dir: Path, 450 | additional_metadata: dict | None = None, 451 | ) -> Path: 452 | """ 453 | Save detailed evaluation results to disk for later analysis. 454 | 455 | Creates a structured filename and saves both the raw results collection 456 | and additional metadata for research reproducibility. 457 | 458 | Args: 459 | results_collection: Detailed results from ModelTester 460 | dataset_name: Name of the evaluated dataset 461 | model_name: Name of the evaluated model 462 | results_dir: Directory to save results 463 | additional_metadata: Optional dict with extra information to save 464 | 465 | Returns: 466 | Path to the saved results file 467 | 468 | Note: 469 | Results are saved in JSON format with timestamp for uniqueness. 470 | The file includes both detailed per-query results and aggregate metrics. 
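Example (illustrative; results is a RetrievalResultsCollection):
    >>> path = save_evaluation_results(results, "oxford", "mssplace-lit",
    ...                                Path("./evaluation_results"))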
471 | """ 472 | from datetime import datetime 473 | 474 | # Create results directory if it doesn't exist 475 | results_dir.mkdir(parents=True, exist_ok=True) 476 | 477 | # Generate timestamped filename 478 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 479 | filename = f"{dataset_name}_{model_name}_results_{timestamp}.json" 480 | results_path = results_dir / filename 481 | 482 | logger.info(f"Saving detailed evaluation results to: {results_path}") 483 | 484 | # Save the results collection (includes built-in JSON serialization) 485 | results_collection.save(str(results_path)) 486 | 487 | # If metadata provided, save it alongside 488 | if additional_metadata: 489 | metadata_path = results_dir / f"{dataset_name}_{model_name}_metadata_{timestamp}.json" 490 | import json 491 | with open(metadata_path, 'w') as f: 492 | json.dump(additional_metadata, f, indent=2) 493 | logger.info(f"Saved evaluation metadata to: {metadata_path}") 494 | 495 | return results_path 496 | 497 | 498 | def format_percentage(value: float) -> str: 499 | """ 500 | Format a decimal value as a percentage with 2 decimal places (truncated, not rounded). 501 | 502 | Args: 503 | value: Decimal value between 0 and 1 504 | 505 | Returns: 506 | Formatted percentage string 507 | """ 508 | # Truncate to 2 decimal places without rounding, as in original notebook 509 | integer_part = int(value * 100) 510 | decimal_part = int((value * 100) % 1 * 100) 511 | return f"{integer_part}.{decimal_part:02d}%" 512 | 513 | 514 | def main() -> None: 515 | """ 516 | Main function that orchestrates the checkpoint testing process. 517 | """ 518 | parser = argparse.ArgumentParser( 519 | description="Test MSSPlace model checkpoints on Oxford and NCLT datasets", 520 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 521 | ) 522 | 523 | parser.add_argument( 524 | "--dataset", 525 | type=str, 526 | choices=DATASET_CHOICES, 527 | required=True, 528 | help="Dataset to test on" 529 | ) 530 | 531 | parser.add_argument( 532 | "--model", 533 | type=str, 534 | choices=MODEL_CHOICES, 535 | required=True, 536 | help="Model variant to test" 537 | ) 538 | 539 | parser.add_argument( 540 | "--datasets-dir", 541 | type=Path, 542 | default=Path("/home/docker_mssplace/Datasets"), 543 | help="Path to datasets directory" 544 | ) 545 | 546 | parser.add_argument( 547 | "--checkpoints-dir", 548 | type=Path, 549 | default=Path(__file__).parent.parent.parent / "checkpoints", 550 | help="Path to checkpoints directory" 551 | ) 552 | 553 | parser.add_argument( 554 | "--configs-dir", 555 | type=Path, 556 | default=Path(__file__).parent.parent.parent / "configs" / "model", 557 | help="Path to model configs directory" 558 | ) 559 | 560 | parser.add_argument( 561 | "--batch-size", 562 | type=int, 563 | default=32, 564 | help="Batch size for evaluation" 565 | ) 566 | 567 | parser.add_argument( 568 | "--num-workers", 569 | type=int, 570 | default=4, 571 | help="Number of dataloader workers" 572 | ) 573 | 574 | parser.add_argument( 575 | "--distance-threshold", 576 | type=float, 577 | default=25.0, 578 | help="Distance threshold for positive matches (meters)" 579 | ) 580 | 581 | parser.add_argument( 582 | "--device", 583 | type=str, 584 | default="cuda" if torch.cuda.is_available() else "cpu", 585 | help="Device to run evaluation on" 586 | ) 587 | 588 | parser.add_argument( 589 | "--verbose", 590 | action="store_true", 591 | help="Enable verbose logging" 592 | ) 593 | 594 | parser.add_argument( 595 | "--memory-batch-size", 596 | type=int, 597 | default=None, 598 | 
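# Added note: None keeps the full distance matrix in memory; see the
# memory_batch_size parameter of evaluate_model() for the trade-off.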
help="Batch size for memory-efficient distance computation (reduces memory usage)" 599 | ) 600 | 601 | parser.add_argument( 602 | "--save-results", 603 | action="store_true", 604 | help="Save detailed evaluation results to JSON file for later analysis" 605 | ) 606 | 607 | parser.add_argument( 608 | "--results-dir", 609 | type=Path, 610 | default=Path("./evaluation_results"), 611 | help="Directory to save detailed evaluation results" 612 | ) 613 | 614 | args = parser.parse_args() 615 | 616 | # Setup logging 617 | setup_logging(args.verbose) 618 | logger.info(f"Starting checkpoint testing for {args.model} on {args.dataset}") 619 | 620 | try: 621 | # Validate all required paths exist 622 | validate_paths(args.datasets_dir, args.checkpoints_dir, args.configs_dir) 623 | 624 | # Get specific paths for this dataset/model combination 625 | checkpoint_name = CHECKPOINT_NAMES[args.dataset][args.model] 626 | config_name = MODEL_CONFIG_NAMES[args.model] 627 | sensor_setup = SENSOR_SETUPS[args.dataset][args.model] 628 | 629 | checkpoint_path = args.checkpoints_dir / checkpoint_name 630 | config_path = args.configs_dir / config_name 631 | data_dir = get_dataset_path(args.dataset, args.datasets_dir) 632 | 633 | # Load checkpoint and config 634 | checkpoint = load_checkpoint(checkpoint_path, device="cpu") 635 | config = load_model_config(config_path) 636 | 637 | # Initialize model and load weights 638 | logger.info("Initializing model...") 639 | model = instantiate(config) 640 | model.load_state_dict(checkpoint, strict=True) 641 | 642 | num_parameters = sum(p.numel() for p in model.parameters()) 643 | logger.info(f"Model loaded successfully with {num_parameters:,} parameters") 644 | 645 | # Create dataset and dataloader 646 | dataset = create_dataset(args.dataset, data_dir, sensor_setup) 647 | dataloader = DataLoader( 648 | dataset=dataset, 649 | batch_size=args.batch_size, 650 | shuffle=False, 651 | num_workers=args.num_workers, 652 | pin_memory=True if args.device == "cuda" else False, 653 | collate_fn=dataset.collate_fn, 654 | drop_last=False, 655 | ) 656 | 657 | # Evaluate model using comprehensive ModelTester 658 | metrics, results_collection = evaluate_model( 659 | model=model, 660 | dataloader=dataloader, 661 | device=args.device, 662 | distance_threshold=args.distance_threshold, 663 | memory_batch_size=args.memory_batch_size, 664 | verbose=args.verbose, 665 | ) 666 | 667 | # Extract metrics for display (backward compatibility) 668 | recall_at_n = metrics["recall_at_n"] 669 | recall_at_one_percent = metrics["recall_at_one_percent"] 670 | mean_top1_descriptor_distance = metrics["mean_top1_descriptor_distance"] 671 | 672 | # Optionally save detailed results for research analysis 673 | if args.save_results: 674 | evaluation_metadata = { 675 | "dataset": args.dataset, 676 | "model": args.model, 677 | "device": args.device, 678 | "distance_threshold": args.distance_threshold, 679 | "batch_size": args.batch_size, 680 | "num_workers": args.num_workers, 681 | "memory_batch_size": args.memory_batch_size, 682 | "model_parameters": num_parameters, 683 | "dataset_size": len(dataset), 684 | "evaluation_timestamp": datetime.now().isoformat(), 685 | } 686 | 687 | results_path = save_evaluation_results( 688 | results_collection=results_collection, 689 | dataset_name=args.dataset, 690 | model_name=args.model, 691 | results_dir=args.results_dir, 692 | additional_metadata=evaluation_metadata, 693 | ) 694 | 695 | logger.info(f"Detailed results saved for future analysis: {results_path}") 696 | 697 | # Display results 
with enhanced information 698 | print("\n" + "="*60) 699 | print("COMPREHENSIVE EVALUATION RESULTS") 700 | print("="*60) 701 | print(f"Dataset: {args.dataset}") 702 | print(f"Model: {args.model}") 703 | print(f"Device: {args.device}") 704 | print(f"Distance threshold: {args.distance_threshold}m") 705 | if args.memory_batch_size: 706 | print(f"Memory batch size: {args.memory_batch_size} (memory-efficient mode)") 707 | print("-"*60) 708 | 709 | # Traditional metrics (backward compatibility) 710 | print("PLACE RECOGNITION METRICS:") 711 | print(f" AR@1 = {format_percentage(recall_at_n[0])}") 712 | print(f" AR@1% = {format_percentage(recall_at_one_percent)}") 713 | print(f" Mean top-1 descriptor distance: {mean_top1_descriptor_distance:.6f}") 714 | 715 | # Enhanced insights from detailed analysis 716 | print("\nDETAILED ANALYSIS:") 717 | aggregate_metrics = results_collection.aggregate_metrics() 718 | print(f" Total track pairs evaluated: {results_collection.num_pairs}") 719 | print(f" Total query samples: {results_collection.num_queries}") 720 | print(f" Queries with ground truth matches: {aggregate_metrics['queries_with_matches']}") 721 | print(f" Overall accuracy (correct top-1): {format_percentage(aggregate_metrics['overall_accuracy'])}") 722 | 723 | # Additional recall metrics for research insight 724 | if len(recall_at_n) >= 5: 725 | print(f" AR@5 = {format_percentage(recall_at_n[4])}") 726 | if len(recall_at_n) >= 10: 727 | print(f" AR@10 = {format_percentage(recall_at_n[9])}") 728 | 729 | print("="*60) 730 | 731 | logger.info("Checkpoint testing completed successfully") 732 | 733 | except Exception as e: 734 | logger.error(f"Error during checkpoint testing: {e}") 735 | sys.exit(1) 736 | 737 | 738 | if __name__ == "__main__": 739 | main() 740 | -------------------------------------------------------------------------------- /src/mssplace/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmelekhin/MSSPlace/9ac48de75ee6a4ea01dac99d336d3a529bd73b61/src/mssplace/__init__.py -------------------------------------------------------------------------------- /src/mssplace/datasets.py: -------------------------------------------------------------------------------- 1 | """Datasets implementation.""" 2 | from pathlib import Path 3 | from typing import Any, Dict, List, Literal, Optional, Tuple, Union 4 | 5 | import cv2 6 | import MinkowskiEngine as ME # type: ignore 7 | import numpy as np 8 | import torch 9 | from torch import Tensor 10 | from opr.datasets.base import BasePlaceRecognitionDataset 11 | from opr.utils import cartesian_to_spherical 12 | 13 | 14 | def collate_data_dict( 15 | dataset: BasePlaceRecognitionDataset, data_list: List[Dict[str, Tensor]] 16 | ) -> Dict[str, Tensor]: 17 | """Pack input data list into batch.""" 18 | result: Dict[str, Tensor] = {} 19 | result["idxs"] = torch.stack([e["idx"] for e in data_list], dim=0) 20 | for data_key in data_list[0].keys(): 21 | if data_key == "idx": 22 | continue 23 | elif data_key == "utm": 24 | result["utms"] = torch.stack([e["utm"] for e in data_list], dim=0) 25 | elif data_key.startswith("image_"): 26 | result[f"images_{data_key[6:]}"] = torch.stack([e[data_key] for e in data_list]) 27 | elif data_key.startswith("mask_"): 28 | result[f"masks_{data_key[5:]}"] = torch.stack([e[data_key] for e in data_list]) 29 | elif data_key.startswith("text_"): 30 | result[f"texts_{data_key[5:]}"] = torch.stack([e[data_key] for e in data_list]) 31 | elif data_key == 
"pointcloud_lidar_coords": 32 | coords_list = [e["pointcloud_lidar_coords"] for e in data_list] 33 | feats_list = [e["pointcloud_lidar_feats"] for e in data_list] 34 | n_points = [int(e.shape[0]) for e in coords_list] 35 | coords_tensor = torch.cat(coords_list, dim=0).unsqueeze(0) # (1,batch_size*n_points,3) 36 | if dataset.pointcloud_set_transform is not None: 37 | # Apply the same transformation on all dataset elements 38 | coords_tensor = dataset.pointcloud_set_transform(coords_tensor) 39 | coords_list = torch.split(coords_tensor.squeeze(0), split_size_or_sections=n_points, dim=0) 40 | quantized_coords_list = [] 41 | quantized_feats_list = [] 42 | for coords, feats in zip(coords_list, feats_list): 43 | quantized_coords, quantized_feats = ME.utils.sparse_quantize( 44 | coordinates=coords, 45 | features=feats, 46 | quantization_size=dataset._pointcloud_quantization_size, 47 | ) 48 | quantized_coords_list.append(quantized_coords) 49 | quantized_feats_list.append(quantized_feats) 50 | 51 | result["pointclouds_lidar_coords"] = ME.utils.batched_coordinates(quantized_coords_list) 52 | result["pointclouds_lidar_feats"] = torch.cat(quantized_feats_list) 53 | elif data_key == "pointcloud_lidar_feats": 54 | continue 55 | else: 56 | raise ValueError(f"Unknown data key: {data_key!r}") 57 | return result 58 | 59 | 60 | class NCLTDatasetWithText(BasePlaceRecognitionDataset): 61 | """NCLT dataset implementation with text embeddings.""" 62 | 63 | _images_dirname: str 64 | _masks_dirname: str 65 | _pointclouds_dirname: str 66 | _pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]] 67 | _max_point_distance: Optional[float] 68 | _spherical_coords: bool 69 | _use_intensity_values: bool 70 | _valid_data: Tuple[str, ...] = ( 71 | "image_Cam0", 72 | "image_Cam1", 73 | "image_Cam2", 74 | "image_Cam3", 75 | "image_Cam4", 76 | "image_Cam5", 77 | "pointcloud_lidar", 78 | "mask_Cam0", 79 | "mask_Cam1", 80 | "mask_Cam2", 81 | "mask_Cam3", 82 | "mask_Cam4", 83 | "mask_Cam5", 84 | "text_Cam0", 85 | "text_Cam1", 86 | "text_Cam2", 87 | "text_Cam3", 88 | "text_Cam4", 89 | "text_Cam5", 90 | ) 91 | 92 | def __init__( 93 | self, 94 | dataset_root: Union[str, Path], 95 | subset: Literal["train", "val", "test"], 96 | data_to_load: Union[str, Tuple[str, ...]], 97 | positive_threshold: float = 10.0, 98 | negative_threshold: float = 50.0, 99 | images_dirname: str = "images_small", 100 | masks_dirname: str = "segmentation_masks_small", 101 | text_embeddings_dirname: str = "clip-vit-base-patch32", 102 | pointclouds_dirname: str = "velodyne_data", 103 | pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]] = 0.5, 104 | max_point_distance: Optional[float] = None, 105 | spherical_coords: bool = False, 106 | use_intensity_values: bool = False, 107 | image_transform: Optional[Any] = None, 108 | semantic_transform: Optional[Any] = None, 109 | pointcloud_transform: Optional[Any] = None, 110 | pointcloud_set_transform: Optional[Any] = None, 111 | ) -> None: 112 | """NCLT dataset implementation. 113 | 114 | Args: 115 | dataset_root (Union[str, Path]): Path to the dataset root directory. 116 | subset (Literal["train", "val", "test"]): Current subset to load. Defaults to "train". 117 | data_to_load (Union[str, Tuple[str, ...]]): The list of data to load. 118 | Check the documentation for the list of available data. 119 | positive_threshold (float): The UTM distance threshold value for positive samples. 120 | Defaults to 10.0. 
121 | negative_threshold (float): The UTM distance threshold value for negative samples. 122 | Defaults to 50.0. 123 | images_dirname (str): Images directory name. It should be specified explicitly 124 | if custom preprocessing was done. Defaults to "images_small". 125 | masks_dirname (str): Masks directory name. It should be specified explicitly 126 | if custom preprocessing was done. Defaults to "segmentation_masks_small". 127 | text_embeddings_dirname (str): Text embeddings directory name. Defaults to "clip-vit-base-patch32". 128 | pointclouds_dirname (str): Point clouds directory name. It should be specified 129 | explicitly if custom preprocessing was done. Defaults to "velodyne_data". 130 | pointcloud_quantization_size (float, optional): The quantization size for point clouds. 131 | Defaults to 0.5. 132 | max_point_distance (float, optional): The maximum distance of points from the origin. 133 | Defaults to None. 134 | spherical_coords (bool): Whether to use spherical coordinates for point clouds. 135 | Defaults to False. 136 | use_intensity_values (bool): Whether to use intensity values for point clouds. Defaults to False. 137 | image_transform (Any, optional): Images transform. If None, DefaultImageTransform will be used. 138 | Defaults to None. 139 | semantic_transform (Any, optional): Semantic masks transform. If None, DefaultSemanticTransform 140 | will be used. Defaults to None. 141 | pointcloud_transform (Any, optional): Point clouds transform. If None, DefaultCloudTransform 142 | will be used. Defaults to None. 143 | pointcloud_set_transform (Any, optional): Point clouds set transform. If None, 144 | DefaultCloudSetTransform will be used. Defaults to None. 145 | 146 | Raises: 147 | ValueError: If data_to_load contains invalid data source names. 148 | FileNotFoundError: If images, masks or pointclouds directory does not exist. 149 | """ 150 | super().__init__( 151 | dataset_root, 152 | subset, 153 | data_to_load, 154 | positive_threshold, 155 | negative_threshold, 156 | image_transform, 157 | semantic_transform, 158 | pointcloud_transform, 159 | pointcloud_set_transform, 160 | ) 161 | 162 | if subset == "test": 163 | self.dataset_df["in_query"] = True # for compatibility with Oxford Dataset 164 | 165 | if any(elem not in self._valid_data for elem in self.data_to_load): 166 | raise ValueError(f"Invalid data_to_load argument. Valid data list: {self._valid_data!r}") 167 | 168 | _track_name = self.dataset_df.iloc[0]["track"] 169 | 170 | if any(elem.startswith("image") for elem in self.data_to_load): 171 | self._images_dirname = images_dirname 172 | if not (self.dataset_root / _track_name / self._images_dirname).exists(): 173 | raise FileNotFoundError(f"Images directory {self._images_dirname!r} does not exist.") 174 | 175 | if any(elem.startswith("mask") for elem in self.data_to_load): 176 | self._masks_dirname = masks_dirname 177 | if not (self.dataset_root / _track_name / self._masks_dirname).exists(): 178 | raise FileNotFoundError(f"Masks directory {self._masks_dirname!r} does not exist.") 179 | 180 | if any(elem.startswith("text") for elem in self.data_to_load): 181 | self._text_embeddings_dirname = text_embeddings_dirname 182 | if not (self.dataset_root / _track_name / self._text_embeddings_dirname).exists(): 183 | raise FileNotFoundError( 184 | f"Text embeddings directory {self._text_embeddings_dirname!r} does not exist."
185 | ) 186 | 187 | if "pointcloud_lidar" in self.data_to_load: 188 | self._pointclouds_dirname = pointclouds_dirname 189 | if not (self.dataset_root / _track_name / self._pointclouds_dirname).exists(): 190 | raise FileNotFoundError( 191 | f"Pointclouds directory {self._pointclouds_dirname!r} does not exist." 192 | ) 193 | 194 | self._pointcloud_quantization_size = pointcloud_quantization_size 195 | self._max_point_distance = max_point_distance 196 | self._spherical_coords = spherical_coords 197 | self._use_intensity_values = use_intensity_values 198 | 199 | def __getitem__(self, idx: int) -> Dict[str, Tensor]: # noqa: D105 200 | row = self.dataset_df.iloc[idx] 201 | data = {"idx": torch.tensor(idx, dtype=int)} 202 | data["utm"] = torch.tensor(row[["northing", "easting"]].to_numpy(dtype=np.float64)) 203 | track_dir = self.dataset_root / str(row["track"]) 204 | 205 | for data_source in self.data_to_load: 206 | if data_source.startswith("image_"): 207 | cam_name = data_source[6:] # remove "image_" prefix 208 | image_ts = int(row["image"]) 209 | im_filepath = track_dir / self._images_dirname / f"{cam_name}" / f"{image_ts}.png" 210 | im = cv2.imread(str(im_filepath)) 211 | im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 212 | im = self.image_transform(im) 213 | data[data_source] = im 214 | elif data_source.startswith("mask_"): 215 | cam_name = data_source[5:] # remove "mask_" prefix 216 | image_ts = int(row["image"]) 217 | mask_filepath = track_dir / self._masks_dirname / f"{cam_name}" / f"{image_ts}.png" 218 | mask = cv2.imread(str(mask_filepath), cv2.IMREAD_UNCHANGED) 219 | mask = self.semantic_transform(mask) 220 | data[data_source] = mask 221 | elif data_source.startswith("text_"): 222 | cam_name = data_source[5:] # remove "text_" prefix 223 | image_ts = int(row["image"]) 224 | text_filepath = track_dir / self._text_embeddings_dirname / f"{cam_name}" / f"{image_ts}.pt" 225 | text_embedding = torch.load(text_filepath, map_location="cpu").squeeze() 226 | data[data_source] = text_embedding 227 | elif data_source == "pointcloud_lidar": 228 | pc_filepath = track_dir / self._pointclouds_dirname / f"{row['pointcloud']}.bin" 229 | pointcloud = self._load_pc(pc_filepath) 230 | data[f"{data_source}_coords"] = self.pointcloud_transform(pointcloud[:, :3]) 231 | if self._use_intensity_values: 232 | data[f"{data_source}_feats"] = pointcloud[:, 3].unsqueeze(1) 233 | else: 234 | data[f"{data_source}_feats"] = torch.ones_like(pointcloud[:, :1]) 235 | 236 | return data 237 | 238 | def _load_pc(self, filepath: Union[str, Path]) -> Tensor: 239 | if self._use_intensity_values: 240 | raise NotImplementedError("Intensity values are not supported yet.") 241 | pc = np.fromfile(filepath, dtype=np.float32).reshape(-1, 3) 242 | if self._max_point_distance is not None: 243 | pc = pc[np.linalg.norm(pc, axis=1) < self._max_point_distance] 244 | if self._spherical_coords: 245 | pc = cartesian_to_spherical(pc, dataset_name="nclt") 246 | pc_tensor = torch.tensor(pc, dtype=torch.float) 247 | return pc_tensor 248 | 249 | def collate_fn(self, data_list: List[Dict[str, Tensor]]) -> Dict[str, Tensor]: 250 | """Pack input data list into batch. 251 | 252 | Args: 253 | data_list (List[Dict[str, Tensor]]): batch data list generated by DataLoader. 254 | 255 | Returns: 256 | Dict[str, Tensor]: dictionary of batched data. 
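Example (a minimal sketch, assuming the dataset was created with data_to_load=("image_Cam1", "pointcloud_lidar"); names and sizes are purely illustrative):

    loader = DataLoader(dataset, batch_size=4, collate_fn=dataset.collate_fn)
    batch = next(iter(loader))
    sorted(batch.keys())
    # ['idxs', 'images_Cam1', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'utms']

Note that singular per-sample keys are pluralized in the batch (e.g. "image_Cam1" becomes "images_Cam1").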
257 | """ 258 | return collate_data_dict(self, data_list) 259 | 260 | 261 | class OxfordDatasetWithText(BasePlaceRecognitionDataset): 262 | """PointNetVLAD Oxford RobotCar dataset implementation with text embeddings.""" 263 | 264 | _images_dirname: str 265 | _masks_dirname: str 266 | _pointclouds_dirname: str 267 | _pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]] 268 | _max_point_distance: Optional[float] 269 | _spherical_coords: bool 270 | _valid_data: Tuple[str, ...] = ( 271 | "image_stereo_centre", 272 | "image_mono_left", 273 | "image_mono_rear", 274 | "image_mono_right", 275 | "pointcloud_lidar", 276 | "mask_stereo_centre", 277 | "mask_mono_left", 278 | "mask_mono_rear", 279 | "mask_mono_right", 280 | "text_stereo_centre", 281 | "text_mono_left", 282 | "text_mono_rear", 283 | "text_mono_right", 284 | ) 285 | 286 | def __init__( 287 | self, 288 | dataset_root: Union[str, Path], 289 | subset: Literal["train", "val", "test"], 290 | data_to_load: Union[str, Tuple[str, ...]], 291 | positive_threshold: float = 10.0, 292 | negative_threshold: float = 50.0, 293 | images_dirname: str = "images_small", 294 | masks_dirname: str = "segmentation_masks_small", 295 | text_embeddings_dirname: str = "clip-vit-base-patch32", 296 | pointclouds_dirname: Optional[str] = None, 297 | pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]] = 0.01, 298 | max_point_distance: Optional[float] = None, 299 | spherical_coords: bool = False, 300 | image_transform: Optional[Any] = None, 301 | semantic_transform: Optional[Any] = None, 302 | pointcloud_transform: Optional[Any] = None, 303 | pointcloud_set_transform: Optional[Any] = None, 304 | ) -> None: 305 | """Oxford RobotCar dataset implementation. 306 | 307 | Original dataset site: https://robotcar-dataset.robots.ox.ac.uk/ 308 | 309 | We use the preprocessed version of the dataset that was introduced 310 | in PointNetVLAD paper: https://arxiv.org/abs/1804.03492. 311 | 312 | Args: 313 | dataset_root (Union[str, Path]): Path to the dataset root directory. 314 | subset (Literal["train", "val", "test"]): Current subset to load. Defaults to "train". 315 | data_to_load (Union[str, Tuple[str, ...]]): The list of data to load. 316 | Check the documentation for the list of available data. 317 | positive_threshold (float): The UTM distance threshold value for positive samples. 318 | Defaults to 10.0. 319 | negative_threshold (float): The UTM distance threshold value for negative samples. 320 | Defaults to 50.0. 321 | images_dirname (str): Images directory name. It should be specified explicitly 322 | if custom preprocessing was done. Defaults to "images_small". 323 | masks_dirname (str): Masks directory name. It should be specified explicitly 324 | if custom preprocessing was done. Defaults to "segmentation_masks_small". 325 | text_embeddings_dirname (str): Text embeddings directory name. Defaults to "clip-vit-base-patch32". 326 | pointclouds_dirname (Optional[str]): Point clouds directory name. It should be specified 327 | explicitly if custom preprocessing was done. Defaults to None, which sets the dirnames 328 | like in original PointNetVLAD dataset configuration. 329 | pointcloud_quantization_size (float, optional): The quantization size for point clouds. 330 | Defaults to 0.01. 331 | max_point_distance (float, optional): The maximum distance of points from the origin. 332 | Defaults to None. 333 | spherical_coords (bool): Whether to use spherical coordinates for point clouds. 334 | Defaults to False. 
335 | image_transform (Any, optional): Images transform. If None, DefaultImageTransform will be used. 336 | Defaults to None. 337 | semantic_transform (Any, optional): Semantic masks transform. If None, DefaultSemanticTransform 338 | will be used. Defaults to None. 339 | pointcloud_transform (Any, optional): Point clouds transform. If None, DefaultCloudTransform 340 | will be used. Defaults to None. 341 | pointcloud_set_transform (Any, optional): Point clouds set transform. If None, 342 | DefaultCloudSetTransform will be used. Defaults to None. 343 | 344 | Raises: 345 | ValueError: If data_to_load contains invalid data source names. 346 | FileNotFoundError: If images, masks or pointclouds directory does not exist. 347 | """ 348 | super().__init__( 349 | dataset_root, 350 | subset, 351 | data_to_load, 352 | positive_threshold, 353 | negative_threshold, 354 | image_transform, 355 | semantic_transform, 356 | pointcloud_transform, 357 | pointcloud_set_transform, 358 | ) 359 | 360 | if any(elem not in self._valid_data for elem in self.data_to_load): 361 | raise ValueError(f"Invalid data_to_load argument. Valid data list: {self._valid_data!r}") 362 | 363 | _track_name = self.dataset_df.iloc[0]["track"] 364 | 365 | if any(elem.startswith("image") for elem in self.data_to_load): 366 | self._images_dirname = images_dirname 367 | if not (self.dataset_root / _track_name / self._images_dirname).exists(): 368 | raise FileNotFoundError(f"Images directory {self._images_dirname!r} does not exist.") 369 | 370 | if any(elem.startswith("mask") for elem in self.data_to_load): 371 | self._masks_dirname = masks_dirname 372 | if not (self.dataset_root / _track_name / self._masks_dirname).exists(): 373 | raise FileNotFoundError(f"Masks directory {self._masks_dirname!r} does not exist.") 374 | 375 | if any(elem.startswith("text") for elem in self.data_to_load): 376 | self._text_embeddings_dirname = text_embeddings_dirname 377 | if not (self.dataset_root / _track_name / self._text_embeddings_dirname).exists(): 378 | raise FileNotFoundError( 379 | f"Text embeddings directory {self._text_embeddings_dirname!r} does not exist." 380 | ) 381 | 382 | if "pointcloud_lidar" in self.data_to_load: 383 | if pointclouds_dirname is not None: 384 | self._pointclouds_dirname = pointclouds_dirname 385 | elif subset in ("train", "val"): 386 | self._pointclouds_dirname = "pointcloud_20m_10overlap" 387 | else: 388 | self._pointclouds_dirname = "pointcloud_20m" 389 | if not (self.dataset_root / _track_name / self._pointclouds_dirname).exists(): 390 | raise FileNotFoundError( 391 | f"Pointclouds directory {self._pointclouds_dirname!r} does not exist." 
392 | ) 393 | 394 | self._pointcloud_quantization_size = pointcloud_quantization_size 395 | self._max_point_distance = max_point_distance 396 | self._spherical_coords = spherical_coords 397 | 398 | def __getitem__(self, idx: int) -> Dict[str, Tensor]: # noqa: D105 399 | row = self.dataset_df.iloc[idx] 400 | data = {"idx": torch.tensor(idx, dtype=int)} 401 | data["utm"] = torch.tensor(row[["northing", "easting"]].to_numpy(dtype=np.float64)) 402 | track_dir = self.dataset_root / str(row["track"]) 403 | 404 | for data_source in self.data_to_load: 405 | if data_source.startswith("image_"): 406 | cam_name = data_source[6:] # remove "image_" prefix 407 | image_ts = int(row[cam_name]) 408 | im_filepath = track_dir / self._images_dirname / f"{cam_name}" / f"{image_ts}.png" 409 | im = cv2.imread(str(im_filepath)) 410 | im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 411 | im = self.image_transform(im) 412 | data[data_source] = im 413 | elif data_source.startswith("mask_"): 414 | cam_name = data_source[5:] # remove "mask_" prefix 415 | image_ts = int(row[cam_name]) 416 | mask_filepath = track_dir / self._masks_dirname / f"{cam_name}" / f"{image_ts}.png" 417 | mask = cv2.imread(str(mask_filepath), cv2.IMREAD_UNCHANGED) 418 | mask = self.semantic_transform(mask) 419 | data[data_source] = mask 420 | elif data_source.startswith("text_"): 421 | cam_name = data_source[5:] # remove "text_" prefix 422 | image_ts = int(row[cam_name]) 423 | text_filepath = track_dir / self._text_embeddings_dirname / f"{cam_name}" / f"{image_ts}.pt" 424 | text_embedding = torch.load(text_filepath, map_location="cpu").squeeze() 425 | data[data_source] = text_embedding 426 | elif data_source == "pointcloud_lidar": 427 | pc_filepath = track_dir / self._pointclouds_dirname / f"{row['pointcloud']}.bin" 428 | coords = self._load_pc(pc_filepath) 429 | coords = self.pointcloud_transform(coords) 430 | if self._spherical_coords: 431 | raise NotImplementedError("Spherical coords are not implemented yet.") 432 | data[f"{data_source}_coords"] = coords 433 | data[f"{data_source}_feats"] = torch.ones_like(coords[:, :1]) 434 | 435 | return data 436 | 437 | def _load_pc(self, filepath: Union[str, Path]) -> Tensor: 438 | pc = np.fromfile(filepath, dtype=np.float64).reshape(-1, 3) 439 | if self._max_point_distance is not None: 440 | pc = pc[np.linalg.norm(pc, axis=1) < self._max_point_distance] 441 | pc_tensor = torch.tensor(pc, dtype=torch.float) 442 | return pc_tensor 443 | 444 | def collate_fn(self, data_list: List[Dict[str, Tensor]]) -> Dict[str, Tensor]: 445 | """Pack input data list into batch. 446 | 447 | Args: 448 | data_list (List[Dict[str, Tensor]]): batch data list generated by DataLoader. 449 | 450 | Returns: 451 | Dict[str, Tensor]: dictionary of batched data. 
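Example (a minimal sketch, assuming "pointcloud_lidar" was requested in data_to_load; shapes are illustrative):

    loader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collate_fn)
    batch = next(iter(loader))
    # Point clouds are quantized per sample and packed in MinkowskiEngine format:
    batch["pointclouds_lidar_coords"]  # (N_total, 4) int tensor; column 0 is the batch index
    batch["pointclouds_lidar_feats"]   # (N_total, 1) dummy features of ones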
452 | """ 453 | return collate_data_dict(self, data_list) 454 | -------------------------------------------------------------------------------- /src/mssplace/modality_interaction_layers.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import MinkowskiEngine as ME # noqa: N817 4 | import torch 5 | from torch import nn, Tensor 6 | from opr.models.place_recognition.base import ImageModel, SemanticModel, CloudModel 7 | from opr.modules import Concat 8 | 9 | _modalities = ("image", "cloud", "semantic", "text") 10 | 11 | 12 | class LateFusionModel(nn.Module): 13 | """Meta-model for multimodal Place Recognition architectures with late fusion.""" 14 | 15 | def __init__( 16 | self, 17 | image_module: Optional[ImageModel] = None, 18 | semantic_module: Optional[SemanticModel] = None, 19 | cloud_module: Optional[CloudModel] = None, 20 | text_module: Optional[nn.Module] = None, 21 | soc_module: Optional[nn.Module] = None, 22 | fusion_module: Optional[nn.Module] = None, 23 | ) -> None: 24 | """Meta-model for multimodal Place Recognition architectures with late fusion. 25 | 26 | Args: 27 | image_module (ImageModule, optional): Image modality branch. Defaults to None. 28 | semantic_module (SemanticModel, optional): Semantic modality branch. Defaults to None. 29 | cloud_module (CloudModule, optional): Cloud modality branch. Defaults to None. 30 | soc_module (nn.Module, optional): Module to fuse different modalities. 31 | fusion_module (FusionModule, optional): Module to fuse different modalities. 32 | If None, will be set to opr.modules.Concat(). Defaults to None. 33 | """ 34 | super().__init__() 35 | 36 | self.image_module = image_module 37 | self.semantic_module = semantic_module 38 | self.cloud_module = cloud_module 39 | self.text_module = text_module 40 | self.soc_module = soc_module 41 | if fusion_module: 42 | self.fusion_module = fusion_module 43 | else: 44 | self.fusion_module = Concat() 45 | 46 | def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: # noqa: D102 47 | out_dict: dict[str, Tensor] = {} 48 | 49 | if self.image_module is not None: 50 | out_dict["image"] = self.image_module(batch)["final_descriptor"] 51 | 52 | if self.semantic_module is not None: 53 | out_dict["semantic"] = self.semantic_module(batch)["final_descriptor"] 54 | 55 | if self.cloud_module is not None: 56 | out_dict["cloud"] = self.cloud_module(batch)["final_descriptor"] 57 | 58 | if self.text_module is not None: 59 | out_dict["text"] = self.text_module(batch)["final_descriptor"] 60 | 61 | if self.soc_module is not None: 62 | out_dict["soc"] = self.soc_module(batch)["final_descriptor"] 63 | 64 | out_dict = self.fusion_module(out_dict) 65 | 66 | if not isinstance(out_dict, dict): 67 | out_dict = {"final_descriptor": out_dict} 68 | 69 | return out_dict 70 | 71 | 72 | class MiddleFusionModel(LateFusionModel): 73 | def __init__( 74 | self, 75 | image_module: Optional[ImageModel] = None, 76 | semantic_module: Optional[SemanticModel] = None, 77 | cloud_module: Optional[CloudModel] = None, 78 | soc_module: Optional[nn.Module] = None, 79 | fusion_module: Optional[nn.Module] = None, 80 | ) -> None: 81 | super().__init__(image_module, semantic_module, cloud_module, soc_module, fusion_module) 82 | self.cloud_dim_reduction = ME.MinkowskiAvgPooling(kernel_size=3, stride=3, dimension=3) 83 | self.final_fusion = Concat() 84 | 85 | def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: # noqa: D102 86 | ### step 1: feature extraction 87 | if 
self.image_module is not None: 88 | img_features = {} 89 | img_features_shapes = {} 90 | for key, value in batch.items(): 91 | if key.startswith("images_"): 92 | img_features[key] = self.image_module.backbone(value) 93 | img_features_shapes[key] = img_features[key].shape 94 | img_features[key] = ( 95 | img_features[key] 96 | .view(img_features[key].shape[0], img_features[key].shape[1], -1) 97 | .permute(0, 2, 1) 98 | ) # (B, N_feats, Desc_dim) 99 | if self.semantic_module is not None: 100 | semantic_features = {} 101 | semantic_features_shapes = {} 102 | for key, value in batch.items(): 103 | if key.startswith("masks_"): 104 | semantic_features[key] = self.semantic_module.backbone(value) 105 | semantic_features_shapes[key] = semantic_features[key].shape 106 | semantic_features[key] = ( 107 | semantic_features[key] 108 | .view(semantic_features[key].shape[0], semantic_features[key].shape[1], -1) 109 | .permute(0, 2, 1) 110 | ) # (B, N_feats, Desc_dim) 111 | if self.cloud_module is not None: 112 | sparse_voxel = ME.SparseTensor( 113 | features=batch["pointclouds_lidar_feats"], coordinates=batch["pointclouds_lidar_coords"] 114 | ) 115 | sparse_cloud_features = self.cloud_module.backbone(sparse_voxel) 116 | sparse_cloud_features = self.cloud_dim_reduction(sparse_cloud_features) 117 | # TODO: add text model 118 | 119 | ### step 2: transformer interaction 120 | tokens_dict = {} 121 | if self.image_module is not None: 122 | tokens_dict["image"] = torch.cat(list(img_features.values()), dim=1) 123 | if self.semantic_module is not None: 124 | tokens_dict["semantic"] = torch.cat(list(semantic_features.values()), dim=1) 125 | if self.cloud_module is not None: 126 | min_coordinate = torch.tensor( 127 | [ 128 | torch.min(sparse_cloud_features.C[:, 1]), 129 | torch.min(sparse_cloud_features.C[:, 2]), 130 | torch.min(sparse_cloud_features.C[:, 3]), 131 | ] 132 | ) 133 | dense_cloud_features, _, _ = sparse_cloud_features.dense(min_coordinate=min_coordinate) 134 | dense_cloud_shape = dense_cloud_features.shape 135 | dense_cloud_features = dense_cloud_features.view( 136 | dense_cloud_features.shape[0], dense_cloud_features.shape[1], -1 137 | ).permute(0, 2, 1) # (B, N_feats, Desc_dim) 138 | tokens_dict["cloud"] = dense_cloud_features 139 | tokens_dict = self.fusion_module(tokens_dict) 140 | 141 | ### step 3: back into initial states and finish processing 142 | out_dict = {} 143 | if self.image_module is not None: 144 | image_feat_lens = [s[-1] * s[-2] for s in img_features_shapes.values()] 145 | img_features_list = torch.split(tokens_dict["image"], image_feat_lens, dim=1) 146 | for key, feats in zip(list(img_features.keys()), img_features_list): 147 | img_features[key] = feats.permute(0, 2, 1).view(*img_features_shapes[key]) 148 | img_features[key] = self.image_module.head(img_features[key]) 149 | out_dict["image"] = self.image_module.fusion(img_features) 150 | if self.cloud_module is not None: 151 | dense_cloud_features = tokens_dict["cloud"].permute(0, 2, 1).view(*dense_cloud_shape) 152 | out_dict["cloud"] = self.cloud_module.head(ME.to_sparse(dense_cloud_features)) 153 | out_dict["final_descriptor"] = self.final_fusion(out_dict) 154 | return out_dict 155 | 156 | 157 | class TransformerModalityInteraction(nn.Module): 158 | def __init__( 159 | self, 160 | desc_dim: int = 256, 161 | image: bool = True, 162 | cloud: bool = True, 163 | semantic: bool = False, 164 | text: bool = False, 165 | use_modality_embeddings: bool = False, 166 | n_heads: int = 4, 167 | n_layers: int = 4, 168 | hidden_dim: int = 1024, 169 
| dropout: float = 0.0, 170 | activation: str = "gelu", 171 | ) -> None: 172 | super().__init__() 173 | 174 | self.use_modality_embeddings = use_modality_embeddings 175 | 176 | self.modalities = [] 177 | if image: 178 | self.modalities.append("image") 179 | if cloud: 180 | self.modalities.append("cloud") 181 | if semantic: 182 | self.modalities.append("semantic") 183 | if text: 184 | self.modalities.append("text") 185 | 186 | if self.use_modality_embeddings: 187 | self.modality_embeddings = nn.ParameterDict( 188 | { 189 | "image": nn.Parameter(torch.randn(desc_dim) * 0.02) if image else None, 190 | "cloud": nn.Parameter(torch.randn(desc_dim) * 0.02) if cloud else None, 191 | "semantic": nn.Parameter(torch.randn(desc_dim) * 0.02) if semantic else None, 192 | "text": nn.Parameter(torch.randn(desc_dim) * 0.02) if text else None, 193 | } 194 | ) 195 | 196 | transformer_encoder_layer = nn.TransformerEncoderLayer( 197 | d_model=desc_dim, 198 | nhead=n_heads, 199 | dim_feedforward=hidden_dim, 200 | dropout=dropout, 201 | activation=activation, 202 | batch_first=True, 203 | ) 204 | self.transformer_encoder = nn.TransformerEncoder( 205 | transformer_encoder_layer, num_layers=n_layers, enable_nested_tensor=False 206 | ) 207 | 208 | def forward(self, data: dict[str, Tensor]) -> dict[str, Tensor]: 209 | descriptors = [] 210 | 211 | for key in self.modalities: 212 | if self.use_modality_embeddings: 213 | descriptors.append(data[key] + self.modality_embeddings[key]) 214 | else: 215 | descriptors.append(data[key]) 216 | 217 | descriptors = torch.stack(descriptors, dim=1) 218 | # desc_lens = [d.shape[1] for d in descriptors] 219 | # descriptors = torch.cat(descriptors, dim=1) 220 | descriptors = torch.unbind(self.transformer_encoder(descriptors), dim=1) 221 | # descriptors = torch.split(self.transformer_encoder(descriptors), desc_lens, dim=1) 222 | out_dict = {} 223 | for i, key in enumerate(self.modalities): 224 | out_dict[key] = descriptors[i] 225 | out_dict["final_descriptor"] = torch.cat(descriptors, dim=-1) 226 | return out_dict 227 | 228 | 229 | class SelfAttentionModalityInteraction(nn.Module): 230 | def __init__( 231 | self, 232 | desc_dim: int = 256, 233 | image: bool = True, 234 | cloud: bool = True, 235 | semantic: bool = False, 236 | text: bool = False, 237 | use_modality_embeddings: bool = False, 238 | n_heads: int = 4, 239 | dropout: float = 0.0, 240 | ) -> None: 241 | super().__init__() 242 | 243 | self.use_modality_embeddings = use_modality_embeddings 244 | 245 | self.modalities = [] 246 | if image: 247 | self.modalities.append("image") 248 | if cloud: 249 | self.modalities.append("cloud") 250 | if semantic: 251 | self.modalities.append("semantic") 252 | if text: 253 | self.modalities.append("text") 254 | 255 | if self.use_modality_embeddings: 256 | self.modality_embeddings = nn.ParameterDict( 257 | { 258 | "image": nn.Parameter(torch.randn(desc_dim) * 0.02) if image else None, 259 | "cloud": nn.Parameter(torch.randn(desc_dim) * 0.02) if cloud else None, 260 | "semantic": nn.Parameter(torch.randn(desc_dim) * 0.02) if semantic else None, 261 | "text": nn.Parameter(torch.randn(desc_dim) * 0.02) if text else None, 262 | } 263 | ) 264 | 265 | self.self_attention = nn.MultiheadAttention( 266 | embed_dim=desc_dim, num_heads=n_heads, dropout=dropout, batch_first=True 267 | ) 268 | 269 | def forward(self, data: dict[str, Tensor]) -> dict[str, Tensor]: 270 | descriptors = [] 271 | 272 | for key in self.modalities: 273 | if self.use_modality_embeddings: 274 | descriptors.append(data[key] + self.modality_embeddings[key])
275 | else: 276 | descriptors.append(data[key]) 277 | 278 | # descriptors = torch.stack(descriptors, dim=1) 279 | # descriptors = torch.unbind(self.self_attention(descriptors, descriptors, descriptors)[0], dim=1) 280 | desc_lens = [d.shape[1] for d in descriptors] 281 | descriptors = torch.cat(descriptors, dim=1) 282 | descriptors = torch.split( 283 | self.self_attention(descriptors, descriptors, descriptors, need_weights=False)[0], 284 | desc_lens, 285 | dim=1, 286 | ) 287 | out_dict = {} 288 | for i, key in enumerate(self.modalities): 289 | out_dict[key] = descriptors[i] 290 | # out_dict["final_descriptor"] = torch.cat(descriptors, dim=-1) 291 | return out_dict 292 | 293 | 294 | class CrossAttentionModalityInteraction(nn.Module): 295 | def __init__( 296 | self, 297 | desc_dim: int = 256, 298 | image: bool = True, 299 | cloud: bool = True, 300 | semantic: bool = False, 301 | text: bool = False, 302 | use_modality_embeddings: bool = False, 303 | n_heads: int = 4, 304 | dropout: float = 0.0, 305 | ) -> None: 306 | super().__init__() 307 | 308 | self.use_modality_embeddings = use_modality_embeddings 309 | 310 | self.modalities = [] 311 | if image: 312 | self.modalities.append("image") 313 | if cloud: 314 | self.modalities.append("cloud") 315 | if semantic: 316 | self.modalities.append("semantic") 317 | if text: 318 | self.modalities.append("text") 319 | 320 | if self.use_modality_embeddings: 321 | self.modality_embeddings = nn.ParameterDict( 322 | { 323 | "image": nn.Parameter(torch.randn(desc_dim) * 0.02) if image else None, 324 | "cloud": nn.Parameter(torch.randn(desc_dim) * 0.02) if cloud else None, 325 | "semantic": nn.Parameter(torch.randn(desc_dim) * 0.02) if semantic else None, 326 | "text": nn.Parameter(torch.randn(desc_dim) * 0.02) if text else None, 327 | } 328 | ) 329 | 330 | self.cross_attn_dict = nn.ModuleDict({}) 331 | for key in self.modalities: 332 | self.cross_attn_dict[key] = nn.MultiheadAttention( 333 | embed_dim=desc_dim, num_heads=n_heads, dropout=dropout, batch_first=True 334 | ) 335 | 336 | def forward(self, data: dict[str, Tensor]) -> dict[str, Tensor]: 337 | out_dict = {} 338 | 339 | for query_modality in self.modalities: 340 | query = data[query_modality].unsqueeze(1) 341 | if self.use_modality_embeddings: 342 | query = query + self.modality_embeddings[query_modality] # out-of-place: unsqueeze(1) returns a view, so "+=" would mutate data[query_modality] 343 | 344 | # Prepare keys and values from other modalities 345 | keys = [] 346 | for key_modality in self.modalities: 347 | if key_modality != query_modality: 348 | key_value = data[key_modality] 349 | if self.use_modality_embeddings: 350 | key_value = key_value + self.modality_embeddings[key_modality] # out-of-place to keep the input tensors unchanged across iterations 351 | keys.append(key_value) 352 | # Stack keys and values from all other modalities 353 | keys = values = torch.stack(keys, dim=1) 354 | 355 | # Apply cross-attention 356 | attn_output, _ = self.cross_attn_dict[query_modality](query=query, key=keys, value=values) 357 | out_dict[query_modality] = attn_output.squeeze(1) # (B, 1, D) -> (B, D) 358 | 359 | out_dict["final_descriptor"] = torch.cat(list(out_dict.values()), dim=-1) 360 | 361 | return out_dict 362 | -------------------------------------------------------------------------------- /src/mssplace/models.py: -------------------------------------------------------------------------------- 1 | """Models implementation.""" 2 | from typing import Dict, Optional 3 | 4 | import torch 5 | from torch import Tensor, nn 6 | from opr.modules import Concat 7 | from opr.modules.gem import SeqGeM 8 | 9 | 10 | class GeMMultiFeatureMapsFusion(nn.Module): 11 | """GeM fusion module for multiple 2D feature maps.""" 12 | 13 |
def __init__(self, p: int = 3, eps: float = 1e-6) -> None: 14 | """Generalized-Mean fusion module. 15 | 16 | Args: 17 | p (int): Initial value of learnable parameter 'p', see paper for more details. Defaults to 3. 18 | eps (float): Negative values will be clamped to `eps` (ReLU). Defaults to 1e-6. 19 | """ 20 | super().__init__() 21 | self.gem = SeqGeM(p=p, eps=eps) 22 | 23 | def forward(self, data: Dict[str, Tensor]) -> Tensor: # noqa: D102 24 | data = {key: value for key, value in data.items() if value is not None} 25 | features = list(data.values()) 26 | features = [f.view(f.shape[0], f.shape[1], -1) for f in features] 27 | features = torch.cat(features, dim=-1) 28 | out = self.gem(features) 29 | if len(out.shape) == 1: 30 | out = out.unsqueeze(0) 31 | return out 32 | 33 | 34 | class TextModel(nn.Module): 35 | """Meta-model for text-based Place Recognition.""" 36 | 37 | def __init__( 38 | self, 39 | model: nn.Module, 40 | fusion: Optional[nn.Module] = None, 41 | ) -> None: 42 | """Meta-model for text-based Place Recognition. 43 | 44 | Args: 45 | model (nn.Module): Text backbone. 46 | fusion (nn.Module, optional): Module to fuse descriptors for multiple texts in batch. 47 | Defaults to None. 48 | """ 49 | super().__init__() 50 | self.model = model 51 | self.fusion = fusion 52 | 53 | def forward(self, batch: Dict[str, Tensor]) -> Dict[str, Tensor]: # noqa: D102 54 | text_descriptors = {} 55 | for key, value in batch.items(): 56 | if key.startswith("texts_"): 57 | text_descriptors[key] = self.model(value) 58 | if len(text_descriptors) > 1: 59 | if self.fusion is None: 60 | raise ValueError("Fusion module is not defined but multiple texts are provided") 61 | descriptor = self.fusion(text_descriptors) 62 | else: 63 | if self.fusion is not None: 64 | raise ValueError("Fusion module is defined but only one text is provided") 65 | descriptor = list(text_descriptors.values())[0] 66 | out_dict: Dict[str, Tensor] = {"final_descriptor": descriptor} 67 | return out_dict 68 | 69 | 70 | class LateFusionModel(nn.Module): 71 | """Meta-model for multimodal Place Recognition architectures with late fusion.""" 72 | 73 | def __init__( 74 | self, 75 | image_module: Optional[nn.Module] = None, 76 | semantic_module: Optional[nn.Module] = None, 77 | cloud_module: Optional[nn.Module] = None, 78 | text_module: Optional[nn.Module] = None, 79 | fusion_module: Optional[nn.Module] = None, 80 | ) -> None: 81 | """Meta-model for multimodal Place Recognition architectures with late fusion. 82 | 83 | Args: 84 | image_module (nn.Module, optional): Image modality branch. Defaults to None. 85 | semantic_module (nn.Module, optional): Semantic modality branch. Defaults to None. 86 | cloud_module (nn.Module, optional): Cloud modality branch. Defaults to None. 87 | text_module (nn.Module, optional): Text modality branch. Defaults to None. 88 | fusion_module (nn.Module, optional): Module to fuse different modalities. 89 | If None, will be set to opr.modules.Concat(). Defaults to None. 
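Example (a minimal sketch; image_branch and cloud_branch are hypothetical modules that each return {"final_descriptor": Tensor of shape (B, D)}):

    model = LateFusionModel(image_module=image_branch, cloud_module=cloud_branch)
    out = model(batch)
    out["final_descriptor"]  # (B, D_image + D_cloud) with the default Concat fusion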
90 | """ 91 | super().__init__() 92 | 93 | self.image_module = image_module 94 | self.semantic_module = semantic_module 95 | self.cloud_module = cloud_module 96 | self.text_module = text_module 97 | if fusion_module: 98 | self.fusion_module = fusion_module 99 | else: 100 | self.fusion_module = Concat() 101 | 102 | def forward(self, batch: Dict[str, Tensor]) -> Dict[str, Tensor]: # noqa: D102 103 | out_dict: Dict[str, Tensor] = {} 104 | 105 | if self.image_module is not None: 106 | out_dict["image"] = self.image_module(batch)["final_descriptor"] 107 | 108 | if self.semantic_module is not None: 109 | out_dict["semantic"] = self.semantic_module(batch)["final_descriptor"] 110 | 111 | if self.cloud_module is not None: 112 | out_dict["cloud"] = self.cloud_module(batch)["final_descriptor"] 113 | 114 | if self.text_module is not None: 115 | out_dict["text"] = self.text_module(batch)["final_descriptor"] 116 | 117 | out_dict["final_descriptor"] = self.fusion_module(out_dict) 118 | 119 | return out_dict 120 | -------------------------------------------------------------------------------- /train_unimodal.py: -------------------------------------------------------------------------------- 1 | """Script to train a single-modal Place Recognition model.""" 2 | import logging 3 | import pprint 4 | import sys 5 | from datetime import datetime 6 | from pathlib import Path 7 | from typing import Dict, Literal 8 | 9 | import hydra 10 | import wandb 11 | from hydra.utils import instantiate 12 | from loguru import logger 13 | from omegaconf import DictConfig, OmegaConf 14 | from torch.utils.data import DataLoader 15 | 16 | from opr.datasets.dataloader_factory import make_dataloaders 17 | from opr.trainers.place_recognition import UnimodalPlaceRecognitionTrainer 18 | from opr.utils import set_seed 19 | 20 | REPO_ROOT = Path(__file__).resolve().parent 21 | 22 | @hydra.main(config_path="configs", config_name="train_unimodal", version_base=None) 23 | def main(cfg: DictConfig) -> None: 24 | """Training code. 
25 | 26 | Args: 27 | cfg (DictConfig): Hydra config to train with. 28 | """ 29 | config_dict = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 30 | logger.info(f"Config:\n{pprint.pformat(config_dict, compact=True)}") 31 | 32 | if not cfg.debug and not cfg.wandb.disabled: 33 | # reuse the already-resolved config_dict for the wandb run config 34 | wandb.init( 35 | dir=hydra.core.hydra_config.HydraConfig.get().runtime.output_dir, 36 | name=cfg.exp_name, 37 | project=cfg.wandb.project, 38 | settings=wandb.Settings(start_method="thread"), 39 | config=config_dict, 40 | ) 41 | logger.debug(f"Initialized wandb run with name: {wandb.run.name}") 42 | 43 | logger.info(f"Output directory: {hydra.core.hydra_config.HydraConfig.get().runtime.output_dir}") 44 | checkpoints_dir = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir) / "checkpoints" 45 | if not checkpoints_dir.exists(): 46 | checkpoints_dir.mkdir(parents=True) 47 | 48 | set_seed(seed=cfg.seed, make_deterministic=False) 49 | logger.info(f"=> Seed: {cfg.seed}") 50 | 51 | logger.debug("=> Instantiating model...") 52 | model = instantiate(cfg.model) 53 | 54 | logger.debug("=> Instantiating loss...") 55 | loss_fn = instantiate(cfg.loss) 56 | 57 | logger.debug("=> Making dataloaders...") 58 | dataloaders: Dict[Literal["train", "val", "test"], DataLoader] = make_dataloaders( 59 | dataset_cfg=cfg.dataset, 60 | batch_sampler_cfg=cfg.sampler, 61 | num_workers=cfg.num_workers, 62 | ) 63 | 64 | logger.debug("=> Instantiating optimizer...") 65 | optimizer = instantiate(cfg.optimizer, params=model.parameters()) 66 | logger.debug("=> Instantiating scheduler...") 67 | scheduler = instantiate(cfg.scheduler, optimizer=optimizer) 68 | 69 | logger.debug("=> Instantiating trainer...") 70 | trainer = UnimodalPlaceRecognitionTrainer( 71 | checkpoints_dir=checkpoints_dir, 72 | model=model, 73 | loss_fn=loss_fn, 74 | optimizer=optimizer, 75 | scheduler=scheduler, 76 | batch_expansion_threshold=cfg.batch_expansion_threshold, 77 | wandb_log=(not cfg.debug and not cfg.wandb.disabled), 78 | device=cfg.device, 79 | ) 80 | 81 | logger.info(f"=====> {trainer.__class__.__name__} is ready, starting training for {cfg.epochs} epochs.") 82 | 83 | trainer.train( 84 | epochs=cfg.epochs, 85 | train_dataloader=dataloaders["train"], 86 | val_dataloader=dataloaders["val"], 87 | test_dataloader=dataloaders["test"], 88 | ) 89 | 90 | 91 | if __name__ == "__main__": 92 | run_dir = REPO_ROOT / "outputs" / (r"${exp_name}" + f"_{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}") # "${exp_name}" stays literal so Hydra interpolates it against the config at runtime 93 | sys.argv.append(f"hydra.run.dir={run_dir}") 94 | main() 95 | --------------------------------------------------------------------------------
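To make the interaction modules in src/mssplace/modality_interaction_layers.py easier to reproduce, here is a minimal, self-contained usage sketch (not part of the repository); the batch size and descriptor dimension are arbitrary assumptions, and running it requires the repository's dependencies (including MinkowskiEngine and the opr package) plus the src/ package on the path:

```python
import torch

from mssplace.modality_interaction_layers import TransformerModalityInteraction

# Hypothetical 256-d global descriptors produced by two late-fusion branches.
descriptors = {
    "image": torch.randn(4, 256),  # (batch_size, desc_dim)
    "cloud": torch.randn(4, 256),
}

# Jointly refine the per-modality descriptors with a transformer encoder.
fusion = TransformerModalityInteraction(desc_dim=256, image=True, cloud=True)
out = fusion(descriptors)

print(out["image"].shape)             # torch.Size([4, 256]) - refined image descriptor
print(out["final_descriptor"].shape)  # torch.Size([4, 512]) - concatenation of both
```

The same dict-in/dict-out contract is shared by SelfAttentionModalityInteraction and CrossAttentionModalityInteraction, which is what lets them be swapped in as the fusion_module of the late- and middle-fusion meta-models.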